<a href="https://colab.research.google.com/github/Danzigerrr/MultiClass-Entity-Linking-System/blob/NER-datasets/NER_english_18_classes_with_Flair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER with Flair

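Model card on Hugging Face (large variant): https://huggingface.co/flair/ner-english-ontonotes-large. Note that the code below loads the base `flair/ner-english-ontonotes` checkpoint by default; the large and fast variants are listed there as alternatives.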

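The model predicts 18 entity classes (e.g. PERSON, ORG, GPE, DATE), which the tagger encodes internally as BIOES-prefixed tags such as `B-ORG` or `S-GPE`.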

In [1]:
!pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.35.76-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [33]:
# Hugging Face id of the pre-trained Flair NER checkpoint used by this notebook.
# Other checkpoints of the same family that can be swapped in here:
#   "flair/ner-english-ontonotes-large"
#   "flair/ner-english-ontonotes-fast"
NER_MODEL_NAME = "flair/ner-english-ontonotes"

# Load the pre-trained NER tagger (the model weights are fetched on first use).
tagger = SequenceTagger.load(NER_MODEL_NAME)

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

2024-12-08 11:23:09,140 SequenceTagger predicts: Dictionary with 75 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-GPE, B-GPE, E-GPE, I-GPE, S-ORG, B-ORG, E-ORG, I-ORG, S-DATE, B-DATE, E-DATE, I-DATE, S-CARDINAL, B-CARDINAL, E-CARDINAL, I-CARDINAL, S-NORP, B-NORP, E-NORP, I-NORP, S-MONEY, B-MONEY, E-MONEY, I-MONEY, S-PERCENT, B-PERCENT, E-PERCENT, I-PERCENT, S-ORDINAL, B-ORDINAL, E-ORDINAL, I-ORDINAL, S-LOC, B-LOC, E-LOC, I-LOC, S-TIME, B-TIME, E-TIME, I-TIME, S-WORK_OF_ART, B-WORK_OF_ART, E-WORK_OF_ART, I-WORK_OF_ART, S-FAC


In [34]:
# Wrap the demo text in a Flair Sentence so the tagger can annotate it.
sentence = Sentence(
    "Notre Dame, the iconic medieval cathedral in Paris, "
    "reopens after five years of speedy reconstruction work."
)


In [35]:
# Tag the sentence in place. The extra flag asks the tagger to keep the scores
# of all classes, which later cells read via token.get_tags_proba_dist("ner").
tagger.predict(sentence, return_probabilities_for_all_classes=True)

# Show the whole sentence together with its predicted tags.
print(sentence)

# List every predicted NER span on its own line.
print('The following NER tags are found:')
detected_spans = sentence.get_spans('ner')
for span in detected_spans:
    print(span)

Sentence[19]: "Notre Dame, the iconic medieval cathedral in Paris, reopens after five years of speedy reconstruction work." → ["Notre Dame"/ORG, "Paris"/GPE, "five years"/DATE]
The following NER tags are found:
Span[0:2]: "Notre Dame" → ORG (0.8968)
Span[8:9]: "Paris" → GPE (0.9999)
Span[12:14]: "five years" → DATE (0.9825)


In [36]:
import html
from IPython.display import HTML

def generate_html(sentence):
    """Render a tagged sentence as an HTML paragraph with labelled entity spans.

    Every span returned by ``sentence.get_spans('ner')`` is wrapped in a
    ``<span>`` element whose CSS class is the entity tag and whose visible
    text is ``"<entity text> (<tag>)"``. The text between spans is copied
    through unchanged, except that all text taken from the sentence is
    HTML-escaped so characters such as ``<`` or ``&`` cannot break or inject
    markup.

    Args:
        sentence: Object exposing ``text`` and ``get_spans('ner')``. Each span
            must provide ``start_position``, ``end_position``, ``text`` and
            ``tag``. Spans are assumed to be ordered by position and
            non-overlapping, as the slicing below relies on it.

    Returns:
        str: ``<p>...</p>`` markup followed by a ``<style>`` block that assigns
        one text colour per entity class.
    """
    parts = ["<p>"]
    cursor = 0  # character offset in sentence.text up to which output was emitted

    for entity in sentence.get_spans('ner'):
        # Plain text between the previous entity and this one.
        parts.append(html.escape(sentence.text[cursor:entity.start_position], quote=False))
        # The tag is used inside an attribute value, so quotes are escaped too.
        tag = html.escape(entity.tag)
        parts.append(
            f"<span class=\"{tag}\" style=\"background-color: white;\">"
            f"{html.escape(entity.text, quote=False)} ({tag})</span>"
        )
        cursor = entity.end_position

    # Remaining text after the last entity (or the whole text if there is none).
    parts.append(html.escape(sentence.text[cursor:], quote=False))
    parts.append("</p>")

    # Add CSS styles for different entity classes (optional, for additional styling)
    css_styles = """
    <style>
        .CARDINAL { color: blue; }
        .DATE { color: green; }
        .EVENT { color: red; }
        .FAC { color: orange; }
        .GPE { color: purple; }
        .LANGUAGE { color: brown; }
        .LAW { color: pink; }
        .LOC { color: gray; }
        .MONEY { color: yellow; }
        .NORP { color: cyan; }
        .ORDINAL { color: olive; }
        .ORG { color: teal; }
        .PERCENT { color: navy; }
        .PERSON { color: maroon; }
        .PRODUCT { color: lime; }
        .QUANTITY { color: gold; }
        .TIME { color: indigo; }
        .WORK_OF_ART { color: violet; }
    </style>
    """

    return "".join(parts) + css_styles

# Build the markup for the tagged sentence ...
html_output = generate_html(sentence)

# ... and render it inline in the notebook output area.
rendered_html = HTML(html_output)
display(rendered_html)

In [38]:
def extract_top_3(token_probabilities, token_text=None):
    """Return the three highest-scoring labels of a single token.

    Args:
        token_probabilities: Iterable of label objects exposing ``value``
            (the tag string) and ``score`` (its probability).
        token_text: Text of the token the labels belong to. Pass it
            explicitly to keep the function self-contained. When omitted,
            the text of the module-level ``token`` variable is used, which
            is how this function originally obtained it.

    Returns:
        list[tuple]: Up to three ``(token_text, label_value, score)`` tuples,
        ordered by score from highest to lowest.
    """
    labels = list(token_probabilities)

    if token_text is None and labels:
        # Legacy fallback for callers that do not pass the text: read the
        # global loop variable `token`, exactly as the original code did.
        token_text = token.text

    # Sort by the probability in descending order and keep the best three.
    ranked = sorted(labels, key=lambda label: label.score, reverse=True)
    return [(token_text, label.value, label.score) for label in ranked[:3]]

# Walk through the sentence token by token and report each token's three most
# likely tags. The loop variable has to be called `token`: extract_top_3 looks
# that name up as a global when it is not handed the token text explicitly.
for token in sentence:
    print(f"\nToken: {token.text}")

    # Full tag distribution stored on the token by the prediction step.
    tag_distribution = token.get_tags_proba_dist("ner")

    # Only the label and its probability are printed; the token text in each
    # tuple is not needed here.
    for i, (_, label, probability) in enumerate(extract_top_3(tag_distribution)):
        print(f"  Top {i+1} prediction: {label} ({probability:.4f})")


Token: Notre
  Top 1 prediction: B-ORG (0.7962)
  Top 2 prediction: B-FAC (0.1880)
  Top 3 prediction: B-GPE (0.0054)

Token: Dame
  Top 1 prediction: E-ORG (0.9975)
  Top 2 prediction: E-FAC (0.0020)
  Top 3 prediction: I-ORG (0.0003)

Token: ,
  Top 1 prediction: O (1.0000)
  Top 2 prediction: E-GPE (0.0000)
  Top 3 prediction: I-GPE (0.0000)

Token: the
  Top 1 prediction: O (0.9966)
  Top 2 prediction: B-QUANTITY (0.0012)
  Top 3 prediction: B-FAC (0.0005)

Token: iconic
  Top 1 prediction: O (0.9999)
  Top 2 prediction: I-DATE (0.0000)
  Top 3 prediction: I-QUANTITY (0.0000)

Token: medieval
  Top 1 prediction: O (0.9997)
  Top 2 prediction: S-NORP (0.0002)
  Top 3 prediction: B-NORP (0.0000)

Token: cathedral
  Top 1 prediction: O (1.0000)
  Top 2 prediction: E-LANGUAGE (0.0000)
  Top 3 prediction: E-FAC (0.0000)

Token: in
  Top 1 prediction: O (1.0000)
  Top 2 prediction: S-GPE (0.0000)
  Top 3 prediction: B-FAC (0.0000)

Token: Paris
  Top 1 prediction: S-GPE (0.9999)
  Top 2

In [40]:
def extract_entity_probabilities(entity):
    """Return the three most probable entity classes for a multi-token span.

    For every token of the span, the per-tag scores are merged per entity
    class (the ``B-``/``I-``/``E-``/``S-`` position prefix is dropped) and
    averaged over the number of tokens in the span.

    Tags without such a prefix (for example ``O``) are kept unchanged, so
    they are no longer truncated into an empty or mangled label.

    Args:
        entity: Iterable, sized collection of tokens; each token must provide
            ``get_tags_proba_dist("ner")`` returning label objects with
            ``value`` and ``score``.

    Returns:
        list[tuple[str, float]]: Up to three ``(entity_class, mean_score)``
        pairs, ordered by score from highest to lowest.
    """
    position_prefixes = ("B-", "I-", "E-", "S-")
    span_length = len(entity)
    class_scores = {}

    for token in entity:
        for token_prob in token.get_tags_proba_dist("ner"):
            tag = token_prob.value
            # Strip the position prefix only when there actually is one.
            label = tag[2:] if tag.startswith(position_prefixes) else tag
            class_scores[label] = class_scores.get(label, 0) + token_prob.score / span_length

    # Sort probabilities by score in descending order and keep the best three.
    return sorted(class_scores.items(), key=lambda item: item[1], reverse=True)[:3]

# For every entity span found in the tagged sentence, print its three most
# probable entity classes together with their averaged scores.
for entity in sentence.get_spans('ner'):
    print(f"\nEntity: {entity.text}")

    for i, (label, probability) in enumerate(extract_entity_probabilities(entity)):
        print(f"  Top {i+1} prediction: {label} ({probability:.4f})")


Entity: Notre Dame
  Top 1 prediction: ORG (0.8982)
  Top 2 prediction: FAC (0.0951)
  Top 3 prediction: GPE (0.0028)

Entity: Paris
  Top 1 prediction: GPE (1.0000)
  Top 2 prediction: FAC (0.0000)
  Top 3 prediction: LOC (0.0000)

Entity: five years
  Top 1 prediction: DATE (0.9834)
  Top 2 prediction: CARDINAL (0.0081)
  Top 3 prediction: TIME (0.0048)


In [32]:
import requests

import os

# Name of the environment variable that supplies the Hugging Face access token.
# The token is intentionally not stored in the notebook; any token that was ever
# committed in this cell should be treated as leaked and revoked.
HF_TOKEN_ENV_VAR = "HF_TOKEN"


def infer_with_hf_api(text, model_name="flair/ner-english-ontonotes", api_token=None, timeout=60):
    """Run NER on ``text`` through the hosted Hugging Face Inference API.

    Args:
        text: Input text sent as the ``inputs`` field of the request.
        model_name: Hugging Face model id appended to the inference endpoint.
        api_token: Access token to authenticate with. When omitted, the token
            is read from the ``HF_TOKEN`` environment variable. A value that
            already starts with ``"Bearer "`` is used as-is.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response, whatever the HTTP status was.

    Raises:
        RuntimeError: If no token was passed and ``HF_TOKEN`` is not set.
    """
    api_token = api_token or os.environ.get(HF_TOKEN_ENV_VAR)
    if not api_token:
        raise RuntimeError(
            f"No Hugging Face token available: pass api_token=... "
            f"or set the {HF_TOKEN_ENV_VAR} environment variable."
        )

    # Accept both a bare token and one that already carries the scheme prefix.
    authorization = api_token if api_token.startswith("Bearer ") else f"Bearer {api_token}"

    url = "https://api-inference.huggingface.co/models/" + model_name
    headers = {"Authorization": authorization}
    payload = {"inputs": text}

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    return response.json()

# Demo call: send one sentence to the hosted model and show the raw response.
text = "Notre Dame is a beautiful cathedral located in Paris."
result = infer_with_hf_api(text=text)
print(result)

[{'entity_group': 'ORG', 'word': 'Notre Dame', 'start': 0, 'end': 10, 'score': 0.9255074560642242}, {'entity_group': 'GPE', 'word': 'Paris', 'start': 47, 'end': 52, 'score': 0.9998453855514526}]
