In [2]:
!pip install -U spacy scikit-learn
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import spacy
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [5]:
# Shared spaCy pipeline used by every later cell (prediction and tokenisation).
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)



In [6]:
# Evaluation sentences. The order matters: row i here is paired with row i of
# the gold annotations and of the model predictions further down.
texts = [
    "Apple CEO Tim Cook visited India in April.",        # row 0
    "Google announced a new AI center in London.",       # row 1
    "Narendra Modi met Elon Musk in New York.",          # row 2
    "Microsoft acquired a startup based in Bengaluru.",  # row 3
]


In [7]:
# Hand-written reference annotations: one list of (surface text, label) pairs
# per sentence, in the same order as `texts`.
gold_entities = [
    # "Apple CEO Tim Cook visited India in April."
    [
        ("Apple", "ORG"),
        ("Tim Cook", "PERSON"),
        ("India", "GPE"),
        ("April", "DATE"),
    ],
    # "Google announced a new AI center in London."
    [
        ("Google", "ORG"),
        ("London", "GPE"),
    ],
    # "Narendra Modi met Elon Musk in New York."
    [
        ("Narendra Modi", "PERSON"),
        ("Elon Musk", "PERSON"),
        ("New York", "GPE"),
    ],
    # "Microsoft acquired a startup based in Bengaluru."
    [
        ("Microsoft", "ORG"),
        ("Bengaluru", "GPE"),
    ],
]


In [8]:
# Run the pipeline over all sentences in one batch and keep, per sentence,
# the (surface text, label) pair of every entity span the model produced.
predicted_entities = [
    [(span.text, span.label_) for span in parsed.ents]
    for parsed in nlp.pipe(texts)
]



In [9]:
def get_token_labels(text, entities):
    """Turn ``(entity_text, label)`` pairs into one label per token of ``text``.

    ``text`` is tokenised with the shared ``nlp`` pipeline's tokenizer and every
    token starts as ``"O"``. Each entity is then located by searching ``text``
    for its surface string, and the tokens it covers receive its label.

    Entities are deliberately NOT looked up in ``doc.ents``: doing so would only
    ever find spans the model itself predicted, so a gold entity the model
    missed would be labelled ``"O"`` and the miss would never count as an error.

    Args:
        text: The raw sentence.
        entities: Iterable of ``(entity_text, entity_label)`` pairs.

    Returns:
        A list of label strings, one per token. Tokens not covered by any
        entity are ``"O"``. When entities overlap, the one listed later wins.

    Note:
        Entities are identified by surface text only, so every token-aligned
        occurrence of ``entity_text`` in ``text`` receives the label.
    """
    # Tokenise only; the NER component is not needed to place labels.
    doc = nlp.make_doc(text)
    labels = ["O"] * len(doc)

    for ent_text, ent_label in entities:
        if not ent_text:
            # str.find("") matches at every offset; nothing sensible to label.
            continue

        start = text.find(ent_text)
        while start != -1:
            # char_span returns None unless the range lines up exactly with
            # token boundaries, which rejects hits inside a longer word
            # (e.g. "India" inside "Indiana").
            span = doc.char_span(start, start + len(ent_text))
            if span is not None:
                for i in range(span.start, span.end):
                    labels[i] = ent_label
            start = text.find(ent_text, start + 1)

    return labels


In [10]:
# Flatten the per-sentence token labels into two parallel sequences that the
# scikit-learn metric functions can compare position by position.

# zip() stops at the shortest input, so a missing annotation or prediction row
# would silently drop sentences from the evaluation. Fail loudly instead.
assert len(texts) == len(gold_entities) == len(predicted_entities), (
    "texts, gold_entities and predicted_entities must have one row per sentence"
)

y_true = []
y_pred = []

for text, gold, pred in zip(texts, gold_entities, predicted_entities):
    # Both calls tokenise the same text, so the two label lists line up.
    y_true += get_token_labels(text, gold)
    y_pred += get_token_labels(text, pred)


In [11]:
# Token-level scores over the flattened label sequences.
# NOTE: no `labels=` filter is passed, so the macro average is taken over every
# label present in y_true / y_pred, including the "O" (non-entity) label.
_macro = {"average": "macro", "zero_division": 0}

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **_macro)
recall = recall_score(y_true, y_pred, **_macro)
f1 = f1_score(y_true, y_pred, **_macro)

for _caption, _score in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("F1 Score :", f1),
):
    print(_caption, round(_score, 3))


Accuracy : 0.971
Precision: 0.967
Recall   : 0.99
F1 Score : 0.977


In [12]:
# Show each sentence next to the entities the model predicted for it.
for sentence, found in zip(texts, predicted_entities):
    print(f"\nText: {sentence}\nEntities: {found}")



Text: Apple CEO Tim Cook visited India in April.
Entities: [('Apple', 'ORG'), ('Tim Cook', 'PERSON'), ('India', 'GPE'), ('April', 'DATE')]

Text: Google announced a new AI center in London.
Entities: [('Google', 'ORG'), ('AI', 'GPE'), ('London', 'GPE')]

Text: Narendra Modi met Elon Musk in New York.
Entities: [('Narendra Modi', 'PERSON'), ('Elon Musk', 'PERSON'), ('New York', 'GPE')]

Text: Microsoft acquired a startup based in Bengaluru.
Entities: [('Microsoft', 'ORG'), ('Bengaluru', 'GPE')]
