In [2]:
!pip install spacy scikit-learn




In [3]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 985.5 kB/s eta 0:00:13
     -- ------------------------------------ 0.8/12.8 MB 882.6 kB/s eta 0:00:14
     -- ------------------------------------ 0.8/12.8 MB 882.6 kB/s eta 0:00:14
     --- ----------------------------------- 1.0/12.8 MB 914.5 kB/s eta 0:00:13
     --- ----------------------------------- 1.3/12.8 MB 919.0 kB/s eta 0:00:13
     ---- ---------------------------------- 1.6/12.8 MB 942.3 k

In [4]:
import spacy
import pandas as pd

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report


In [5]:
nlp = spacy.load("en_core_web_sm")


In [6]:
texts = [
    "Apple is planning to acquire a startup in the United Kingdom.",
    "Elon Musk is the CEO of Tesla and SpaceX.",
    "India defeated Australia in the match held in Mumbai.",
    "Google launched new AI tools in California.",
    "I met Virat Kohli in Delhi yesterday."
]


In [7]:
true_entities = [
    [("Apple", "ORG"), ("United Kingdom", "GPE")],
    [("Elon Musk", "PERSON"), ("Tesla", "ORG"), ("SpaceX", "ORG")],
    [("India", "GPE"), ("Australia", "GPE"), ("Mumbai", "GPE")],
    [("Google", "ORG"), ("California", "GPE")],
    [("Virat Kohli", "PERSON"), ("Delhi", "GPE")]
]


In [8]:
predicted_entities = []

for text in texts:
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    predicted_entities.append(entities)

predicted_entities


[[('Apple', 'ORG'), ('the United Kingdom', 'GPE')],
 [('Elon Musk', 'PERSON'), ('Tesla', 'ORG')],
 [('India', 'GPE'), ('Australia', 'GPE'), ('Mumbai', 'GPE')],
 [('Google', 'ORG'), ('AI', 'GPE'), ('California', 'GPE')],
 [('Virat Kohli', 'PERSON'), ('Delhi', 'GPE'), ('yesterday', 'DATE')]]

In [9]:
y_true = []
y_pred = []

for true, pred in zip(true_entities, predicted_entities):
    true_dict = {e[0]: e[1] for e in true}
    pred_dict = {e[0]: e[1] for e in pred}
    
    all_entities = set(true_dict.keys()).union(set(pred_dict.keys()))
    
    for entity in all_entities:
        y_true.append(true_dict.get(entity, "O"))
        y_pred.append(pred_dict.get(entity, "O"))


In [10]:
accuracy = accuracy_score(y_true, y_pred)

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted"
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.6666666666666666
Precision: 0.6857142857142856
Recall: 0.6666666666666666
F1-score: 0.6695970695970695


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

        DATE       0.00      0.00      0.00         0
         GPE       0.71      0.83      0.77         6
           O       0.00      0.00      0.00         3
         ORG       1.00      0.75      0.86         4
      PERSON       1.00      1.00      1.00         2

    accuracy                           0.67        15
   macro avg       0.54      0.52      0.53        15
weighted avg       0.69      0.67      0.67        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
results_df = pd.DataFrame({
    "True Label": y_true,
    "Predicted Label": y_pred
})

results_df


Unnamed: 0,True Label,Predicted Label
0,GPE,O
1,O,GPE
2,ORG,ORG
3,PERSON,PERSON
4,ORG,O
5,ORG,ORG
6,GPE,GPE
7,GPE,GPE
8,GPE,GPE
9,ORG,ORG
