In [3]:

!pip install spacy pandas

!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

import spacy
import pandas as pd
from spacy import displacy
from pathlib import Path


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-trf==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependenci

In [4]:
def load_conll(filepath, skip_docstart=False):
    """Read a whitespace-separated, CoNLL-style file into token and tag lists.

    Every non-blank line contributes one token (its first column) and one
    tag (its last column). A blank line closes the current sentence.

    Args:
        filepath: Path of the file to read; it is opened as UTF-8 text.
        skip_docstart: When True, lines whose first column is exactly
            ``-DOCSTART-`` are dropped, so they no longer show up as
            one-token "sentences". Defaults to False, which keeps them and
            therefore keeps sentence indices the same as before.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists, where
        ``sentences[i]`` holds the tokens of sentence ``i`` and ``labels[i]``
        holds the matching tags.
    """
    sentences = []
    labels = []
    tokens = []
    tags = []

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            if skip_docstart and parts[0] == "-DOCSTART-":
                # Document-boundary marker, not an actual sentence token.
                continue
            tokens.append(parts[0])
            tags.append(parts[-1])

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    return sentences, labels

# Read the three dataset splits (train, test, valid) in that order.
(
    (train_sentences, train_labels),
    (test_sentences, test_labels),
    (valid_sentences, valid_labels),
) = map(load_conll, ("train.txt", "test.txt", "valid.txt"))

# Show the first training entry as a quick sanity check of the loader.
print("Example loaded sentence:", train_sentences[0])
print("Example labels:", train_labels[0])

Example loaded sentence: ['-DOCSTART-']
Example labels: ['O']


In [5]:
# Load both English pipelines once so later cells can reuse them:
# nlp_sm is the small model, nlp_trf is the transformer model.
nlp_sm, nlp_trf = map(spacy.load, ("en_core_web_sm", "en_core_web_trf"))


In [6]:

# ========================
# 4. Run SpaCy NER
# ========================
def spacy_entities(nlp, sentence_tokens):
    """Run ``nlp`` on the space-joined tokens and list the entities it finds.

    Args:
        nlp: Callable that takes a text string and returns an object with an
            ``ents`` iterable (here, a loaded spaCy pipeline).
        sentence_tokens: Sequence of token strings forming one sentence.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        entities appear in ``ents``.
    """
    found = []
    # The pipeline receives plain text, so it re-tokenizes the joined string.
    for span in nlp(" ".join(sentence_tokens)).ents:
        found.append((span.text, span.label_))
    return found

In [7]:


# Compare the gold annotation of one test sentence with both spaCy models.
sample_idx = 10
sentence, gold_labels = test_sentences[sample_idx], test_labels[sample_idx]

print("Sentence:", " ".join(sentence))
print("Gold labels:", gold_labels)

# Same sentence through each pipeline; the entity lists are printed as-is.
print("SpaCy (sm):", spacy_entities(nlp_sm, sentence))
print("SpaCy (trf):", spacy_entities(nlp_trf, sentence))


Sentence: Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from their opening meeting against Syria .
Gold labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']
SpaCy (sm): [('Two', 'CARDINAL'), ('the last six minutes', 'TIME'), ('Japan', 'GPE'), ('three', 'CARDINAL'), ('Syria', 'GPE')]
SpaCy (trf): [('Two', 'CARDINAL'), ('Japan', 'GPE'), ('three', 'CARDINAL'), ('Syria', 'GPE')]


In [8]:

# Parse the sample sentence with each pipeline, then render the entity
# highlights inline, one model after the other.
doc_sm, doc_trf = nlp_sm(" ".join(sentence)), nlp_trf(" ".join(sentence))

print("\nVisualization with en_core_web_sm:")
displacy.render(doc_sm, style="ent", jupyter=True)

print("\nVisualization with en_core_web_trf:")
displacy.render(doc_trf, style="ent", jupyter=True)



Visualization with en_core_web_sm:



Visualization with en_core_web_trf:
