In [1]:
import spacy
import pandas as pd
from spacy import displacy

In [2]:
 # Fetch the en_core_web_trf pipeline package through spaCy's download command.
 # NOTE(review): the cells visible here only load "en_core_web_sm"; confirm this
 # model is used elsewhere, since the recorded download is 457.4 MB.
 !python -m spacy download en_core_web_trf


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.12.0->spacy-cur

In [3]:
pip install spacy-transformers


Collecting spacy-transformers
  Downloading spacy_transformers-1.3.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers<4.50.0,>=3.4.0->spacy-transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.19.0 (from spacy-transformers)
  Downloading numpy-2.3.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.0 MB/s[0m 

In [5]:
# Load the "en_core_web_sm" pipeline; later cells call it through `nlp_sm`.
# NOTE(review): no visible cell installs en_core_web_sm (the download cell above
# fetches en_core_web_trf) - presumably it is preinstalled in this runtime; confirm.
nlp_sm = spacy.load("en_core_web_sm")


In [6]:
def read_conll_file(path, skip_docstart=False):
    """Read a CoNLL-style file and return its sentences as plain strings.

    The file is read as one token per line with whitespace-separated columns,
    and a blank (or whitespace-only) line closes the current sentence. Only the
    first column (the token) is kept; each sentence is rebuilt by joining its
    tokens with single spaces.

    Args:
        path: Location of the file; opened as UTF-8 text.
        skip_docstart: When True, lines whose first column is ``-DOCSTART-``
            (the document-boundary marker used in CoNLL-2003 distributions)
            are ignored, so they do not show up as one-token "sentences".
            Defaults to False, which treats such a line like any other token
            line and therefore keeps the existing output and row positions.

    Returns:
        A list of sentence strings in file order. Sentences with no usable
        token lines are omitted.
    """
    sentences = []
    tokens = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:  # blank line means end of sentence
                if tokens:
                    sentences.append(" ".join(tokens))
                    tokens = []
                continue
            if len(parts) < 2:
                # A lone column carries no tag; such lines are skipped.
                continue
            if skip_docstart and parts[0] == "-DOCSTART-":
                continue
            tokens.append(parts[0])
    # The file may end without a trailing blank line; flush what is left.
    if tokens:
        sentences.append(" ".join(tokens))
    return sentences

In [7]:
# Read the three dataset files in train / valid / test order and bind each
# resulting sentence list to its own name.
train_sentences, valid_sentences, test_sentences = map(
    read_conll_file,
    (
        "/kaggle/input/conll003-englishversion/train.txt",
        "/kaggle/input/conll003-englishversion/valid.txt",
        "/kaggle/input/conll003-englishversion/test.txt",
    ),
)


In [8]:
# One row per sentence: "dataset" records which split the sentence came from,
# "text" holds the sentence itself. Rows are ordered train, then valid, then test.
df = pd.DataFrame(
    [
        (split_name, sentence)
        for split_name, split_sentences in (
            ("train", train_sentences),
            ("valid", valid_sentences),
            ("test", test_sentences),
        )
        for sentence in split_sentences
    ],
    columns=["dataset", "text"],
)

In [10]:
def extract_entities(text, nlp_model):
    """Run ``nlp_model`` on ``text`` and list the entities it reports.

    Args:
        text: Input handed unchanged to ``nlp_model``.
        nlp_model: Callable returning an object whose ``ents`` attribute is
            iterable; each item must expose ``text`` and ``label_``.

    Returns:
        A list of ``(text, label_)`` tuples, one per item of ``ents``, in
        iteration order.
    """
    parsed = nlp_model(text)
    found = []
    for span in parsed.ents:
        found.append((span.text, span.label_))
    return found

In [11]:
# Stream all sentences through the small pipeline with nlp.pipe, which batches
# documents instead of invoking the model once per row as Series.apply did.
# Each row receives a list of (entity text, entity label) tuples - the same
# format extract_entities returns - in the same order as df["text"].
df["entities_sm"] = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in nlp_sm.pipe(df["text"])
]


In [12]:
# Write the frame to a CSV file in the working directory; index=False leaves
# the row index out of the file.
# NOTE(review): "entities_sm" holds Python lists of tuples, which are presumably
# written as their string form - confirm downstream readers parse them back.
df.to_csv("CoNLL003_NER_results.csv", index=False)
# Confirmation message shown in the cell output.
print("✅ NER results saved to CoNLL003_NER_results.csv")


✅ NER results saved to CoNLL003_NER_results.csv


In [15]:

# Use the sentence stored under index label 8 as a spot-check example.
sample_text = df.loc[8, "text"]
# print() puts its default space separator after the "\n", so the sentence is
# shown with one leading space in the output.
print("\nSample text:\n", sample_text)

print("\nLaunching displaCy visualization...")
# Run the small pipeline on this one sentence to get the object displaCy renders.
doc = nlp_sm(sample_text)
# style="ent" requests the entity view; jupyter=True is passed so the result is
# rendered in the notebook output.
displacy.render(doc, style="ent", jupyter=True)


Sample text:
 He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains , spleens and spinal cords from the human and animal food chains was a highly specific and precautionary move to protect human health .

Launching displaCy visualization...
