# **Install spaCy and models**

In [None]:
!pip install -U spacy
!pip install "pandas==2.2.2"
!pip install "datasets<4.0.0"
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

Collecting datasets<4.0.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Cola

# **Import libraries**

In [None]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from datasets import load_dataset

# **Load CoNLL data**

In [None]:
# CoNLL-2003 is distributed on the Hub with a custom loading script. Without
# an explicit opt-in, `load_dataset` stops and waits for a "[y/N]" answer on
# stdin, which blocks unattended runs (Restart Kernel -> Run All).
# SECURITY: `trust_remote_code=True` executes code from the dataset repository;
# review https://hf.co/datasets/conll2003 before running in a sensitive setup.
dataset = load_dataset("conll2003", trust_remote_code=True)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

# **Create & View the Training DataFrame**

In [None]:
# Materialise the training split as a pandas DataFrame and preview the
# first rows.
train_split = dataset['train']
train_df = pd.DataFrame(train_split)
train_df.head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[5, 0]"
3,3,"[The, European, Commission, said, on, Thursday...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,4,"[Germany, 's, representative, to, the, Europea...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


# **Tokens to Sentences Preview**

In [None]:
def tokens_to_sentence(tokens):
    """Join an iterable of token strings into one space-separated string.

    Args:
        tokens: Iterable of strings, e.g. ``["EU", "rejects"]``.

    Returns:
        The tokens concatenated with a single space between neighbours;
        an empty iterable yields ``""``.
    """
    separator = " "
    sentence = separator.join(tokens)
    return sentence

# Build one plain-text sentence per row from its token list, store it in a
# new `sentence` column, then preview the sentences beside their NER tag ids.
sentences = train_df['tokens'].apply(tokens_to_sentence)
train_df['sentence'] = sentences
preview_columns = ['sentence', 'ner_tags']
train_df[preview_columns].head()

Unnamed: 0,sentence,ner_tags
0,EU rejects German call to boycott British lamb .,"[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,Peter Blackburn,"[1, 2]"
2,BRUSSELS 1996-08-22,"[5, 0]"
3,The European Commission said on Thursday it di...,"[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,Germany 's representative to the European Unio...,"[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


# **Load spaCy Models**

In [None]:
# Load the small and the medium English pipelines so their NER output can
# be compared side by side.
nlp_sm, nlp_md = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "en_core_web_md")
)

# **NER Model Comparison**

In [None]:
# Run the first training sentence through both pipelines.
sample_text = train_df['sentence'].iloc[0]
doc_sm = nlp_sm(sample_text)
doc_md = nlp_md(sample_text)

# Print each model's entities under its own header; the second header starts
# with a newline so the two listings are separated by a blank line.
labelled_docs = (
    ("🔹 Small Model Entities:", doc_sm),
    ("\n🔹 Medium Model Entities:", doc_md),
)
for header, model_doc in labelled_docs:
    print(header)
    for ent in model_doc.ents:
        print(ent.text, ent.label_)

🔹 Small Model Entities:
EU ORG
German NORP
British NORP

🔹 Medium Model Entities:
EU ORG
German NORP
British NORP


# **NER Visualization**

In [None]:
# Render the highlighted entity spans inline, first for the small model's
# parse and then for the medium model's.
for parsed_doc in (doc_sm, doc_md):
    displacy.render(parsed_doc, style="ent", jupyter=True)

# **Rule-Based NER Matching**

In [None]:
# Rule-based matcher for runs of capitalised words.
# The first token must begin with an uppercase letter followed by lowercase
# letters; it may be followed by zero or more title-case tokens. Making the
# trailing tokens optional lets single-word names such as "German" match in
# addition to multi-word names such as "Peter Blackburn".
matcher = Matcher(nlp_sm.vocab)
pattern = [
    {"TEXT": {"REGEX": "^[A-Z][a-z]+"}},
    {"IS_TITLE": True, "OP": "*"},
]
# With an optional operator the matcher can emit overlapping candidates;
# greedy="LONGEST" keeps only the longest span among overlapping ones.
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp_sm(sample_text)
matches = matcher(doc)

print("🔹 Rule-Based Matches:")
# Sort by start offset so spans are printed in document order regardless of
# the order in which the matcher returns them.
for match_id, start, end in sorted(matches, key=lambda match: match[1]):
    span = doc[start:end]
    print(span.text)

🔹 Rule-Based Matches:


# **Comparison Table**

In [None]:
def get_entities(doc):
    """Collect the entities of a parsed document as ``(text, label)`` tuples.

    Args:
        doc: Object exposing an ``ents`` iterable whose items have ``text``
            and ``label_`` attributes.

    Returns:
        A list of ``(text, label_)`` tuples in the order ``doc.ents`` yields
        them; empty when there are no entities.
    """
    pairs = []
    for entity in doc.ents:
        pairs.append((entity.text, entity.label_))
    return pairs

# One-row table: each column holds the full entity list produced by one model
# for the sample sentence.
docs_by_column = {"Small Model": doc_sm, "Medium Model": doc_md}
comparison_df = pd.DataFrame(
    {column: [get_entities(model_doc)] for column, model_doc in docs_by_column.items()}
)
comparison_df

Unnamed: 0,Small Model,Medium Model
0,"[(EU, ORG), (German, NORP), (British, NORP)]","[(EU, ORG), (German, NORP), (British, NORP)]"


# **Extract & Save NER Entities**

In [None]:
# Extract entities from the first 100 training sentences with the small model
# and save them as one CSV row per entity.
sample_sentences = train_df['sentence'].iloc[:100].tolist()  # explicit positional slice

entities_list = []
# `nlp.pipe` processes the texts as a batched stream and yields the Docs in
# input order, so zipping with the inputs pairs each Doc with its sentence.
for sentence, doc in zip(sample_sentences, nlp_sm.pipe(sample_sentences)):
    for ent in doc.ents:
        entities_list.append({"Sentence": sentence, "Entity": ent.text, "Label": ent.label_})

# Naming the columns explicitly keeps the header row in the CSV even when no
# entities were found (an empty list would otherwise give a column-less frame).
entities_df = pd.DataFrame(entities_list, columns=["Sentence", "Entity", "Label"])
entities_df.to_csv("extracted_entities.csv", index=False)
entities_df.head()

Unnamed: 0,Sentence,Entity,Label
0,EU rejects German call to boycott British lamb .,EU,ORG
1,EU rejects German call to boycott British lamb .,German,NORP
2,EU rejects German call to boycott British lamb .,British,NORP
3,Peter Blackburn,Peter Blackburn,PERSON
4,BRUSSELS 1996-08-22,BRUSSELS,GPE
