In [None]:
# ------------------------------
# Task 4: Named Entity Recognition (NER) - FINAL
# ------------------------------

!pip install spacy pandas

!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

import zipfile, os
import spacy
from spacy import displacy
import pandas as pd
from collections import Counter

# ------------------------------
# 1. Extract files from ZIP
# ------------------------------
zip_path = "/content/archive.zip"
extract_path = "/content/archive_data"

# Unpack every member of the archive into the extraction directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(extract_path)

extracted_names = os.listdir(extract_path)
print("Files extracted:", extracted_names)

# Dataset split to analyse; train.txt, valid.txt and test.txt are the options.
file_path = os.path.join(extract_path, "valid.txt")
print("Using file:", file_path)

# ------------------------------
# 2. Read CoNLL-formatted file
# ------------------------------
def read_conll(file_path):
    """Read a CoNLL-formatted file and return its sentences as plain strings.

    Every non-blank line is split on whitespace and must carry at least four
    columns; the first column is taken as the token. Blank lines end the
    current sentence. ``-DOCSTART-`` document-boundary lines are skipped so
    they are not reported as one-word sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of strings, one per sentence, each being the sentence's tokens
        joined by single spaces. Empty if the file holds no usable tokens.
    """
    sentences = []
    tokens = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()

            if not parts:
                # Blank (or whitespace-only) line: sentence boundary.
                if tokens:
                    sentences.append(" ".join(tokens))
                    tokens = []
                continue

            # Lines with fewer than four columns are ignored, as before.
            if len(parts) < 4:
                continue

            # Index instead of unpacking exactly four names, so lines with
            # extra columns no longer raise ValueError.
            word = parts[0]
            if word == "-DOCSTART-":
                # Document separator, not text to run NER on.
                continue
            tokens.append(word)

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(" ".join(tokens))
    return sentences

# Parse the selected split into one plain-text string per sentence.
texts = read_conll(file_path)
sentence_total = len(texts)
print("✅ Loaded", sentence_total, "sentences")

# ------------------------------
# 3. Initialize Rule-based NER
# ------------------------------
nlp_rule = spacy.load("en_core_web_sm")

# (label, lower-cased token) pairs; each becomes a single-token pattern that
# matches on the token's LOWER attribute.
_RULE_TERMS = (
    ("ORG", "google"),
    ("ORG", "microsoft"),
    ("GPE", "pakistan"),
    ("PERSON", "ahsan"),
)
patterns = [
    {"label": rule_label, "pattern": [{"LOWER": rule_term}]}
    for rule_label, rule_term in _RULE_TERMS
]

# The ruler is inserted ahead of the "ner" component of this pipeline.
ruler = nlp_rule.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

# ------------------------------
# 4. Initialize Model-based NER
# ------------------------------
# Load the small model first, then the transformer model, as separate pipelines.
nlp_sm, nlp_trf = map(spacy.load, ("en_core_web_sm", "en_core_web_trf"))

def extract_entities(nlp_model, text):
    """Apply ``nlp_model`` to ``text`` and list the entities it reports.

    Args:
        nlp_model: Callable that takes the text and returns an object exposing
            an ``ents`` iterable whose items have ``text`` and ``label_``.
        text: The string to analyse.

    Returns:
        A ``(entities, doc)`` tuple: ``entities`` is a list of
        ``(entity_text, entity_label)`` pairs in the order ``doc.ents`` yields
        them, and ``doc`` is the object returned by ``nlp_model``.
    """
    parsed = nlp_model(text)
    pairs = []
    for span in parsed.ents:
        pairs.append((span.text, span.label_))
    return pairs, parsed

# ------------------------------
# 5. Process all sentences and collect entity counts
# ------------------------------
# Per-label entity counts for the small and the transformer model.
counter_sm = Counter()
counter_trf = Counter()

# Number of leading sentences that get the printed and rendered preview.
PREVIEW_COUNT = 3

for i, text in enumerate(texts):
    # Model-based small
    entities_sm, doc_sm = extract_entities(nlp_sm, text)
    counter_sm.update(label for _, label in entities_sm)

    # Model-based transformer
    entities_trf, doc_trf = extract_entities(nlp_trf, text)
    counter_trf.update(label for _, label in entities_trf)

    # Optional visualization: only for the first sentences to save time
    if i >= PREVIEW_COUNT:
        continue

    # Rule-based: its result is only shown in the preview and never counted,
    # so the pipeline is run just for the previewed sentences instead of the
    # whole corpus.
    entities_rule, doc_rule = extract_entities(nlp_rule, text)

    print("\n--- SENTENCE", i+1, "---")
    print("Text preview:", text[:200], "...")
    print("Rule-based NER:", entities_rule)
    print("Model-based NER (sm):", entities_sm)
    print("Model-based NER (trf):", entities_trf)
    displacy.render(doc_rule, style="ent", jupyter=True)
    displacy.render(doc_sm, style="ent", jupyter=True)
    displacy.render(doc_trf, style="ent", jupyter=True)

# ------------------------------
# 6. Compare entity counts
# ------------------------------
# Sorted union of the labels seen by either model. Iterating a set of strings
# directly gives an order that can change from one interpreter run to the
# next, which made the table's row index (and the order of tied rows) vary.
all_labels = sorted(set(counter_sm) | set(counter_trf))

df_compare = pd.DataFrame({
    'Entity_Label': all_labels,
    # A label seen by only one model counts as 0 for the other.
    'Count_SM': [counter_sm.get(label, 0) for label in all_labels],
    'Count_TRF': [counter_trf.get(label, 0) for label in all_labels],
})
print("\n✅ Entity count comparison between models:")
# Stable sort keeps tied rows in alphabetical label order.
print(df_compare.sort_values(by='Count_TRF', ascending=False, kind='stable'))


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 83.5 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 457.4/457.4 MB 3.5 MB/s eta 0:00:00
Collecting spacy-curated-tra




--- SENTENCE 2 ---
Text preview: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . ...
Rule-based NER: []
Model-based NER (sm): []
Model-based NER (trf): []





--- SENTENCE 3 ---
Text preview: LONDON 1996-08-30 ...
Rule-based NER: [('LONDON', 'GPE'), ('1996-08-30', 'DATE')]
Model-based NER (sm): [('LONDON', 'GPE'), ('1996-08-30', 'DATE')]
Model-based NER (trf): [('LONDON', 'GPE'), ('1996-08-30', 'DATE')]



✅ Entity count comparison between models:
   Entity_Label  Count_SM  Count_TRF
7      CARDINAL      1707       2053
6        PERSON      1568       1890
13          GPE      1605       1886
1          DATE      1505       1318
2           ORG       991        834
16         NORP       562        600
10      ORDINAL       267        326
4         MONEY       140        162
5         EVENT        76        142
15         TIME       104        115
3      QUANTITY        88         80
0       PERCENT        80         78
17          FAC        43         72
14          LOC        71         63
11  WORK_OF_ART        32         26
12      PRODUCT        52         19
9           LAW        23          4
8      LANGUAGE        11          2
