In [None]:
# ------------------------------
# Task 4: Named Entity Recognition (NER) from News Articles
# Full final code for CoNLL files
# ------------------------------

# Install the libraries imported below (spaCy for NER, plus pandas).
!pip install spacy pandas

# Download the two pretrained English pipelines that are later loaded with
# spacy.load(): the small model and the transformer model.
# NOTE: the transformer wheel is a large download (~457 MB), and spaCy asks
# for a kernel/runtime restart after installation so the new packages load.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

import spacy
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher
import pandas as pd
import os
import zipfile

# ------------------------------
# 1. Extract files from ZIP
# ------------------------------
# Location of the uploaded archive and the directory it is unpacked into.
zip_path = "/content/archive.zip"
extract_path = "/content/archive_data"

# Unpack every member of the archive; the context manager closes the
# ZIP handle as soon as extraction finishes.
with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=extract_path)

extracted_names = os.listdir(extract_path)
print("Files extracted:", extracted_names)

# NER is run on a single split of the dataset; valid.txt is the one chosen here.
file_path = os.path.join(extract_path, "valid.txt")
print("Using file:", file_path)

# ------------------------------
# 2. Read CoNLL-formatted file
# ------------------------------
def read_conll(file_path):
    """Read a CoNLL-formatted file and return its sentences as plain text.

    Every non-blank line is expected to contain at least four
    whitespace-separated columns with the token in the first one
    (CoNLL-2003 layout: ``word POS CHUNK NER``). Blank lines separate
    sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of strings, one per sentence, each built by joining the
        sentence's tokens with single spaces. ``-DOCSTART-`` document
        boundary markers and lines with fewer than four columns are skipped.
    """
    sentences = []
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # A blank line closes the sentence collected so far.
                if tokens:
                    sentences.append(" ".join(tokens))
                    tokens = []
                continue
            # Lines with fewer than four columns are not token rows.
            if len(parts) < 4:
                continue
            # Document boundary marker, not a real token; without this check
            # it would be returned as a one-word "sentence".
            if parts[0] == "-DOCSTART-":
                continue
            # Index instead of unpacking into four names so that rows with
            # extra columns do not raise ValueError.
            tokens.append(parts[0])
    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(" ".join(tokens))
    return sentences

texts = read_conll(file_path)
print("✅ Loaded", len(texts), "sentences")

# ------------------------------
# 3. Rule-based NER (EntityRuler)
# ------------------------------
# A separate copy of the small pipeline gets an EntityRuler placed ahead of
# the statistical "ner" component.
nlp_rule = spacy.load("en_core_web_sm")
ruler = nlp_rule.add_pipe("entity_ruler", before="ner")

# Each rule matches one token case-insensitively (via LOWER) and assigns
# the given entity label.
patterns = [
    {"label": entity_label, "pattern": [{"LOWER": lowered_token}]}
    for entity_label, lowered_token in (
        ("ORG", "google"),
        ("ORG", "microsoft"),
        ("GPE", "pakistan"),
        ("PERSON", "ahsan"),
    )
]
ruler.add_patterns(patterns)

# ------------------------------
# 4. Model-based NER with two SpaCy models
# ------------------------------
# Unmodified small and transformer pipelines, used for comparison.
nlp_sm = spacy.load("en_core_web_sm")
nlp_trf = spacy.load("en_core_web_trf")

def extract_entities(nlp_model, text):
    """Run *nlp_model* on *text* and collect the entities it finds.

    Args:
        nlp_model: Callable pipeline; calling it on a string must return an
            object exposing an ``ents`` iterable whose items have ``text``
            and ``label_`` attributes.
        text: The string to analyse.

    Returns:
        A ``(entities, doc)`` tuple, where ``entities`` is a list of
        ``(entity_text, entity_label)`` pairs in the order given by
        ``doc.ents`` and ``doc`` is the object returned by the pipeline.
    """
    parsed = nlp_model(text)
    labelled_spans = []
    for span in parsed.ents:
        labelled_spans.append((span.text, span.label_))
    return labelled_spans, parsed

# ------------------------------
# 5. Process first 5 sentences and visualize
# ------------------------------
# Each pipeline is paired with the heading printed in front of its entities;
# the order here fixes both the print order and the render order.
labelled_pipelines = (
    ("Rule-based NER:", nlp_rule),
    ("Model-based NER (sm):", nlp_sm),
    ("Model-based NER (trf):", nlp_trf),
)

for position, sentence in enumerate(texts[:5], start=1):
    print("\n--- SENTENCE", position, "---")
    print("Text:", sentence)

    # Run every pipeline first and print its entities, keeping the parsed
    # docs so that all text output appears before any visualisation.
    annotated_docs = []
    for heading, pipeline in labelled_pipelines:
        found_entities, annotated = extract_entities(pipeline, sentence)
        print(heading, found_entities)
        annotated_docs.append(annotated)

    # Visualization: one entity rendering per pipeline, in the same order.
    for annotated in annotated_docs:
        displacy.render(annotated, style="ent", jupyter=True)

print("✅ NER processing completed")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installati




--- SENTENCE 2 ---
Text: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .
Rule-based NER: []
Model-based NER (sm): []
Model-based NER (trf): []





--- SENTENCE 3 ---
Text: LONDON 1996-08-30
Rule-based NER: [('LONDON', 'GPE'), ('1996-08-30', 'DATE')]
Model-based NER (sm): [('LONDON', 'GPE'), ('1996-08-30', 'DATE')]
Model-based NER (trf): [('LONDON', 'GPE'), ('1996-08-30', 'DATE')]



--- SENTENCE 4 ---
Text: West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .
Rule-based NER: [('West Indian', 'NORP'), ('Phil Simmons', 'PERSON'), ('four', 'CARDINAL'), ('38', 'CARDINAL'), ('Friday', 'DATE'), ('Leicestershire', 'FAC'), ('Somerset', 'GPE'), ('39', 'CARDINAL'), ('two days', 'DATE')]
Model-based NER (sm): [('West Indian', 'NORP'), ('Phil Simmons', 'PERSON'), ('four', 'CARDINAL'), ('38', 'CARDINAL'), ('Friday', 'DATE'), ('Leicestershire', 'FAC'), ('Somerset', 'GPE'), ('39', 'CARDINAL'), ('two days', 'DATE')]
Model-based NER (trf): [('West Indian', 'NORP'), ('Phil Simmons', 'PERSON'), ('four', 'CARDINAL'), ('38', 'CARDINAL'), ('Friday', 'DATE'), ('Leicestershire', 'ORG'), ('Somerset', 'ORG'), ('39', 'CARDINAL'), ('two days', 'DATE')]



--- SENTENCE 5 ---
Text: Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .
Rule-based NER: [('Derbyshire', 'PERSON'), ('Surrey', 'PERSON'), ('Kent', 'PERSON'), ('Nottinghamshire', 'PERSON')]
Model-based NER (sm): [('Derbyshire', 'PERSON'), ('Surrey', 'PERSON'), ('Kent', 'PERSON'), ('Nottinghamshire', 'PERSON')]
Model-based NER (trf): [('Essex', 'ORG'), ('Derbyshire', 'ORG'), ('Surrey', 'ORG'), ('Kent', 'ORG'), ('Nottinghamshire', 'GPE')]


✅ NER processing completed
