Import Libraries

In [1]:
import spacy
from spacy.matcher import Matcher
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from spacy.tokens import Span
from spacy import displacy

Explore Dataset

In [2]:
df = pd.read_csv('Dataset/ner_dataset.csv', encoding = 'latin1')
print(df.head())
print("Shape: ",df.shape)
print(df.isnull().sum())

    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O
Shape:  (1048575, 4)
Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64


Fix Null Values

In [3]:
df['Sentence #'].ffill(inplace=True)
df = df.dropna(subset=['Word'])
print(df.isnull().sum())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sentence #'].ffill(inplace=True)


Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64


In [4]:
print("Unique sentences:", df["Sentence #"].nunique())
print("Entity tags:", df["Tag"].unique())
print(df["Tag"].value_counts())

Unique sentences: 47959
Entity tags: ['O' 'B-geo' 'B-gpe' 'B-per' 'I-geo' 'B-org' 'I-org' 'B-tim' 'B-art'
 'I-art' 'I-per' 'I-gpe' 'I-tim' 'B-nat' 'B-eve' 'I-eve' 'I-nat']
Tag
O        887898
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64


In [5]:
sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
labels = df.groupby("Sentence #")["Tag"].apply(list).tolist()

print("Example sentence:", sentences[0])
print("Example labels:", labels[0])

Example sentence: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
Example labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


Rule Based NER

Test on first 5 rows to check result

In [6]:
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Rules
# Organization
pattern_org = [{"TEXT": {"REGEX": ".*(Inc\\.|Ltd\\.|Corp\\.|University)$"}}]
matcher.add("ORGANIZATION", [pattern_org])

# Person
pattern_person = [{"TEXT": {"REGEX": "(Mr\\.|Mrs\\.|Dr\\.|Prof\\.)"}}, {"IS_TITLE": True}]
matcher.add("PERSON", [pattern_person])

# Countires/Cities
pattern_gpe = [{"LOWER": {"IN": ["pakistan", "india", "china", "london", "usa", "france"]}}]
matcher.add("LOCATION", [pattern_gpe])

# Day
pattern_day = [{"LOWER": {"IN": ["monday","tuesday","wednesday","thursday","friday","saturday","sunday","january","february","march","april","may","june","july","august","september","october","november","december"]}}]
matcher.add("DAY/MONTH", [pattern_day])

# Year
pattern_year = [{"SHAPE": "dddd"}]  
matcher.add("YEAR", [pattern_year])

# Time
pattern_time = [{"TEXT": {"REGEX": "^(\\d{1,2}:\\d{2}|\\d{1,2}(am|pm|AM|PM))$"}}]
matcher.add("TIME", [pattern_time])

# Event
pattern_event = [{"LOWER": {"IN": ["olympics","summit","world cup","conference"]}}]
matcher.add("EVENT", [pattern_event])

# Title in Quotes (artwork)
pattern_art = [{"TEXT": {"REGEX": "^[\"“].+[\"”]$"}}]
matcher.add("ART", [pattern_art])

# Nationalities/Religions/Political groups
pattern_norp = [{"LOWER": {"IN": ["muslim","christian","hindu","republican","democrat","asian","european"]}}]
matcher.add("NATIONALITY/RELIGION/POLITICAL GROUP", [pattern_norp])

sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
sample_sentences = [" ".join(s) for s in sentences[:5]]   

for sent in sample_sentences:
    doc = nlp(sent)
    matches = matcher(doc)
    print(f"\nSentence: {sent}")
    for match_id, start, end in matches:
        span = doc[start:end]
        print(f"Rule matched: {span.text} --> {nlp.vocab.strings[match_id]}")


Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Rule matched: London --> LOCATION

Sentence: Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .
Rule matched: Wednesday --> DAY/MONTH

Sentence: Helicopter gunships Saturday pounded militant hideouts in the Orakzai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South Waziristan .
Rule matched: Saturday --> DAY/MONTH

Sentence: They left after a tense hour-long standoff with riot police .

Sentence: U.N. relief coordinator Jan Egeland said Sunday , U.S. , Indonesian and Australian military helicopters are ferrying out food and supplies to remote areas of western Aceh province that ground crews can not reach .
Rule matched: Sunday --> DAY/MONTH


Apply Rule based NER on whole dataset

In [8]:
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Rules
# Organization
pattern_org = [{"TEXT": {"REGEX": ".*(Inc\\.|Ltd\\.|Corp\\.|University)$"}}]
matcher.add("ORGANIZATION", [pattern_org])

# Person
pattern_person = [{"TEXT": {"REGEX": "(Mr\\.|Mrs\\.|Dr\\.|Prof\\.)"}}, {"IS_TITLE": True}]
matcher.add("PERSON", [pattern_person])

# Countires/Cities
pattern_gpe = [{"LOWER": {"IN": ["pakistan", "india", "china", "london", "usa", "france"]}}]
matcher.add("LOCATION", [pattern_gpe])

# Day
pattern_day = [{"LOWER": {"IN": ["monday","tuesday","wednesday","thursday","friday","saturday","sunday","january","february","march","april","may","june","july","august","september","october","november","december"]}}]
matcher.add("DAY/MONTH", [pattern_day])

# Year
pattern_year = [{"SHAPE": "dddd"}]  
matcher.add("YEAR", [pattern_year])

# Time
pattern_time = [{"TEXT": {"REGEX": "^(\\d{1,2}:\\d{2}|\\d{1,2}(am|pm|AM|PM))$"}}]
matcher.add("TIME", [pattern_time])

# Event
pattern_event = [{"LOWER": {"IN": ["olympics","summit","world cup","conference"]}}]
matcher.add("EVENT", [pattern_event])

# Title in Quotes (artwork)
pattern_art = [{"TEXT": {"REGEX": "^[\"“].+[\"”]$"}}]
matcher.add("ART", [pattern_art])

# Nationalities/Religions/Political groups
pattern_norp = [{"LOWER": {"IN": ["muslim","christian","hindu","republican","democrat","asian","european"]}}]
matcher.add("NATIONALITY/RELIGION/POLITICAL GROUP", [pattern_norp])

sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
all_sentences = [" ".join(s) for s in sentences]

# Apply Rules and save output to seperate file
with open("OutPut_Files/rule_based_output.txt", "w", encoding="utf-8") as f:
    for sent in all_sentences:
        doc = nlp(sent)
        matches = matcher(doc)
        f.write(f"\nSentence: {sent}\n")
        for match_id, start, end in matches:
            span = doc[start:end]
            f.write(f"Rule matched: {span.text} --> {nlp.vocab.strings[match_id]}\n")


Model Based NER

On first five rows

In [None]:
nlp = spacy.load("en_core_web_md")
sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
sample_sentences = [" ".join(s) for s in sentences[:5]] 

for sent in sample_sentences:
    doc = nlp(sent)
    print(f"\nSentence: {sent}")
    for ent in doc.ents:
        print(f"Model detected: {ent.text} --> {ent.label_}")


Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Model detected: Thousands --> CARDINAL
Model detected: London --> GPE
Model detected: Iraq --> GPE
Model detected: British --> NORP

Sentence: Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .
Model detected: Iranian --> NORP
Model detected: Wednesday --> DATE
Model detected: IAEA --> ORG

Sentence: Helicopter gunships Saturday pounded militant hideouts in the Orakzai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South Waziristan .
Model detected: Saturday --> DATE
Model detected: Orakzai --> PERSON
Model detected: Taliban --> ORG
Model detected: South Waziristan --> GPE

Sentence: They left after a tense hour-long standoff with riot police .
Model detected

Model based NER on whole dataset

In [9]:
nlp = spacy.load("en_core_web_md")
sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
all_sentences = [" ".join(s) for s in sentences]

# Apply model and save output
with open("OutPut_Files/model_based_output.txt", "w", encoding="utf-8") as f:
    for sent in all_sentences:
        doc = nlp(sent)
        f.write(f"\nSentence: {sent}\n")
        for ent in doc.ents:
            f.write(f"Model detected: {ent.text} --> {ent.label_}\n")


Model Based NER - HTML Content for entities Visualization

In [12]:
nlp = spacy.load("en_core_web_md")
sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
all_sentences = [" ".join(s) for s in sentences]


html_content = ""
for sent in all_sentences:
    doc = nlp(sent)
    html_content += displacy.render(doc, style="ent", jupyter=False)
    html_content += "<hr>"

with open("OutPut_Files/model_based_output_ALL.html", "w", encoding="utf-8") as f:
    f.write(f"""
    <html>
    <head>
        <title>NER Visualization - Entire Dataset</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .sentence {{ margin-bottom: 30px; border-bottom: 2px solid #ccc; padding-bottom: 20px; }}
        </style>
    </head>
    <body>
        <h1>Named Entity Recognition - Entire Dataset</h1>
        {html_content}
    </body>
    </html>
    """)

print("All sentences visualized! Open 'model_based_output_ALL.html' in your browser.")



All sentences visualized! Open 'model_based_output_ALL.html' in your browser.


Rule Based NER - HTML Content for entities Visualization

In [13]:
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Rules
pattern_org = [{"TEXT": {"REGEX": ".*(Inc\\.|Ltd\\.|Corp\\.|University)$"}}]
matcher.add("ORGANIZATION", [pattern_org])

pattern_person = [{"TEXT": {"REGEX": "(Mr\\.|Mrs\\.|Dr\\.|Prof\\.)"}}, {"IS_TITLE": True}]
matcher.add("PERSON", [pattern_person])

pattern_gpe = [{"LOWER": {"IN": ["pakistan", "india", "china", "london", "usa", "france"]}}]
matcher.add("LOCATION", [pattern_gpe])

pattern_day = [{"LOWER": {"IN": ["monday","tuesday","wednesday","thursday","friday","saturday","sunday","january","february","march","april","may","june","july","august","september","october","november","december"]}}]
matcher.add("DAY/MONTH", [pattern_day])

pattern_year = [{"SHAPE": "dddd"}]  
matcher.add("YEAR", [pattern_year])

pattern_time = [{"TEXT": {"REGEX": "^(\\d{1,2}:\\d{2}|\\d{1,2}(am|pm|AM|PM))$"}}]
matcher.add("TIME", [pattern_time])

pattern_event = [{"LOWER": {"IN": ["olympics","summit","world cup","conference"]}}]
matcher.add("EVENT", [pattern_event])

pattern_art = [{"TEXT": {"REGEX": "^[\"\"].+[\"\"]$"}}]
matcher.add("ART", [pattern_art])

pattern_norp = [{"LOWER": {"IN": ["muslim","christian","hindu","republican","democrat","asian","european"]}}]
matcher.add("NATIONALITY/RELIGION/POLITICAL GROUP", [pattern_norp])

sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
all_sentences = [" ".join(s) for s in sentences]

# HTML content 
html_content = ""
for i, sent in enumerate(all_sentences[:100]):  #First 100 sentences
    doc = nlp(sent)
    matches = matcher(doc)
    
    spans = []
    for match_id, start, end in matches:
        span = Span(doc, start, end, label=nlp.vocab.strings[match_id])
        spans.append(span)
    doc.ents = spans
    
    html_content += f"<h3>Sentence {i+1}:</h3>"
    html_content += displacy.render(doc, style="ent", jupyter=False)
    html_content += "<hr>"

with open("OutPut_Files/rule_based_output_100.html", "w", encoding="utf-8") as f:
    f.write(f"""
    <html>
    <head>
        <title>Rule Based NER Visualization - First 100 Sentences</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
        </style>
    </head>
    <body>
        <h1>Rule Based Named Entity Recognition - First 100 Sentences</h1>
        {html_content}
    </body>
    </html>
    """)

print("Rule Based NER visualization complete! Open 'rule_based_output_100.html' in your browser.")



Rule Based NER visualization complete! Open 'rule_based_output_100.html' in your browser.


Spacy Model Comparison

In [17]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_md = spacy.load("en_core_web_md")

sentences = df.groupby("Sentence #")["Word"].apply(list).tolist()
all_sentences = [" ".join(s) for s in sentences]

with open("OutPut_Files/model_comparison.txt", "w", encoding="utf-8") as f:
    for sent in all_sentences[:500]:  #for 500 sentences
        doc_sm = nlp_sm(sent)
        doc_md = nlp_md(sent)

        f.write(f"\nSentence: {sent}\n")
        f.write("SM Model Entities:\n")
        for ent in doc_sm.ents:
            f.write(f"   {ent.text} --> {ent.label_}\n")

        f.write("MD Model Entities:\n")
        for ent in doc_md.ents:
            f.write(f"   {ent.text} --> {ent.label_}\n")
        sm_count = sum(1 for sent in all_sentences[:500] for ent in nlp_sm(sent).ents)
        md_count = sum(1 for sent in all_sentences[:500] for ent in nlp_md(sent).ents)
    f.write(f"\nSUMMARY: en_core_web_md detected {md_count} entities vs en_core_web_sm detected {sm_count} entities - {'MD model performs better' if md_count > sm_count else 'SM model performs better' if sm_count > md_count else 'Both models detected same number of entities'}")