In [4]:
# Named Entity Recognition on News Articles

'''This project implements a Named Entity Recognition (NER) 
system using spaCy to identify entities such as persons, 
organizations, locations, and dates from news articles.'''


'This project implements a Named Entity Recognition (NER) \nsystem using spaCy to identify entities such as persons, \norganizations, locations, and dates from news articles.'

In [6]:
import json
import random
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.training import Example


In [7]:
with open("data/labeled_news.json", "r") as f:
    data = json.load(f)

data


[{'text': 'Apple was founded by Steve Jobs in California in 1976.',
  'entities': [[0, 5, 'ORG'],
   [21, 31, 'PERSON'],
   [35, 45, 'GPE'],
   [49, 53, 'DATE']]},
 {'text': 'Microsoft acquired LinkedIn for 26 billion dollars.',
  'entities': [[0, 9, 'ORG'], [19, 27, 'ORG']]},
 {'text': 'Google announced new AI tools in Paris.',
  'entities': [[0, 6, 'ORG'], [35, 40, 'GPE']]},
 {'text': 'Amazon opened a new warehouse in Hyderabad.',
  'entities': [[0, 6, 'ORG'], [33, 42, 'GPE']]},
 {'text': 'Elon Musk is the CEO of Tesla.',
  'entities': [[0, 9, 'PERSON'], [24, 29, 'ORG']]}]

In [8]:
df = pd.DataFrame(data)
df


Unnamed: 0,text,entities
0,Apple was founded by Steve Jobs in California ...,"[[0, 5, ORG], [21, 31, PERSON], [35, 45, GPE],..."
1,Microsoft acquired LinkedIn for 26 billion dol...,"[[0, 9, ORG], [19, 27, ORG]]"
2,Google announced new AI tools in Paris.,"[[0, 6, ORG], [35, 40, GPE]]"
3,Amazon opened a new warehouse in Hyderabad.,"[[0, 6, ORG], [33, 42, GPE]]"
4,Elon Musk is the CEO of Tesla.,"[[0, 9, PERSON], [24, 29, ORG]]"


In [9]:
## Dataset Description

'''The dataset consists of labeled news sentences where named 
entities such as PERSON, ORG, GPE, and DATE are annotated using character offsets. 
This data is used to train and evaluate a Named Entity Recognition model.'''


'The dataset consists of labeled news sentences where named \nentities such as PERSON, ORG, GPE, and DATE are annotated using character offsets. \nThis data is used to train and evaluate a Named Entity Recognition model.'

In [10]:
import spacy

# Load pretrained English model
nlp = spacy.load("en_core_web_sm")


In [11]:
text = "Apple was founded by Steve Jobs in California in 1976."
doc = nlp(text)

[(ent.text, ent.label_) for ent in doc.ents]


[('Apple', 'ORG'),
 ('Steve Jobs', 'PERSON'),
 ('California', 'GPE'),
 ('1976', 'DATE')]

In [12]:
## Baseline NER using Pretrained spaCy Model

'''A pretrained spaCy NER model was used to identify named entities 
from news text. This serves as a baseline to compare performance 
before fine-tuning on domain-specific data.'''


'A pretrained spaCy NER model was used to identify named entities \nfrom news text. This serves as a baseline to compare performance \nbefore fine-tuning on domain-specific data.'

In [13]:
(text, {"entities": [...]})


('Apple was founded by Steve Jobs in California in 1976.',
 {'entities': [Ellipsis]})

In [14]:
TRAIN_DATA = []

for item in data:
    TRAIN_DATA.append(
        (item["text"], {"entities": item["entities"]})
    )

TRAIN_DATA


[('Apple was founded by Steve Jobs in California in 1976.',
  {'entities': [[0, 5, 'ORG'],
    [21, 31, 'PERSON'],
    [35, 45, 'GPE'],
    [49, 53, 'DATE']]}),
 ('Microsoft acquired LinkedIn for 26 billion dollars.',
  {'entities': [[0, 9, 'ORG'], [19, 27, 'ORG']]}),
 ('Google announced new AI tools in Paris.',
  {'entities': [[0, 6, 'ORG'], [35, 40, 'GPE']]}),
 ('Amazon opened a new warehouse in Hyderabad.',
  {'entities': [[0, 6, 'ORG'], [33, 42, 'GPE']]}),
 ('Elon Musk is the CEO of Tesla.',
  {'entities': [[0, 9, 'PERSON'], [24, 29, 'ORG']]})]

In [15]:
# Train-Test split

In [16]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    TRAIN_DATA, test_size=0.2, random_state=42
)

len(train_data), len(test_data)


(4, 1)

In [17]:
nlp = spacy.blank("en")

# Add NER pipeline
ner = nlp.add_pipe("ner")


In [18]:
for _, annotations in TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])


In [19]:
#Ner Model

In [20]:
from spacy.training import Example
import random

optimizer = nlp.initialize()

for epoch in range(15):
    random.shuffle(train_data)
    losses = {}
    
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    
    print(f"Epoch {epoch+1} - Losses: {losses}")




Epoch 1 - Losses: {'ner': np.float32(30.580788)}
Epoch 2 - Losses: {'ner': np.float32(26.054432)}
Epoch 3 - Losses: {'ner': np.float32(15.128336)}
Epoch 4 - Losses: {'ner': np.float32(11.235793)}
Epoch 5 - Losses: {'ner': np.float32(6.3508987)}
Epoch 6 - Losses: {'ner': np.float32(16.494062)}
Epoch 7 - Losses: {'ner': np.float32(5.570075)}
Epoch 8 - Losses: {'ner': np.float32(3.5467112)}
Epoch 9 - Losses: {'ner': np.float32(1.944821)}
Epoch 10 - Losses: {'ner': np.float32(0.59164566)}
Epoch 11 - Losses: {'ner': np.float32(0.3624523)}
Epoch 12 - Losses: {'ner': np.float32(0.13479827)}
Epoch 13 - Losses: {'ner': np.float32(0.009880613)}
Epoch 14 - Losses: {'ner': np.float32(0.0020644509)}
Epoch 15 - Losses: {'ner': np.float32(0.00017905756)}


In [21]:
#Evaluate Model (Precision, Recall, F1)

In [22]:
tp, fp, fn = 0, 0, 0

for text, annotations in test_data:
    doc = nlp(text)
    
    predicted = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
    actual = set(tuple(ent) for ent in annotations["entities"])
    
    tp += len(predicted & actual)
    fp += len(predicted - actual)
    fn += len(actual - predicted)

precision = tp / (tp + fp) if (tp + fp) else 0
recall = tp / (tp + fn) if (tp + fn) else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

precision, recall, f1


(0, 0.0, 0)

In [23]:
with open("reports/evaluation_metrics.txt", "w") as f:
    f.write(f"Precision: {precision}\n")
    f.write(f"Recall: {recall}\n")
    f.write(f"F1 Score: {f1}\n")


In [None]:
## Model Evaluation

'''The trained NER model was evaluated using precision, 
recall, and F1-score on a held-out test set. These metrics 
indicate how accurately the model identifies named entities.'''


In [24]:
#Annotate New Articles (REAL-WORLD USAGE)

In [25]:
with open("data/new_articles.txt", "r") as f:
    articles = f.readlines()

annotated_articles = []

for text in articles:
    doc = nlp(text)
    annotated_text = text.strip()
    
    for ent in doc.ents:
        annotated_text = annotated_text.replace(
            ent.text, f"[{ent.label_}]{ent.text}[/{ent.label_}]"
        )
    
    annotated_articles.append(annotated_text)

annotated_articles


['[ORG]Tesla[/ORG] announced a new factory in [DATE]Germany[/DATE].',
 '[ORG]Google[/ORG] hired new engineers in [DATE]India[/DATE].',
 '[ORG]Apple[/ORG] is planning to launch a product in [DATE]2025[/DATE].']

In [26]:
with open("outputs/annotated_articles.txt", "w") as f:
    for line in annotated_articles:
        f.write(line + "\n")


In [28]:
#Conclusion

'''In this project, an end-to-end Named Entity Recognition pipeline 
was built using spaCy. The model was trained on labeled news articles, 
evaluated using standard metrics, and applied to annotate unseen news data. 
This project demonstrates practical NLP model development and evaluation skills.'''

'In this project, an end-to-end Named Entity Recognition pipeline \nwas built using spaCy. The model was trained on labeled news articles, \nevaluated using standard metrics, and applied to annotate unseen news data. \nThis project demonstrates practical NLP model development and evaluation skills.'