# Named Entity Recognition Solution

In [None]:
# Import necessary libraries

import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset
from transformers import BertForTokenClassification, BertTokenizerFast
import torch
from transformers import Trainer, TrainingArguments
from transformers import pipeline

# Loading Dataset and data Pre-Processing

In [None]:
# Ensure data directory exists
os.makedirs("data", exist_ok=True)

# Load dataset
file_path = "data/ner_dataset.csv"
df = pd.read_csv(file_path, encoding="unicode_escape")
df.head()


In [None]:
df = df[['Sentence #', 'Word', 'Tag']]
df.head()

In [None]:
# ----  Missing value check ----------

df.isnull().sum()

In [None]:
# ----- Handling missing values

# Fill missing "Sentence #" values (forward fill)
df['Sentence #'].fillna(method='ffill', inplace=True)

# Drop missing words
df.dropna(subset=['Word'], inplace=True)

# Step 3: Replace missing tags with "O" (Outside entity)
df['Tag'].fillna("O", inplace=True)

# Group words and tags into sentences
grouped = df.groupby("Sentence #").agg(lambda x: list(x))
grouped.head()

In [None]:
# Split the dataset into training, testing and validation data

train_data, test_data = train_test_split(grouped, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.125, random_state=42)  # 10% validation

# Reset index
train_data, val_data, test_data = train_data.reset_index(drop=True), val_data.reset_index(drop=True), test_data.reset_index(drop=True)

# Creating Labels and model training

In [2]:
# Load Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Create label mappings
unique_tags = sorted(list(set(tag for tags in grouped["Tag"] for tag in tags)))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# Save label mappings
import pickle
with open("data/tag2id.pkl", "wb") as f:
    pickle.dump(tag2id, f)
with open("data/id2tag.pkl", "wb") as f:
    pickle.dump(id2tag, f)


In [None]:
# NER Dataset Class
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, tag2id, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.tag2id = tag2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        words = self.data.iloc[index]["Word"]
        tags = self.data.iloc[index]["Tag"]

        # Tokenize and map labels
        encoding = self.tokenizer(words, is_split_into_words=True, padding="max_length",
                                  truncation=True, max_length=self.max_len, return_tensors="pt")
        
        # Convert tags to IDs
        tag_ids = [self.tag2id[tag] for tag in tags] + [self.tag2id["O"]] * (self.max_len - len(tags))

        encoding["labels"] = torch.tensor(tag_ids[:self.max_len])

        return {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension

In [None]:
# Prepare datasets
train_dataset = NERDataset(train_data, tokenizer, tag2id)
val_dataset = NERDataset(val_data, tokenizer, tag2id)
test_dataset = NERDataset(test_data, tokenizer, tag2id)

In [3]:
# Define model path
MODEL_PATH = "data/ner_model"
os.makedirs(MODEL_PATH, exist_ok=True)

# Load pre-trained BERT for token classification
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2id))

# Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    per_device_train_batch_size=4,  # Reduce batch size for memory efficiency
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save trained model
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)
print("Model training complete and saved at:", MODEL_PATH)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0296,0.028804
2,0.0241,0.021714
3,0.0119,0.019274


Model training complete and saved at: data/ner_model


# Testing

In [4]:
# Load the trained model for inference
ner_pipeline = pipeline("ner", model=MODEL_PATH, tokenizer=MODEL_PATH)

# Sample inference
sample_sentence = "Elon Musk founded SpaceX in 2002 in California."
ner_results = ner_pipeline(sample_sentence)

print(ner_results)

Device set to use mps:0


[{'entity': 'LABEL_14', 'score': 0.9947083, 'index': 1, 'word': 'El', 'start': 0, 'end': 2}, {'entity': 'LABEL_16', 'score': 0.99945503, 'index': 2, 'word': '##on', 'start': 2, 'end': 4}, {'entity': 'LABEL_5', 'score': 0.49935323, 'index': 3, 'word': 'Mu', 'start': 5, 'end': 7}, {'entity': 'LABEL_16', 'score': 0.9850702, 'index': 4, 'word': '##sk', 'start': 7, 'end': 9}, {'entity': 'LABEL_7', 'score': 0.9955851, 'index': 5, 'word': 'founded', 'start': 10, 'end': 17}, {'entity': 'LABEL_16', 'score': 0.99072886, 'index': 6, 'word': 'Space', 'start': 18, 'end': 23}, {'entity': 'LABEL_2', 'score': 0.9563591, 'index': 7, 'word': '##X', 'start': 23, 'end': 24}, {'entity': 'LABEL_16', 'score': 0.99983764, 'index': 8, 'word': 'in', 'start': 25, 'end': 27}, {'entity': 'LABEL_16', 'score': 0.9999869, 'index': 9, 'word': '2002', 'start': 28, 'end': 32}, {'entity': 'LABEL_16', 'score': 0.9999913, 'index': 10, 'word': 'in', 'start': 33, 'end': 35}, {'entity': 'LABEL_16', 'score': 0.9999927, 'index'

In [7]:
# Test 2
sample_sentence = "Welcome to California."
ner_results = ner_pipeline(sample_sentence)
ner_results

[{'entity': 'LABEL_16',
  'score': 0.9998516,
  'index': 1,
  'word': 'Welcome',
  'start': 0,
  'end': 7},
 {'entity': 'LABEL_2',
  'score': 0.9891896,
  'index': 2,
  'word': 'to',
  'start': 8,
  'end': 10},
 {'entity': 'LABEL_16',
  'score': 0.9999844,
  'index': 3,
  'word': 'California',
  'start': 11,
  'end': 21},
 {'entity': 'LABEL_16',
  'score': 0.9999927,
  'index': 4,
  'word': '.',
  'start': 21,
  'end': 22}]

# Prediction on new data

In [9]:
# Load model and tokenizer
loaded_model = BertForTokenClassification.from_pretrained(MODEL_PATH)
loaded_tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)

# Load label mappings
with open("data/id2tag.pkl", "rb") as f:
    id2tag = pickle.load(f)

# Function to predict NER tags
def predict_ner(sentence):
    tokens = loaded_tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    outputs = loaded_model(**tokens)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()
    
    # Convert predictions to labels
    predicted_tags = [id2tag[p] for p in predictions]

    return list(zip(sentence.split(), predicted_tags[:len(sentence.split())]))

# Example Prediction
test_sentence = "Barack Obama was born in Hawaii."
test_sentence = "Elon Musk founded SpaceX in 2002 in California."
print(predict_ner(test_sentence))

[('Elon', 'B-per'), ('Musk', 'I-per'), ('founded', 'O'), ('SpaceX', 'B-org'), ('in', 'O'), ('2002', 'B-tim'), ('in', 'O'), ('California.', 'B-geo')]


# Model Evaluation

In [14]:
# from seqeval.metrics import classification_report
# import numpy as np

# # Function to evaluate the model
# def evaluate_ner(model, dataset, tokenizer, id2tag):
#     model.eval()  # Set model to evaluation mode

#     true_labels, pred_labels = [], []

#     for example in dataset:
#         tokens = {key: val.unsqueeze(0) for key, val in example.items() if key != "labels"}  # Prepare input
#         labels = example["labels"].tolist()

#         # Get model predictions
#         with torch.no_grad():
#             outputs = model(**tokens)
#             predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

#         # Convert label IDs to tag names
#         true_labels.append([id2tag[label] for label in labels if label != -100])  # Ignore padding tokens
#         pred_labels.append([id2tag[pred] for pred in predictions if pred != -100])  # Ignore padding tokens

#     # Print the classification report
#     print(classification_report(true_labels, pred_labels, digits=4))

# # Load the trained model
# model = BertForTokenClassification.from_pretrained("data/ner_model")
# tokenizer = BertTokenizerFast.from_pretrained("data/ner_model")

# # Load label mappings
# import pickle
# with open("data/id2tag.pkl", "rb") as f:
#     id2tag = pickle.load(f)

# # Evaluate on the validation dataset
# evaluate_ner(model, val_dataset, tokenizer, id2tag)


              precision    recall  f1-score   support

         art     0.1034    0.0566    0.0732        53
         eve     0.2444    0.3143    0.2750        35
         geo     0.8285    0.8700    0.8487      3792
         gpe     0.9396    0.9222    0.9308      1517
         nat     0.3636    0.4444    0.4000         9
         org     0.6894    0.6812    0.6853      1929
         per     0.7468    0.7659    0.7562      1683
         tim     0.8390    0.8304    0.8347      2052

   micro avg     0.8043    0.8151    0.8096     11070
   macro avg     0.5943    0.6106    0.6005     11070
weighted avg     0.8033    0.8151    0.8089     11070

