# Named Entity Recognition Solution

In [1]:
# Import necessary libraries

import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset
from transformers import BertForTokenClassification, BertTokenizerFast
import torch
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import TrainingArguments, Trainer, AutoModelForTokenClassification
from transformers import AutoTokenizer
from torch.utils.data import Dataset



# Loading Dataset and data Pre-Processing

In [2]:
# Ensure data directory exists
os.makedirs("data", exist_ok=True)

# Load dataset
file_path = "data/ner_dataset.csv"
df = pd.read_csv(file_path, encoding="unicode_escape")
df.head()


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
df = df[['Sentence #', 'Word', 'Tag']]
df.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,,of,O
2,,demonstrators,O
3,,have,O
4,,marched,O


In [4]:
# ----  Missing value check ----------

df.isnull().sum()

Sentence #    1000616
Word               10
Tag                 0
dtype: int64

In [5]:
# ----- Handling missing values

# Fill missing "Sentence #" values (forward fill)
df['Sentence #'].fillna(method='ffill', inplace=True)

# Drop missing words
df.dropna(subset=['Word'], inplace=True)

# Step 3: Replace missing tags with "O" (Outside entity)
df['Tag'].fillna("O", inplace=True)

# Group words and tags into sentences
grouped = df.groupby("Sentence #").agg(lambda x: list(x))
grouped.head()

Unnamed: 0_level_0,Word,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."


In [6]:
# Split the dataset into training, testing and validation data

train_data, test_data = train_test_split(grouped, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.125, random_state=42)  # 10% validation

# Reset index
train_data, val_data, test_data = train_data.reset_index(drop=True), val_data.reset_index(drop=True), test_data.reset_index(drop=True)

# Creating Labels and model training

In [7]:
# Load RoBERTa Tokenizer
MODEL_NAME = "roberta-base"  
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)


# Create label mappings
unique_tags = sorted(set(tag for tags in grouped["Tag"] for tag in tags))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# Save label mappings
import pickle
# Ensure data directory exists
os.makedirs("data_roberta", exist_ok=True)

# Save label mappings (Separate from BERT)
with open("data_roberta/tag2id_roberta.pkl", "wb") as f:
    pickle.dump(tag2id, f)
with open("data_roberta/id2tag_roberta.pkl", "wb") as f:
    pickle.dump(id2tag, f)


In [8]:
# NER Dataset Class
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, tag2id, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.tag2id = tag2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        words = self.data.iloc[index]["Word"]
        tags = self.data.iloc[index]["Tag"]

        # Tokenize and map labels
        encoding = self.tokenizer(words, is_split_into_words=True, padding="max_length",
                                  truncation=True, max_length=self.max_len, return_tensors="pt")

        # Convert tags to IDs
        tag_ids = [self.tag2id[tag] for tag in tags] + [self.tag2id["O"]] * (self.max_len - len(tags))

        encoding["labels"] = torch.tensor(tag_ids[:self.max_len])

        return {key: val.squeeze() for key, val in encoding.items()}  # Remove batch dimension

In [9]:
# Prepare datasets
train_dataset = NERDataset(train_data, tokenizer, tag2id)
val_dataset = NERDataset(val_data, tokenizer, tag2id)
test_dataset = NERDataset(test_data, tokenizer, tag2id)

In [10]:

# Define model save path
MODEL_PATH = "models_roberta"
os.makedirs(MODEL_PATH, exist_ok=True)

# Load pre-trained RoBERTa for token classification
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(tag2id))

# Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save trained model separately
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

print(f"RoBERTa model training complete and saved at: {MODEL_PATH}")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0271,0.027024
2,0.0204,0.019479
3,0.0117,0.017674


RoBERTa model training complete and saved at: models_roberta


# Testing

In [13]:
from transformers import pipeline
import torch.nn.functional as F

# Load the trained RoBERTa model for inference
model_path = "models_roberta"
ner_pipeline = pipeline("ner", model=model_path, tokenizer=model_path)

def predict_ner(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

    with torch.no_grad():
        outputs = model(**tokens)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        max_probs, predictions = torch.max(probs, dim=-1)

    # Load label mappings
    with open("data_roberta/id2tag_roberta.pkl", "rb") as f:
        id2tag = pickle.load(f)

    # Convert predictions to labels
    predicted_tags = [id2tag[p] for p in predictions.squeeze().tolist()][:len(text.split())]
    confidence_scores = max_probs.squeeze().tolist()[:len(text.split())]

    return list(zip(text.split(), predicted_tags, confidence_scores))

# Example Prediction
test_sentence = "Barack Obama was born in Hawaii."
print(predict_ner(test_sentence))

Device set to use mps:0


[('Barack', 'B-per', 0.9990480542182922), ('Obama', 'I-per', 0.9967117309570312), ('was', 'O', 0.9999501705169678), ('born', 'O', 0.9999850988388062), ('in', 'O', 0.999919056892395), ('Hawaii.', 'B-geo', 0.9960993528366089)]


In [12]:
import torch

device = torch.device("cpu")  # Force CPU usage

# Move model to CPU
model.to(device)

# Example: Ensure tensors are also on CPU
tokens = tokenizer("Barack Obama was born in Hawaii.", return_tensors="pt")
tokens = {key: val.to(device) for key, val in tokens.items()}  # Move tensors to CPU

# Make predictions
with torch.no_grad():
    outputs = model(**tokens)


In [14]:
# Test 2
sample_sentence = "Welcome to California."
ner_results = ner_pipeline(sample_sentence)
ner_results

[{'entity': 'LABEL_16',
  'score': 0.99958795,
  'index': 1,
  'word': 'ĠWelcome',
  'start': 0,
  'end': 7},
 {'entity': 'LABEL_2',
  'score': 0.9826774,
  'index': 2,
  'word': 'Ġto',
  'start': 8,
  'end': 10},
 {'entity': 'LABEL_16',
  'score': 0.9995703,
  'index': 3,
  'word': 'ĠCalifornia',
  'start': 11,
  'end': 21},
 {'entity': 'LABEL_16',
  'score': 0.9987747,
  'index': 4,
  'word': '.',
  'start': 21,
  'end': 22}]

# Prediction on new data

In [15]:
import torch
import pickle
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Define RoBERTa Model Path
MODEL_PATH = "models_roberta"  # Ensure this is the correct directory for your RoBERTa model

# Load model and tokenizer (RoBERTa)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
loaded_model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH).to(device)
loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, add_prefix_space=True)  # Required for RoBERTa

# Load label mappings
with open("data_roberta/id2tag_roberta.pkl", "rb") as f:
    id2tag = pickle.load(f)

# Function to predict NER tags with RoBERTa
def predict_ner(sentence):
    tokens = loaded_tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    tokens = {key: val.to(device) for key, val in tokens.items()}  # Ensure tensors are on the same device
    
    with torch.no_grad():
        outputs = loaded_model(**tokens)
        predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

    # Convert predictions to labels
    predicted_tags = [id2tag[p] for p in predictions][:len(sentence.split())]

    return list(zip(sentence.split(), predicted_tags))

# Example Prediction
test_sentences = [
    "Barack Obama was born in Hawaii.",
    "Elon Musk founded SpaceX in 2002 in California."
]

for sentence in test_sentences:
    print(predict_ner(sentence))


[('Barack', 'B-per'), ('Obama', 'I-per'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii.', 'B-geo')]
[('Elon', 'B-per'), ('Musk', 'I-per'), ('founded', 'O'), ('SpaceX', 'B-org'), ('in', 'O'), ('2002', 'B-tim'), ('in', 'O'), ('California.', 'B-geo')]


# Model Evaluation

In [16]:
from seqeval.metrics import classification_report
import numpy as np
import torch
import pickle
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Define RoBERTa Model Path
MODEL_PATH = "models_roberta"  # Ensure this is the correct directory for your RoBERTa model

# Load model and tokenizer (RoBERTa)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, add_prefix_space=True)  # Required for RoBERTa

# Load label mappings
with open("data_roberta/id2tag_roberta.pkl", "rb") as f:
    id2tag = pickle.load(f)

# Function to evaluate the model
def evaluate_ner(model, dataset, tokenizer, id2tag):
    model.eval()  # Set model to evaluation mode

    true_labels, pred_labels = [], []

    for example in dataset:
        # Move inputs to the correct device
        tokens = {key: val.unsqueeze(0).to(device) for key, val in example.items() if key != "labels"}
        labels = example["labels"].tolist()

        # Get model predictions
        with torch.no_grad():
            outputs = model(**tokens)
            predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

        # Convert label IDs to tag names
        true_labels.append([id2tag[label] for label in labels if label != -100])  # Ignore padding tokens
        pred_labels.append([id2tag[pred] for pred in predictions if pred != -100])  # Ignore padding tokens

    # Print the classification report
    print(classification_report(true_labels, pred_labels, digits=4))

# Evaluate on the validation dataset
evaluate_ner(model, val_dataset, tokenizer, id2tag)


              precision    recall  f1-score   support

         art     0.3000    0.0566    0.0952        53
         eve     0.4000    0.3429    0.3692        35
         geo     0.8338    0.8813    0.8569      3792
         gpe     0.9492    0.9354    0.9422      1517
         nat     0.3333    0.4444    0.3810         9
         org     0.7205    0.6895    0.7046      1929
         per     0.7652    0.7689    0.7670      1683
         tim     0.8498    0.8489    0.8493      2052

   micro avg     0.8209    0.8262    0.8235     11070
   macro avg     0.6440    0.6210    0.6207     11070
weighted avg     0.8181    0.8262    0.8214     11070



# Observations

 RoBERTa performed slightly better than BERT overall
Biggest improvements in event (eve), organization (org), and geopolitical (gpe) entities
Artifacts (art) still struggle due to low dataset representation
RoBERTa performs slightly worse on nationality (nat) due to very low support (only 9 samples)


RoBERTa is slightly better than BERT-cased, especially for event detection, organizations, and geopolitical entities. However, the improvement is not drastic—it mainly enhances rare entity recognition.