In [61]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch
from collections import Counter
import matplotlib.pyplot as plt
from seqeval.metrics import classification_report

In [62]:
# Step 2: Load the dataset
file_path = "ner dataset/data/ner_dataset.csv"  # Replace with your file path
data = pd.read_csv(file_path, encoding='latin1', low_memory=True)

In [63]:
# Drop the POS column
data = data.drop(columns=["POS"])
data = data.dropna(subset=['Word'])
data["Sentence #"] = data["Sentence #"].fillna(method="ffill")
data = data[~data.applymap(lambda x: '�' in str(x)).any(axis=1)]

  data["Sentence #"] = data["Sentence #"].fillna(method="ffill")
  data = data[~data.applymap(lambda x: '�' in str(x)).any(axis=1)]


In [64]:
# Step 3: Preprocess the dataset
tag2id = {tag: idx for idx, tag in enumerate(sorted(data['Tag'].unique(), reverse=True))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

result = data.groupby("Sentence #").agg(
    Sentence=('Word', ' '.join),
    Tags=('Tag', ' '.join)
).reset_index()

rows_with_issues = result.apply(
    lambda row: len(row['Sentence'].split()) != len(row['Tags'].split()), axis=1
)
cleaned_data = result[~rows_with_issues]

cleaned_data=cleaned_data[:10000]


In [65]:
print(cleaned_data.head())

print(cleaned_data.info)

        Sentence #                                           Sentence  \
0      Sentence: 1  Thousands of demonstrators have marched throug...   
1     Sentence: 10  Iranian officials say they expect to get acces...   
2    Sentence: 100  Helicopter gunships Saturday pounded militant ...   
3   Sentence: 1000  They left after a tense hour-long standoff wit...   
4  Sentence: 10000  U.N. relief coordinator Jan Egeland said Sunda...   

                                                Tags  
0  O O O O O O B-geo O O O O O B-geo O O O O O B-...  
1  B-gpe O O O O O O O O O O O O O O B-tim O O O ...  
2  O O B-tim O O O O O B-geo O O O O O B-org O O ...  
3                              O O O O O O O O O O O  
4  B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...  
<bound method DataFrame.info of             Sentence #                                           Sentence  \
0          Sentence: 1  Thousands of demonstrators have marched throug...   
1         Sentence: 10  Iranian officials say

In [66]:
# Step 4: Tokenizer initialization
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

def tokenize_and_align_labels(sentence, labels):
    tokenized_inputs = tokenizer(
        sentence.split(),
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors="pt"
    )
    word_ids = tokenized_inputs.word_ids()
    aligned_labels = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(tag2id[labels[word_idx]])
        else:
            aligned_labels.append(tag2id[labels[word_idx]] if word_idx < len(labels) else -100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = torch.tensor(aligned_labels, dtype=torch.long)
    return tokenized_inputs

# Tokenize the dataset
tokenized_dataset = [
    tokenize_and_align_labels(sentence, tags.split())
    for sentence, tags in zip(cleaned_data['Sentence'], cleaned_data['Tags'])
]

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset):
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        return len(self.tokenized_dataset)

    def __getitem__(self, idx):
        item = self.tokenized_dataset[idx]
        return {key: val.squeeze() for key, val in item.items()}

dataset = NERDataset(tokenized_dataset)
train_size = int(0.7 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = int(0.2 * len(dataset))

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size]
)


In [67]:
# Step 5: Load the RoBERTa model
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(tag2id))

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    save_total_limit=2
)



In [69]:
# Step 7: Define compute_metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)
    true_labels = labels.flatten()
    true_predictions = predictions.flatten()
    mask = true_labels != -100
    true_labels = true_labels[mask]
    true_predictions = true_predictions[mask]

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average="weighted")
    accuracy = accuracy_score(true_labels, true_predictions)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [70]:
# Step 8: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [71]:
# Step 9: Train and evaluate the model
trainer.train()
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print("Evaluation results:", eval_results)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1862,0.174624,0.951546,0.948969,0.951546,0.949458
2,0.1433,0.161403,0.95451,0.952313,0.95451,0.952812
3,0.1143,0.159422,0.955663,0.953681,0.955663,0.954268


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation results: {'eval_loss': 0.15942157804965973, 'eval_accuracy': 0.9556625910831172, 'eval_precision': 0.9536809810539453, 'eval_recall': 0.9556625910831172, 'eval_f1': 0.9542678999304063, 'eval_runtime': 85.2039, 'eval_samples_per_second': 11.737, 'eval_steps_per_second': 0.739, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:
# Get predictions for the validation dataset
predictions, labels, _ = trainer.predict(val_dataset)

# Convert predictions to tag names
predicted_tags = np.argmax(predictions, axis=2)

# Remove padding tokens using the attention mask
attention_masks = [val_dataset[i]['attention_mask'].numpy() for i in range(len(val_dataset))]
predicted_tags = [[id2tag[pred] for pred, mask in zip(preds, mask) if mask] for preds, mask in zip(predicted_tags, attention_masks)]
true_tags = [[id2tag[label] if label != -100 else 'O' for label, mask in zip(label_seq, mask) if mask] for label_seq, mask in zip(labels, attention_masks)]

# Create a DataFrame with actual tags and predicted tags
eval_df = pd.DataFrame({
    'Sentence': [tokenizer.decode(val_dataset[i]['input_ids'], skip_special_tokens=True) for i in range(len(val_dataset))],
    'Actual Tags': [' '.join(tags) for tags in true_tags],
    'Predicted Tags': [' '.join(tags) for tags in predicted_tags]
})

# Print the DataFrame
print(eval_df)
eval_df.to_csv("eval_df.csv", index=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                              Sentence  \
0     Since the documentary was aired last week , I...   
1     Merck withdrew the popular drug last year aft...   
2     The announcement of the release of the two Ca...   
3     Police said members of the group later threw ...   
4     Syria and Israel last held peace talks in Jan...   
..                                                 ...   
995   The report in the state-run China Daily quote...   
996   Though the government has the tacit support o...   
997   Key domestic issues include immigration and i...   
998   Aircraft also struck a building allegedly use...   
999   The latter was suspended until May 2007 due t...   

                                           Actual Tags  \
0    O O O O O O O O O B-geo O O O O O O B-gpe O O ...   
1    O B-org B-org O O O O O O O O O O O O O O O O ...   
2    O O O O O O O O O B-gpe O O B-gpe B-gpe O O O ...   
3    O O O O O O O O O O O O O O O O O O O O O O O ...   
4            

In [73]:
# Convert the 'Tags' column to a list of lists
actual_tags_list = eval_df['Actual Tags'].apply(lambda x: x.split()).tolist()

# Print the result
print(actual_tags_list)

# Convert the 'Tags' column to a list of lists
pred_tags_list = eval_df['Predicted Tags'].apply(lambda x: x.split()).tolist()

# Print the result
print(pred_tags_list)

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-org', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'B-gpe', 'B-gpe', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'B-org', 'I-org', 'O', 'B-per', 'I-per', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-gpe', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O', 'O'], ['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [74]:
report = classification_report(actual_tags_list, pred_tags_list)
print(report)

              precision    recall  f1-score   support

         art       0.00      0.00      0.00         7
         eve       0.00      0.00      0.00        14
         geo       0.43      0.90      0.58      1246
         gpe       0.92      0.90      0.91       375
         nat       0.00      0.00      0.00         2
         org       0.67      0.59      0.63       617
         per       0.77      0.81      0.79       522
         tim       0.83      0.80      0.81       479

   micro avg       0.58      0.80      0.68      3262
   macro avg       0.45      0.50      0.47      3262
weighted avg       0.64      0.80      0.69      3262



  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Evaluate the model on the test dataset
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Print the evaluation results
print("Test results:", test_results)

# Get predictions for the test dataset
predictions, labels, _ = trainer.predict(test_dataset)

# Convert predictions to tag names
predicted_tags = np.argmax(predictions, axis=2)

# Remove padding tokens using the attention mask
attention_masks = [test_dataset[i]['attention_mask'].numpy() for i in range(len(test_dataset))]
predicted_tags = [[id2tag[pred] for pred, mask in zip(preds, mask) if mask] for preds, mask in zip(predicted_tags, attention_masks)]
true_tags = [[id2tag[label] if label != -100 else 'O' for label, mask in zip(label_seq, mask) if mask] for label_seq, mask in zip(labels, attention_masks)]

# Create a DataFrame with actual tags and predicted tags
test_df = pd.DataFrame({
    'Sentence': [tokenizer.decode(test_dataset[i]['input_ids'], skip_special_tokens=True) for i in range(len(test_dataset))],
    'Actual Tags': [' '.join(tags) for tags in true_tags],
    'Predicted Tags': [' '.join(tags) for tags in predicted_tags]
})

# Print the DataFrame
print(test_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test results: {'eval_loss': 0.15916268527507782, 'eval_accuracy': 0.9544792130510464, 'eval_precision': 0.9519449464041476, 'eval_recall': 0.9544792130510464, 'eval_f1': 0.9528934016313911, 'eval_runtime': 160.5671, 'eval_samples_per_second': 12.456, 'eval_steps_per_second': 0.778, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                               Sentence  \
0      Al Jazeera did not broadcast any sound with t...   
1      During his radio address Saturday , the presi...   
2      The company originally announced plans for th...   
3      The statement says that since the beginning o...   
4      Mr. Talabani was seeking Iranian investment i...   
...                                                 ...   
1995   U.S. military officials in Afghanistan say co...   
1996   Health officials said the small ward will mea...   
1997   Nardiello is hoping to rejoin the team for th...   
1998   Just hours after Thursday 's bombings , a gro...   
1999   Venezuelan President Hugo Chavez says Colombi...   

                                            Actual Tags  \
0               O B-org I-org O O O O O O O O B-tim O O   
1     O O O O O B-tim O O O O O B-gpe O O O O O O O ...   
2                         O O O O O O O O O O B-tim O O   
3     O O O O O O O O O O O O O O O O O O O B-org I-...

In [76]:
test_df.to_csv("test_df.csv", index=False)

In [77]:
# Convert the 'Tags' column to a list of lists
actual_tags_list = test_df['Actual Tags'].apply(lambda x: x.split()).tolist()

# Print the result
print(actual_tags_list)

# Convert the 'Tags' column to a list of lists
pred_tags_list = test_df['Predicted Tags'].apply(lambda x: x.split()).tolist()

# Print the result
print(pred_tags_list)

[['O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'B-geo', 'I-geo', 'O', 'O'], ['O', 'B-per', 'B-per', 'I-per', 'I-per', 'I-per', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'B-tim', 'B-tim', 'B-tim', 'B-tim', 'B-tim', 'I-tim', 'I-tim', 'I-tim', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-org', 'B-org', 'B-org', 'B-org', 'I-org', 'I-org', 'I-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-org', 'B-org', 'I-org', 'I-org', 'I-org', 'I-org', 'O', 'B-org', 'B-org', 'O', 'O

In [78]:
report = classification_report(actual_tags_list, pred_tags_list)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        19
         eve       0.00      0.00      0.00        16
         geo       0.43      0.88      0.58      2419
         gpe       0.91      0.92      0.91       775
         nat       0.00      0.00      0.00        19
         org       0.70      0.61      0.65      1400
         per       0.78      0.79      0.79      1115
         tim       0.81      0.79      0.80       940

   micro avg       0.59      0.80      0.68      6703
   macro avg       0.45      0.50      0.47      6703
weighted avg       0.65      0.80      0.69      6703

