In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from tqdm import tqdm
from datasets import load_dataset
import numpy as np
from seqeval.metrics import classification_report

# Load dataset
dataset = load_dataset("conll2003")
print(dataset)

# Define label mapping (ensure it matches the dataset)
label_list = dataset["train"].features["ner_tags"].feature.names
label_dict = {label: i for i, label in enumerate(label_list)}
label_dict_inverse = {v: k for k, v in label_dict.items()}

print("Label dictionary:", label_dict)

def preprocess_data(data, max_samples=None):
    tokens = []
    labels = []
    
    # Use all data if max_samples is None
    sample_count = len(data) if max_samples is None else min(max_samples, len(data))
    
    for i in range(sample_count):
        item = data[i]
        token = item["tokens"]
        # converting token to lowercase
        token = [t.lower() for t in
                 token]
        
        # We shouldn't remove the stopwords as they are important for NER and change the order of the words and ner_tags
        ner_tags = item["ner_tags"]
        tokens.append(token)
        labels.append(ner_tags)
    
    return tokens, labels

train_tokens, train_labels = preprocess_data(dataset["train"], max_samples=2500)
test_tokens, test_labels = preprocess_data(dataset["test"], max_samples=500)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Label dictionary: {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}


In [2]:
# Load tokenizer and model
num_labels = len(label_dict)
print(f"Number of labels: {num_labels}")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Hyperparameters
max_length = 150
batch_size = 16
learning_rate = 5e-5  # Lower learning rate (standard for BERT fine-tuning)
num_epochs = 20

def encode_tokens_and_labels(tokenizer, tokens_list, labels_list, max_length):
    input_ids_list = []
    attention_mask_list = []
    labels_list_encoded = []
    
    for tokens, labels in zip(tokens_list, labels_list):
        # Skip examples that are too long
        if len(tokens) > max_length - 2:  # Account for [CLS] and [SEP]
            continue
            
        # BERT tokenization may split words into subwords
        # We need to track this to align labels correctly
        word_ids = []
        subwords = []
        subword_ids = []
        subword_attention_mask = []
        
        # Add [CLS] token
        subwords.append("[CLS]")
        subword_ids.append(tokenizer.convert_tokens_to_ids("[CLS]"))
        subword_attention_mask.append(1)
        word_ids.append(None)
        
        # Process each word
        for word_idx, (word, label) in enumerate(zip(tokens, labels)):
            # Tokenize word into subwords
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:  # Handle empty tokenization edge case
                word_tokens = [tokenizer.unk_token]
                
            # Add subwords to lists
            for i, subword in enumerate(word_tokens):
                subwords.append(subword)
                subword_ids.append(tokenizer.convert_tokens_to_ids(subword))
                subword_attention_mask.append(1)
                word_ids.append(word_idx)
        
        # Add [SEP] token
        subwords.append("[SEP]")
        subword_ids.append(tokenizer.convert_tokens_to_ids("[SEP]"))
        subword_attention_mask.append(1)
        word_ids.append(None)
        
        # Pad sequences
        padding_length = max_length - len(subword_ids)
        subword_ids.extend([0] * padding_length)
        subword_attention_mask.extend([0] * padding_length)
        word_ids.extend([None] * padding_length)
        
        # Align labels with subwords
        label_ids = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get -100 label (ignored in loss calculation)
                label_ids.append(-100)
            elif word_idx != prev_word_idx:
                # First subword of a word gets the word's label
                label_ids.append(labels[word_idx])
                prev_word_idx = word_idx
            else:
                # Subsequent subwords of the same word get -100 (ignored)
                label_ids.append(-100)
        
        input_ids_list.append(subword_ids)
        attention_mask_list.append(subword_attention_mask)
        labels_list_encoded.append(label_ids)
    
    return input_ids_list, attention_mask_list, labels_list_encoded

# Encode the data properly
train_inputs, train_masks, train_labels_encoded = encode_tokens_and_labels(
    tokenizer, train_tokens, train_labels, max_length
)
test_inputs, test_masks, test_labels_encoded = encode_tokens_and_labels(
    tokenizer, test_tokens, test_labels, max_length
)

print(f"Processed {len(train_inputs)} training examples")
print(f"Processed {len(test_inputs)} test examples")

# Create tensors and dataloaders
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels_encoded)

test_inputs = torch.tensor(test_inputs)
test_masks = torch.tensor(test_masks)
test_labels = torch.tensor(test_labels_encoded)

# Create datasets and dataloaders
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


Number of labels: 9


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed 2500 training examples
Processed 500 test examples


In [3]:

# Set up training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Use proper optimization settings for BERT
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for batch in progress_bar:
        input_ids, attention_mask, labels = [t.to(device) for t in batch]
        
        # Clear gradients
        model.zero_grad()
        
        # Forward pass - use BERT's built-in loss calculation
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
        # Update progress bar
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} complete. Average training loss: {avg_train_loss:.4f}")

# Save the model
model.save_pretrained('fine_tuned_ner_model')




Using device: cpu


Epoch 1/20: 100%|██████████| 157/157 [12:39<00:00,  4.84s/it, loss=0.0498]


Epoch 1 complete. Average training loss: 0.3050


Epoch 2/20: 100%|██████████| 157/157 [11:11<00:00,  4.28s/it, loss=0.0030]


Epoch 2 complete. Average training loss: 0.0515


Epoch 3/20: 100%|██████████| 157/157 [10:08<00:00,  3.87s/it, loss=0.0739]


Epoch 3 complete. Average training loss: 0.0222


Epoch 4/20: 100%|██████████| 157/157 [10:16<00:00,  3.93s/it, loss=0.0011]


Epoch 4 complete. Average training loss: 0.0201


Epoch 5/20: 100%|██████████| 157/157 [10:31<00:00,  4.02s/it, loss=0.0021]


Epoch 5 complete. Average training loss: 0.0086


Epoch 6/20: 100%|██████████| 157/157 [10:49<00:00,  4.14s/it, loss=0.0010]


Epoch 6 complete. Average training loss: 0.0071


Epoch 7/20: 100%|██████████| 157/157 [10:52<00:00,  4.16s/it, loss=0.0009]


Epoch 7 complete. Average training loss: 0.0059


Epoch 8/20: 100%|██████████| 157/157 [12:26<00:00,  4.76s/it, loss=0.0040]


Epoch 8 complete. Average training loss: 0.0041


Epoch 9/20: 100%|██████████| 157/157 [12:46<00:00,  4.88s/it, loss=0.0008]


Epoch 9 complete. Average training loss: 0.0040


Epoch 10/20: 100%|██████████| 157/157 [12:28<00:00,  4.77s/it, loss=0.0008]


Epoch 10 complete. Average training loss: 0.0033


Epoch 11/20: 100%|██████████| 157/157 [12:51<00:00,  4.92s/it, loss=0.0007]


Epoch 11 complete. Average training loss: 0.0038


Epoch 12/20: 100%|██████████| 157/157 [11:12<00:00,  4.29s/it, loss=0.0006]


Epoch 12 complete. Average training loss: 0.0054


Epoch 13/20: 100%|██████████| 157/157 [13:07<00:00,  5.02s/it, loss=0.0005]


Epoch 13 complete. Average training loss: 0.0055


Epoch 14/20: 100%|██████████| 157/157 [13:18<00:00,  5.09s/it, loss=0.0026]


Epoch 14 complete. Average training loss: 0.0058


Epoch 15/20: 100%|██████████| 157/157 [14:32<00:00,  5.56s/it, loss=0.0006]


Epoch 15 complete. Average training loss: 0.0043


Epoch 16/20: 100%|██████████| 157/157 [14:57<00:00,  5.72s/it, loss=0.0002]


Epoch 16 complete. Average training loss: 0.0057


Epoch 17/20: 100%|██████████| 157/157 [15:59<00:00,  6.11s/it, loss=0.0743]


Epoch 17 complete. Average training loss: 0.0056


Epoch 18/20: 100%|██████████| 157/157 [16:03<00:00,  6.14s/it, loss=0.0005]


Epoch 18 complete. Average training loss: 0.0049


Epoch 19/20: 100%|██████████| 157/157 [16:28<00:00,  6.29s/it, loss=0.2505]


Epoch 19 complete. Average training loss: 0.0074


Epoch 20/20: 100%|██████████| 157/157 [16:30<00:00,  6.31s/it, loss=0.0013]


Epoch 20 complete. Average training loss: 0.0109


In [4]:

# Evaluation
model.eval()
test_loss = 0
all_predictions = []
all_true_labels = []

for batch in tqdm(test_dataloader, desc="Evaluating"):
    input_ids, attention_mask, labels = [t.to(device) for t in batch]
    
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
    
    test_loss += outputs.loss.item()
    
    # Get predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    
    # Move predictions and labels to CPU
    predictions = predictions.detach().cpu().numpy()
    labels = labels.to('cpu').numpy()
    
    # Store predictions and true labels
    for i in range(predictions.shape[0]):
        pred_ids = predictions[i]
        label_ids = labels[i]
        
        # Filter out ignored indices (-100)
        mask = label_ids != -100
        true_label_ids = label_ids[mask]
        pred_label_ids = pred_ids[mask]
        
        all_predictions.append([label_dict_inverse[id] for id in pred_label_ids])
        all_true_labels.append([label_dict_inverse[id] for id in true_label_ids])

# Calculate average test loss
avg_test_loss = test_loss / len(test_dataloader)
print(f"Test loss: {avg_test_loss:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(all_true_labels, all_predictions))

# Print some example predictions
print("\nSample Predictions:")
for i in range(min(5, len(all_predictions))):
    print(f"Example {i+1}:")
    print(f"True: {all_true_labels[i]}")
    print(f"Pred: {all_predictions[i]}")
    print()

Evaluating: 100%|██████████| 32/32 [00:47<00:00,  1.49s/it]

Test loss: 0.2897

Classification Report:
              precision    recall  f1-score   support

         LOC       0.91      0.93      0.92       288
        MISC       0.53      0.80      0.63        65
         ORG       0.89      0.74      0.81       188
         PER       0.96      0.98      0.97       442

   micro avg       0.89      0.91      0.90       983
   macro avg       0.82      0.86      0.83       983
weighted avg       0.90      0.91      0.90       983


Sample Predictions:
Example 1:
True: ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']
Pred: ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O']

Example 2:
True: ['B-PER', 'I-PER']
Pred: ['B-PER', 'I-PER']

Example 3:
True: ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']
Pred: ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']

Example 4:
True: ['B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']




In [32]:
# load the saved model
model = BertForTokenClassification.from_pretrained('fine_tuned_ner_model')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess(text):
    if(isinstance(text, list)):
        text = ' '.join(text)
    tokens = []
    tokens = tokenizer.tokenize(text)
    tokens = [t.lower() for t in tokens]
    return tokens

def predict(text):
    tokens = preprocess(text)
    input_ids = tokenizer.encode(tokens, return_tensors="pt")
    with torch.no_grad():
        outputs = model(input_ids)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    predictions = predictions[:, 1:]
    predicted_label_ids = predictions[0].numpy()
    predicted_labels = [label_dict_inverse[id] for id in predicted_label_ids]
    return list(zip(tokens, predicted_labels))

def display_predictions(predictions):
    # annotate the text with the predicted labels
    annotated_text = ""
    for token, label in predictions:
        if(label=='O'):
            annotated_text += f"{token} "
        else:
            annotated_text += f"{token} ({label}) "
    print(annotated_text)

# Test the model
text = "Anuj is very great developer at the Arist Networks from India"
# text = dataset["train"][0]["tokens"]
predictions = predict(text)
# print(predictions)
display_predictions(predictions)



an (B-PER) ##uj (B-PER) is very great developer at the linux (B-ORG) foundation (I-ORG) from india (B-LOC) 
