In [26]:
import pandas as pd

path="/data/home/arunkumar12/.cache/kagglehub/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/versions/3"
data=pd.read_csv(f'{path}/ner.csv')

In [27]:
import ast

data['POS'] = data['POS'].apply(ast.literal_eval)
data['Tag'] = data['Tag'].apply(ast.literal_eval)

unique_tags = set(tag for doc in data['Tag'] for tag in doc)

tag_to_id = {tag: i for i, tag in enumerate(unique_tags)}
id_to_tag = {i: tag for tag, i in tag_to_id.items()}

print(f"Tag to ID mapping: {tag_to_id}")
data['Tag'] = data['Tag'].apply(lambda x: [tag_to_id[tag] for tag in x])

for i, row in data.iterrows():
    if not (len(row['Sentence'].split()) == len(row['POS']) == len(row['Tag'])):
        data.drop(i,inplace=True)
for i, row in data.iterrows():
    assert len(row['Sentence'].split()) == len(row['POS']) == len(row['Tag']), f"Row {i} has mismatch in lengths."



Tag to ID mapping: {'B-tim': 0, 'I-tim': 1, 'I-geo': 2, 'B-gpe': 3, 'B-org': 4, 'I-gpe': 5, 'I-eve': 6, 'I-art': 7, 'I-per': 8, 'B-per': 9, 'I-org': 10, 'B-art': 11, 'B-geo': 12, 'O': 13, 'I-nat': 14, 'B-eve': 15, 'B-nat': 16}


In [28]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

data['Sentence'] = data['Sentence'].apply(lambda x: x.split())
def tokenize_and_align_labels(sentences, labels):
    print(f"Number of sentences: {len(sentences)}")
    print(f"Number of label sets: {len(labels)}")
    tokenized_inputs = tokenizer(sentences, truncation=True, is_split_into_words=True, return_offsets_mapping=True, padding=True)
    
    print(f"Tokenized inputs keys: {tokenized_inputs.keys()}")

    offset_mappings = tokenized_inputs.pop("offset_mapping")
    aligned_labels = []
    for i, offset_mapping in enumerate(offset_mappings):
        label_ids = []
        label = labels[i]
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  
            else:
                label_ids.append(-100)  
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs


sentences = data['Sentence'].tolist()
labels = data['Tag'].tolist()
tokenized_data = tokenize_and_align_labels(sentences, labels)

Number of sentences: 47955
Number of label sets: 47955
Tokenized inputs keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])


In [29]:
# Dividing the dataset into train and test

from datasets import Dataset

dataset = Dataset.from_dict(tokenized_data)
train_test_split = dataset.train_test_split(test_size=0.2)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [30]:
# Training the model for finetune

from transformers import BertForTokenClassification, Trainer, TrainingArguments
import torch

num_labels = len(set([label for sublist in data['Tag'].tolist() for label in sublist]))


model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)


device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)


training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,
    run_name='ner_experiment_1',
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    evaluation_strategy="epoch",   
    save_strategy="epoch",            # Save at the end of each epoch
    save_total_limit=1    
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  
)


trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.1191,0.112916
2,0.0969,0.090599
3,0.0767,0.085683
4,0.0595,0.089177
5,0.0446,0.09374
6,0.0379,0.10404
7,0.0288,0.110552
8,0.0216,0.120481
9,0.016,0.126349
10,0.0133,0.130697




TrainOutput(global_step=3000, training_loss=0.10264405244092147, metrics={'train_runtime': 1690.4703, 'train_samples_per_second': 226.943, 'train_steps_per_second': 1.775, 'total_flos': 3.054720161911392e+16, 'train_loss': 0.10264405244092147, 'epoch': 10.0})

In [31]:
trainer.evaluate()




{'eval_loss': 0.13069701194763184,
 'eval_runtime': 13.5518,
 'eval_samples_per_second': 707.73,
 'eval_steps_per_second': 1.402,
 'epoch': 10.0}

In [32]:
# Save the model and tokenizer
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [34]:
# Testing the model

from transformers import BertTokenizerFast, BertForTokenClassification
import torch

# Load your fine-tuned model and tokenizer
model_path = './results'  # Path where your fine-tuned model is saved
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)

# Set model to evaluation mode
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()




BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [35]:
import torch
import numpy as np

def ner_inference(sentence):
    # Tokenize input sentence and prepare input tensor
    inputs = tokenizer(sentence.split(), return_tensors="pt", is_split_into_words=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs).logits
    
    # Get predicted label IDs and convert them to tag names
    predictions = torch.argmax(outputs, dim=2)
    predicted_labels = predictions[0].cpu().numpy()
    
    # Convert IDs to tags using id_to_tag mapping
    tokens = inputs['input_ids'][0].cpu().numpy()
    tags = [id_to_tag[id] for id in predicted_labels]
    
    # Combine tokens with their predicted tags
    results = []
    for token_id, tag in zip(tokens, tags):
        token = tokenizer.decode([token_id]).strip()
        if token not in ["[CLS]", "[SEP]"]:  # Skip special tokens
            results.append((token, tag))
    
    return results


In [39]:
sentence = "Nothing is there"
results = ner_inference(sentence)

print("NER Results:")
for token, tag in results:
    print(f"{token}: {tag}")


NER Results:
Nothing: O
is: O
there: O
