In [10]:
import pandas as pd

path="/data/home/arunkumar12/.cache/kagglehub/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/versions/3"
data=pd.read_csv(f'{path}/ner.csv')

In [11]:
import ast

data['POS'] = data['POS'].apply(ast.literal_eval)
data['Tag'] = data['Tag'].apply(ast.literal_eval)

unique_tags = set(tag for doc in data['Tag'] for tag in doc)

tag_to_id = {tag: i for i, tag in enumerate(unique_tags)}
id_to_tag = {i: tag for tag, i in tag_to_id.items()}

print(f"Tag to ID mapping: {tag_to_id}")
data['Tag'] = data['Tag'].apply(lambda x: [tag_to_id[tag] for tag in x])

for i, row in data.iterrows():
    if not (len(row['Sentence'].split()) == len(row['POS']) == len(row['Tag'])):
        data.drop(i,inplace=True)
for i, row in data.iterrows():
    assert len(row['Sentence'].split()) == len(row['POS']) == len(row['Tag']), f"Row {i} has mismatch in lengths."



Tag to ID mapping: {'I-per': 0, 'B-gpe': 1, 'I-tim': 2, 'I-eve': 3, 'I-org': 4, 'B-org': 5, 'I-art': 6, 'B-eve': 7, 'I-gpe': 8, 'B-per': 9, 'I-geo': 10, 'B-geo': 11, 'B-nat': 12, 'O': 13, 'B-tim': 14, 'B-art': 15, 'I-nat': 16}


In [12]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

data['Sentence'] = data['Sentence'].apply(lambda x: x.split())
def tokenize_and_align_labels(sentences, labels):
    print(f"Number of sentences: {len(sentences)}")
    print(f"Number of label sets: {len(labels)}")
    tokenized_inputs = tokenizer(sentences, truncation=True, is_split_into_words=True, return_offsets_mapping=True, padding=True)
    
    print(f"Tokenized inputs keys: {tokenized_inputs.keys()}")

    offset_mappings = tokenized_inputs.pop("offset_mapping")
    aligned_labels = []
    for i, offset_mapping in enumerate(offset_mappings):
        label_ids = []
        label = labels[i]
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  
            else:
                label_ids.append(-100)  
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs


sentences = data['Sentence'].tolist()
labels = data['Tag'].tolist()
tokenized_data = tokenize_and_align_labels(sentences, labels)

Number of sentences: 47955
Number of label sets: 47955
Tokenized inputs keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])


In [13]:
# Dividing the dataset into train and test

from datasets import Dataset

dataset = Dataset.from_dict(tokenized_data)
train_test_split = dataset.train_test_split(test_size=0.2)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [15]:
# Training the model for finetune

from transformers import BertForTokenClassification, Trainer, TrainingArguments
import torch

num_labels = len(set([label for sublist in data['Tag'].tolist() for label in sublist]))


model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)


device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)


training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,
    run_name='ner_experiment_1',
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    evaluation_strategy="epoch",     
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  
)


trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.1208,0.112488
2,0.0983,0.09024
3,0.0785,0.08647
4,0.0608,0.088008
5,0.0431,0.09417
6,0.0363,0.102176
7,0.0295,0.111751
8,0.0218,0.11857
9,0.0163,0.125922
10,0.013,0.130378




TrainOutput(global_step=3000, training_loss=0.10014132274935643, metrics={'train_runtime': 1791.7219, 'train_samples_per_second': 214.118, 'train_steps_per_second': 1.674, 'total_flos': 3.054720161911392e+16, 'train_loss': 0.10014132274935643, 'epoch': 10.0})

In [None]:
trainer.evaluate()


{'eval_loss': 0.08916840702295303,
 'eval_runtime': 6.2407,
 'eval_samples_per_second': 768.5,
 'eval_steps_per_second': 1.602,
 'epoch': 3.0}