# Token Classification

In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric

  from .autonotebook import tqdm as notebook_tqdm


### Loading Dataset

In [2]:
def filter_null_rows(example):
    '''Return True only if neither the 'text' nor the 'label' field of the example is None.'''
    # NOTE(review): this helper is not called anywhere in the visible notebook, and the
    # rest of the notebook reads the 'tokens'/'ner_tags' columns - confirm it is still needed.
    for field in ('text', 'label'):
        if example[field] is None:
            return False
    return True

# Load the dataset and drop the columns this notebook does not use
dataset_path = "eriktks/conll2003"
dataset = (
    load_dataset(dataset_path)
    .remove_columns(['id', 'pos_tags', 'chunk_tags'])
)
# Number of distinct NER tag classes declared by the training split
num_labels = len(dataset["train"].features["ner_tags"].feature.names)

# Bind the train, validation and test splits to their own names
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


### Fine-Tuning

In [3]:
def Training_Tok_Clas(model_name, dataset_path, train, val):
    '''Fine-tune a pre-trained checkpoint for token classification and save the result.

    Args:
        model_name: Identifier of the checkpoint to fine-tune
            (e.g. "google-bert/bert-base-uncased").
        dataset_path: Identifier of the dataset; only used to name the output directories.
        train: Training split with "tokens" and "ner_tags" columns. The number of
            labels is read from its "ner_tags" feature.
        val: Validation split with the same columns, evaluated at the end of each epoch.

    Side effects:
        Trainer output is written to "./results_<model>_<dataset>" and the fine-tuned
        model and tokenizer to "./<model>_<dataset>", where <model> and <dataset> are
        the last path component of `model_name` and `dataset_path`.
    '''

    def set_seed(seed):
        '''Seed NumPy and PyTorch (CPU and CUDA) and request deterministic cuDNN.'''
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    set_seed(123)  # Set the seed before the model is created to ensure reproducibility

    # Use the last path component so identifiers without an organisation prefix
    # (e.g. "bert-base-uncased") no longer raise IndexError.
    model_tag = model_name.rstrip('/').split('/')[-1]
    dataset_tag = dataset_path.rstrip('/').split('/')[-1]
    run_name = f"{model_tag}_{dataset_tag}"

    # Read the label count from the training split itself rather than from a
    # notebook-level global that only exists if another cell was run first.
    label_count = len(train.features["ner_tags"].feature.names)

    # Load the pre-trained tokenizer and model for token classification
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=label_count)

    def tokenize_and_align_labels(examples):
        '''Tokenize pre-split words and expand the word-level tags to token level.'''
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            padding='max_length',
            is_split_into_words=True,
            max_length=100)
        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            # Tokens with no source word (special tokens, padding) get -100, the default
            # ignore_index of PyTorch's cross-entropy loss; every other token takes the
            # tag of the word it was split from.
            label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tok_train = train.map(tokenize_and_align_labels, batched=True)
    tok_val = val.map(tokenize_and_align_labels, batched=True)

    # Define training arguments for the Trainer
    # NOTE(review): recent transformers releases rename `evaluation_strategy` to
    # `eval_strategy` - confirm against the installed version before upgrading.
    training_args = TrainingArguments(
        seed=123,
        data_seed=123,
        output_dir=f"./results_{run_name}",  # Output directory for results
        evaluation_strategy='epoch',  # Evaluate the model at the end of each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01)  # Weight decay for regularization

    # Initialize the Trainer with the model, training arguments, and datasets
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tok_train,
        eval_dataset=tok_val)

    trainer.train()

    # Save the trained model and tokenizer side by side so they can be reloaded together
    save_dir = f"./{run_name}"
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

In [6]:
# Fine-tune SqueezeBERT on the training split, evaluating on the validation split
Training_Tok_Clas(
    model_name="squeezebert/squeezebert-uncased",
    dataset_path="eriktks/conll2003",
    train=train_dataset,
    val=val_dataset)

Some weights of SqueezeBertForTokenClassification were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 19%|█▉        | 500/2634 [01:04<04:31,  7.87it/s]

{'loss': 0.7718, 'grad_norm': 1.0940091609954834, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 33%|███▎      | 877/2634 [01:53<03:45,  7.79it/s]
 33%|███▎      | 879/2634 [02:00<49:40,  1.70s/it]

{'eval_loss': 0.26540929079055786, 'eval_runtime': 6.824, 'eval_samples_per_second': 476.262, 'eval_steps_per_second': 29.895, 'epoch': 1.0}


 38%|███▊      | 1000/2634 [02:16<03:25,  7.95it/s]

{'loss': 0.3335, 'grad_norm': 1.6419051885604858, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 57%|█████▋    | 1500/2634 [03:20<02:24,  7.86it/s]

{'loss': 0.2249, 'grad_norm': 2.707181215286255, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 67%|██████▋   | 1755/2634 [03:54<01:49,  7.99it/s]
 67%|██████▋   | 1757/2634 [04:00<24:15,  1.66s/it]

{'eval_loss': 0.16735686361789703, 'eval_runtime': 6.6649, 'eval_samples_per_second': 487.627, 'eval_steps_per_second': 30.608, 'epoch': 2.0}


 76%|███████▌  | 2000/2634 [04:31<01:18,  8.05it/s]

{'loss': 0.1803, 'grad_norm': 0.8626317977905273, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 95%|█████████▍| 2500/2634 [05:35<00:16,  7.95it/s]

{'loss': 0.1577, 'grad_norm': 1.0971603393554688, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


                                                   
100%|██████████| 2634/2634 [05:59<00:00,  7.33it/s]


{'eval_loss': 0.14918379485607147, 'eval_runtime': 6.6378, 'eval_samples_per_second': 489.618, 'eval_steps_per_second': 30.733, 'epoch': 3.0}
{'train_runtime': 359.4554, 'train_samples_per_second': 117.186, 'train_steps_per_second': 7.328, 'train_loss': 0.32506431660691987, 'epoch': 3.0}


In [5]:
# Fine-tune BERT base (uncased) on the training split, evaluating on the validation split
Training_Tok_Clas(
    model_name="google-bert/bert-base-uncased",
    dataset_path="eriktks/conll2003",
    train=train_dataset,
    val=val_dataset)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 14041/14041 [00:03<00:00, 4423.30 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 3740.93 examples/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
 19%|█▉        | 500/2634 [01:13<05:13,  6.82it/s]

{'loss': 0.2305, 'grad_norm': 2.7367660999298096, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 33%|███▎      | 878/2634 [02:12<03:55,  7.47it/s]
 33%|███▎      | 879/2634 [02:21<1:22:49,  2.83s/it]

{'eval_loss': 0.06300098448991776, 'eval_runtime': 8.9823, 'eval_samples_per_second': 361.821, 'eval_steps_per_second': 22.711, 'epoch': 1.0}


 38%|███▊      | 1000/2634 [02:39<03:57,  6.88it/s] 

{'loss': 0.0763, 'grad_norm': 1.6752288341522217, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 57%|█████▋    | 1500/2634 [04:01<02:46,  6.82it/s]  

{'loss': 0.0461, 'grad_norm': 5.283290386199951, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 67%|██████▋   | 1756/2634 [04:42<01:56,  7.55it/s]
 67%|██████▋   | 1757/2634 [04:51<41:20,  2.83s/it]

{'eval_loss': 0.05373615771532059, 'eval_runtime': 8.9761, 'eval_samples_per_second': 362.073, 'eval_steps_per_second': 22.727, 'epoch': 2.0}


 76%|███████▌  | 2000/2634 [05:27<01:32,  6.87it/s]

{'loss': 0.033, 'grad_norm': 1.7722903490066528, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 95%|█████████▍| 2500/2634 [06:48<00:19,  6.82it/s]

{'loss': 0.0258, 'grad_norm': 0.18871574103832245, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


100%|██████████| 2634/2634 [07:16<00:00,  7.50it/s]
100%|██████████| 2634/2634 [07:25<00:00,  5.91it/s]


{'eval_loss': 0.05526485666632652, 'eval_runtime': 8.978, 'eval_samples_per_second': 361.994, 'eval_steps_per_second': 22.722, 'epoch': 3.0}
{'train_runtime': 445.4917, 'train_samples_per_second': 94.554, 'train_steps_per_second': 5.913, 'train_loss': 0.07933414371609235, 'epoch': 3.0}


### Testing

In [10]:
def Testing_Tok_Clas(model_name, dataset_path, test, use_cpu=True):
    '''Evaluate a locally saved fine-tuned token-classification model on a test split.

    Args:
        model_name: Identifier of the original checkpoint; together with `dataset_path`
            it names the directory "./<model>_<dataset>" the model is loaded from.
        dataset_path: Identifier of the dataset; only used to build that directory name.
        test: Test split with "tokens" and "ner_tags" columns. The label names are
            read from its "ner_tags" feature.
        use_cpu: Forwarded to TrainingArguments. True (the default, matching the
            previous hard-coded value) runs prediction on CPU; pass False to let the
            Trainer use an available accelerator.

    Returns:
        The metrics dictionary produced by `Trainer.predict` for the test split.
    '''

    # Use the last path component so identifiers without an organisation prefix
    # (e.g. "bert-base-uncased") no longer raise IndexError.
    model_tag = model_name.rstrip('/').split('/')[-1]
    dataset_tag = dataset_path.rstrip('/').split('/')[-1]
    model_dir = f"./{model_tag}_{dataset_tag}"

    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def set_seed(seed):
        '''Seed NumPy and PyTorch (CPU and CUDA) and request deterministic cuDNN.'''
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    set_seed(123)  # Set the seed to ensure reproducibility

    def tokenize_and_align_labels(examples):
        '''Tokenize pre-split words and expand the word-level tags to token level.'''
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            padding='max_length',
            is_split_into_words=True,
            max_length=100)
        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            # Tokens with no source word (special tokens, padding) get -100 so that
            # compute_metrics can drop them; other tokens take their word's tag.
            label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tok_test = test.map(tokenize_and_align_labels, batched=True)
    # Read the label names from the split being evaluated rather than from a
    # notebook-level global that only exists if another cell was run first.
    label_list = test.features["ner_tags"].feature.names

    # NOTE(review): `load_metric` is deprecated in `datasets` in favour of the separate
    # `evaluate` package - kept here because switching would add a new dependency.
    metric = load_metric("seqeval")

    def compute_metrics(eval_preds):
        '''Compute overall precision, recall, F1 and accuracy, skipping positions labelled -100.'''
        logits, label_ids = eval_preds

        # Highest-scoring class per token
        pred_ids = np.argmax(logits, axis=2)

        predictions = []
        true_labels = []
        for pred_row, label_row in zip(pred_ids, label_ids):
            # Keep only positions that carry a real label
            kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
            predictions.append([label_list[p] for p, _ in kept])
            true_labels.append([label_list[l] for _, l in kept])

        results = metric.compute(predictions=predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"]}

    # Define the arguments for the Trainer (used for prediction only)
    testing_args = TrainingArguments(
        seed=123,
        data_seed=123,
        output_dir="./results",  # Output directory for results
        use_cpu=use_cpu)

    trainer = Trainer(
        args=testing_args,
        model=model,
        eval_dataset=tok_test,
        compute_metrics=compute_metrics)

    # `predict` returns (predictions, label_ids, metrics); only the metrics are reported
    return trainer.predict(tok_test).metrics

In [11]:
# Report test-split metrics for the fine-tuned SqueezeBERT model
Testing_Tok_Clas(
    model_name="squeezebert/squeezebert-uncased",
    dataset_path="eriktks/conll2003",
    test=test_dataset)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 432/432 [02:52<00:00,  2.50it/s]


{'test_loss': 0.17675328254699707,
 'test_precision': 0.8467713787085515,
 'test_recall': 0.8655012486621477,
 'test_f1': 0.8560338743824982,
 'test_accuracy': 0.9674151670728967,
 'test_runtime': 172.9217,
 'test_samples_per_second': 19.969,
 'test_steps_per_second': 2.498}

In [12]:
# Report test-split metrics for the fine-tuned BERT base (uncased) model
Testing_Tok_Clas(
    model_name="google-bert/bert-base-uncased",
    dataset_path="eriktks/conll2003",
    test=test_dataset)

Map: 100%|██████████| 3453/3453 [00:00<00:00, 4689.11 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 432/432 [04:59<00:00,  1.44it/s]


{'test_loss': 0.12570184469223022,
 'test_precision': 0.8886158886158886,
 'test_recall': 0.9031989535022,
 'test_f1': 0.8958480773767398,
 'test_accuracy': 0.9756261992428564,
 'test_runtime': 300.034,
 'test_samples_per_second': 11.509,
 'test_steps_per_second': 1.44}