# Sequence Classification

In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric

### Dataset Loading

In [3]:
def filter_null_rows(example):
    '''Return True when both the 'text' and 'label' fields of the example are not None.

    Used as a filter predicate: examples for which this returns False are dropped.
    '''
    required_fields = ('text', 'label')
    return all(example[field] is not None for field in required_fields)

def map_labels(example, label_map=None):
    '''Replace the example's 'label' value with its integer class id.

    Args:
        example: Mapping with a 'label' key; it is modified in place.
        label_map: Optional precomputed {label: id} mapping. When omitted, the
            mapping is derived from the module-level `dataset['label']` column.

    Returns:
        The same `example` object, with 'label' replaced by its integer id.

    Raises:
        KeyError: if the example's label is not present in the mapping.
    '''
    if label_map is None:
        # Sort the distinct labels so the ids are identical on every run: the iteration
        # order of a set of strings changes between interpreter sessions (hash
        # randomization), which would silently reshuffle class ids between the kernel
        # that trained a model and the kernel that evaluates it.
        # None is excluded so that a null label never occupies an id (and so that
        # sorting does not have to compare None with strings).
        distinct_labels = sorted(
            label for label in set(dataset['label']) if label is not None
        )
        label_map = {label: index for index, label in enumerate(distinct_labels)}
    example['label'] = label_map[example['label']]
    return example

# Identifier handed to load_dataset for the news-categorization corpus.
dataset_path = "valurank/News_Articles_Categorization"

# Only the 'train' split is loaded; its columns are renamed to the 'text' / 'label'
# names used by the helper functions and the tokenization step.
dataset = (
    load_dataset(dataset_path)['train']
    .rename_column("Text", "text")
    .rename_column("Category", "label")
)
# filter + map are kept in one expression on purpose: map_labels reads the global
# `dataset`, and that name is only rebound once the whole chain has finished.
dataset = dataset.filter(filter_null_rows).map(map_labels)

# Number of distinct class ids present after label mapping.
num_labels = np.unique(dataset['label']).size

# First split: 80% for training, 20% held out.
train_val_test_split = dataset.train_test_split(test_size=0.2, seed=123)
train_dataset, temp_dataset = train_val_test_split['train'], train_val_test_split['test']

# Second split: the held-out part is halved into validation and test sets.
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=123)
val_dataset, test_dataset = val_test_split['train'], val_test_split['test']

### Fine-Tuning

In [6]:
def Training_Seq_Clas(model_name, dataset_path, train, val):
    """Fine-tune a sequence-classification model and save it next to the notebook.

    The tokenizer and ``AutoModelForSequenceClassification`` weights for
    ``model_name`` are loaded, ``train`` and ``val`` are tokenized (padded /
    truncated to 300 tokens), and the model is trained for 3 epochs with an
    evaluation pass on ``val`` after every epoch. The fine-tuned model and its
    tokenizer are written to ``./<model>_<dataset>``; Trainer checkpoints go to
    ``./results_<model>_<dataset>``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``
            (e.g. ``"org/name"``); its short name is used in directory names.
        dataset_path: Dataset identifier; only used to build directory names.
        train: Dataset with a ``'text'`` column, used for training
            (presumably also carrying the ``'label'`` column built earlier).
        val: Dataset with a ``'text'`` column, used for per-epoch evaluation.

    Returns:
        None. The results are the files written to disk.

    Note:
        Reads the module-level ``num_labels`` to size the classification head.
    """
    import dataclasses

    def set_seed(seed):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade cuDNN autotuning for repeatable kernels.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def short_name(identifier):
        # "org/name" -> "name". An identifier without "/" is used as-is; the previous
        # ``split('/')[1]`` raised IndexError for such names.
        parts = identifier.split('/')
        return parts[1] if len(parts) > 1 else parts[0]

    set_seed(123)

    # Built once so the checkpoint directory and the final save directory cannot drift apart.
    run_name = f"{short_name(model_name)}_{short_name(dataset_path)}"
    save_dir = f"./{run_name}"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def tokenizer_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=300)

    tok_train = train.map(tokenizer_function, batched=True)
    tok_val = val.map(tokenizer_function, batched=True)

    # Newer transformers releases renamed ``evaluation_strategy`` to ``eval_strategy``;
    # use whichever field the installed TrainingArguments actually declares.
    argument_names = {field.name for field in dataclasses.fields(TrainingArguments)}
    strategy_key = 'eval_strategy' if 'eval_strategy' in argument_names else 'evaluation_strategy'

    training_args = TrainingArguments(
        seed=123,
        data_seed=123,
        output_dir=f"./results_{run_name}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        **{strategy_key: 'epoch'})

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tok_train,
        eval_dataset=tok_val)

    trainer.train()

    # Save model and tokenizer together so the directory can be reloaded with from_pretrained.
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

In [12]:
# Fine-tune SqueezeBERT on the training split, evaluating on the validation split each epoch.
Training_Seq_Clas(
    model_name="squeezebert/squeezebert-uncased",
    dataset_path="valurank/News_Articles_Categorization",
    train=train_dataset,
    val=val_dataset,
)

Some weights of SqueezeBertForSequenceClassification were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 187/561 [01:11<02:02,  3.05it/s]
 33%|███▎      | 187/561 [01:14<02:02,  3.05it/s]

{'eval_loss': 0.7639732956886292, 'eval_runtime': 2.2966, 'eval_samples_per_second': 161.981, 'eval_steps_per_second': 10.45, 'epoch': 1.0}


 67%|██████▋   | 374/561 [02:27<00:56,  3.34it/s]
 67%|██████▋   | 374/561 [02:29<00:56,  3.34it/s]

{'eval_loss': 0.43972089886665344, 'eval_runtime': 2.3062, 'eval_samples_per_second': 161.304, 'eval_steps_per_second': 10.407, 'epoch': 2.0}


 89%|████████▉ | 500/561 [03:15<00:22,  2.76it/s]

{'loss': 0.812, 'grad_norm': 3.6935312747955322, 'learning_rate': 2.1746880570409984e-06, 'epoch': 2.67}


100%|██████████| 561/561 [03:38<00:00,  3.44it/s]
100%|██████████| 561/561 [03:40<00:00,  2.54it/s]


{'eval_loss': 0.3686646521091461, 'eval_runtime': 2.215, 'eval_samples_per_second': 167.943, 'eval_steps_per_second': 10.835, 'epoch': 3.0}
{'train_runtime': 220.5584, 'train_samples_per_second': 40.493, 'train_steps_per_second': 2.544, 'train_loss': 0.7600495615532479, 'epoch': 3.0}


In [8]:
# Fine-tune BERT-base (uncased) with the same data splits, for comparison with SqueezeBERT.
Training_Seq_Clas(
    model_name="google-bert/bert-base-uncased",
    dataset_path="valurank/News_Articles_Categorization",
    train=train_dataset,
    val=val_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2977/2977 [00:04<00:00, 735.49 examples/s]
Map: 100%|██████████| 372/372 [00:00<00:00, 677.93 examples/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                 
 33%|███▎      | 187/561 [01:17<01:55,  3.23it/s]

{'eval_loss': 0.3143351376056671, 'eval_runtime': 2.844, 'eval_samples_per_second': 130.803, 'eval_steps_per_second': 8.439, 'epoch': 1.0}


                                                 
 67%|██████▋   | 374/561 [02:34<00:58,  3.20it/s]

{'eval_loss': 0.21037505567073822, 'eval_runtime': 2.8678, 'eval_samples_per_second': 129.717, 'eval_steps_per_second': 8.369, 'epoch': 2.0}


 89%|████████▉ | 500/561 [03:25<00:24,  2.48it/s]

{'loss': 0.4318, 'grad_norm': 0.5544067025184631, 'learning_rate': 2.1746880570409984e-06, 'epoch': 2.67}


                                                 
100%|██████████| 561/561 [03:55<00:00,  2.39it/s]


{'eval_loss': 0.17613372206687927, 'eval_runtime': 2.861, 'eval_samples_per_second': 130.026, 'eval_steps_per_second': 8.389, 'epoch': 3.0}
{'train_runtime': 235.0752, 'train_samples_per_second': 37.992, 'train_steps_per_second': 2.386, 'train_loss': 0.39512487401299295, 'epoch': 3.0}


### Testing

In [13]:
def Testing_Seq_Clas(model_name, dataset_path, test):
    """Evaluate a previously fine-tuned model on a test set.

    The model and tokenizer are reloaded from ``./<model>_<dataset>`` (the
    directory naming used when the fine-tuned model was saved), ``test`` is
    tokenized (padded / truncated to 300 tokens), and predictions are computed
    on CPU (``use_cpu=True``).

    Args:
        model_name: Identifier of the base checkpoint that was fine-tuned; only
            its short name is used, to locate the saved directory.
        dataset_path: Dataset identifier; only used to locate the saved directory.
        test: Dataset with a ``'text'`` column (and the labels to score against).

    Returns:
        The metrics dict produced by ``Trainer.predict``: loss, runtime figures,
        and accuracy plus weighted F1 / precision / recall.

    Note:
        Reads the module-level ``num_labels`` when loading the model.
    """
    np.random.seed(123)
    torch.manual_seed(123)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(123)

    def short_name(identifier):
        # "org/name" -> "name". An identifier without "/" is used as-is; the previous
        # ``split('/')[1]`` raised IndexError for such names.
        parts = identifier.split('/')
        return parts[1] if len(parts) > 1 else parts[0]

    saved_dir = f"./{short_name(model_name)}_{short_name(dataset_path)}"

    model = AutoModelForSequenceClassification.from_pretrained(saved_dir, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(saved_dir)

    def tokenizer_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=300)

    tok_test = test.map(tokenizer_function, batched=True)

    # Load the metrics up front: a missing or unavailable metric then fails immediately
    # instead of after the full (slow, CPU-only) prediction pass.
    # NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour
    # of the separate `evaluate` package - confirm against the pinned version.
    metrics = {
        name: load_metric(name, trust_remote_code=True)
        for name in ("accuracy", "f1", "precision", "recall")}

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        scores = {
            "accuracy": metrics["accuracy"].compute(
                predictions=predictions, references=labels)["accuracy"]}
        # Per-class scores are combined with averaging weighted by class support.
        for name in ("f1", "precision", "recall"):
            scores[name] = metrics[name].compute(
                predictions=predictions, references=labels, average="weighted")[name]
        return scores

    testing_args = TrainingArguments(
        seed=123,
        data_seed=123,
        output_dir="./results",
        use_cpu=True)

    trainer = Trainer(
        args=testing_args,
        model=model,
        eval_dataset=tok_test,
        compute_metrics=compute_metrics)

    # predict() returns (predictions, label_ids, metrics); only the metrics are reported.
    return trainer.predict(tok_test).metrics

In [14]:
# Score the fine-tuned SqueezeBERT model on the held-out test split.
Testing_Seq_Clas(
    model_name="squeezebert/squeezebert-uncased",
    dataset_path="valurank/News_Articles_Categorization",
    test=test_dataset,
)

100%|██████████| 47/47 [01:01<00:00,  1.31s/it]


{'test_loss': 0.3123633563518524,
 'test_accuracy': 0.9463806970509383,
 'test_f1': 0.947022070532522,
 'test_precision': 0.9489325271094708,
 'test_recall': 0.9463806970509383,
 'test_runtime': 62.6655,
 'test_samples_per_second': 5.952,
 'test_steps_per_second': 0.75}

In [16]:
# Score the fine-tuned BERT-base model on the same held-out test split.
Testing_Seq_Clas(
    model_name="google-bert/bert-base-uncased",
    dataset_path="valurank/News_Articles_Categorization",
    test=test_dataset,
)

100%|██████████| 47/47 [01:37<00:00,  2.08s/it]


{'test_loss': 0.12301452457904816,
 'test_accuracy': 0.9705093833780161,
 'test_f1': 0.9704613911860033,
 'test_precision': 0.9713580432079093,
 'test_recall': 0.9705093833780161,
 'test_runtime': 99.3161,
 'test_samples_per_second': 3.756,
 'test_steps_per_second': 0.473}