# Transformers Text Classification

In [1]:
import json
import os

import evaluate
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, Trainer, TrainingArguments, DataCollatorWithPadding, set_seed, EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device once for the whole notebook: the GPU when PyTorch
# can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Dataset

In [3]:
# Root folder that holds the CSV splits and the paragraph text files.
# The original absolute path is kept as the default, but it can be overridden
# through the HW1_DIR environment variable so the notebook also runs on other
# machines. `os.path.join(..., "")` guarantees the trailing separator that the
# `dir_path + filename` concatenations throughout the notebook rely on.
dir_path = os.path.join(os.environ.get("HW1_DIR", "/mnt/c/Users/fede6/Desktop/HW1/"), "")
train_path = "train.csv"
dev_path = "valid.csv"

# Training and development splits.
train_df = pd.read_csv(dir_path + train_path, encoding='utf-8')
dev_df = pd.read_csv(dir_path + dev_path, encoding='utf-8')

In [4]:
def save_txt(filename, path, txt):
    """Write `txt` as indented, UTF-8 encoded JSON to the file `path + filename`.

    `path` and `filename` are joined by plain string concatenation, so `path`
    must already end with a separator. Non-ASCII characters are written as-is
    rather than escaped. Returns None.
    """
    target = path + filename
    with open(target, mode='w', encoding='utf-8') as handle:
        json.dump(txt, handle, indent=2, ensure_ascii=False)

def load_txt(filename, path):
    """Read the UTF-8 JSON file at `path + filename` and return the parsed object.

    `path` and `filename` are joined by plain string concatenation, so `path`
    must already end with a separator.
    """
    source = path + filename
    with open(source, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Load the JSON-encoded paragraph texts for both splits (train first, then dev)
# and attach them to the matching DataFrame as a new 'paragraph' column.
train_txt, valid_txt = (
    load_txt(filename=txt_file, path=dir_path)
    for txt_file in ("train_txts.txt", "dev_txts.txt")
)

train_df['paragraph'] = train_txt
dev_df['paragraph'] = valid_txt

In [5]:
# Mapping from the dataset's string labels to the integer class ids fed to the model.
mapper = {
    'cultural agnostic':       2,
    'cultural representative': 1,
    'cultural exclusive':      0
}

class PLMDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping from feature name (e.g. "input_ids") to a sequence
            indexable by example position, as returned by a tokenizer call.
        labels: iterable of string labels; each one must be a key of `mapper`
            (an unknown label raises KeyError).

    Items are returned as CPU tensors. Device placement is deliberately left to
    the consumer (the Hugging Face Trainer moves each collated batch to its own
    device), which avoids one host-to-device copy per feature per example and
    removes the dependency on a notebook-level `device` variable.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Convert the string labels to class ids once, up front.
        self.labels = [mapper[label] for label in labels]

    def __getitem__(self, idx):
        """Return the features of example `idx` plus its label under the key "labels"."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

### Metrics

In [6]:
# Metric objects already loaded through `evaluate.load`, keyed by metric name.
_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRICS:
        _METRICS[name] = evaluate.load(name)
    return _METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`, unpackable like the prediction
            object the Trainer passes to `compute_metrics`.

    Returns:
        dict with the keys "accuracy" and "f1".

    The metric objects are cached instead of being re-loaded on every call,
    because this function runs at every evaluation step.
    """
    logits, labels = eval_pred
    # Some models return several outputs as a tuple; following the official
    # Transformers examples, the first element is taken as the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="macro")["f1"]
    return {"accuracy": accuracy, "f1": f1}

## Model initialization

In [7]:
def model_init(model_name, n_classes=3, padding=True, truncation=True):
    """Load a sequence-classification model with its tokenizer and padding collator.

    Args:
        model_name: identifier or path handed to `from_pretrained` for both the
            model and the tokenizer.
        n_classes: size of the classification head (`num_labels`).
        padding, truncation: accepted but not used by this function.

    Returns:
        `(model, tokenizer, data_collator)`; the model has already been moved
        to the notebook-level `device`.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=n_classes,
        output_attentions=False,
        output_hidden_states=False,
        ignore_mismatched_sizes=True,
    )
    classifier = classifier.to(device)

    tok = AutoTokenizer.from_pretrained(model_name)
    return classifier, tok, DataCollatorWithPadding(tokenizer=tok)

def tokenization(df, tokenizer):
    """Tokenize the 'paragraph' column of `df` in a single tokenizer call.

    The column is converted to a plain list and passed to `tokenizer` with
    `padding=True` and `truncation=True`; whatever the tokenizer returns is
    returned unchanged.
    """
    paragraphs = df["paragraph"].to_list()
    encoded = tokenizer(paragraphs, truncation=True, padding=True)
    return encoded

### DistilBERT

In [9]:
# Checkpoint fine-tuned in this section; the same string is later used to
# build the Trainer's output directory.
model_name = "distilbert-base-uncased"

In [10]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
# before the model for this section is loaded.
torch.cuda.empty_cache()

In [11]:
# Load the model, its tokenizer and the padding collator for `model_name`.
# NOTE(review): in document order SEED / set_seed(SEED) only appear in a later
# cell, so on a fresh "Run All" the newly initialised classifier weights are
# created before any seeding -- confirm whether the seed cell should move up.
model, tokenizer, data_coll = model_init(model_name)

# Encode both splits (train first, then dev) with this model's tokenizer.
tokenized_trainset, tokenized_devset = (
    tokenization(split_df, tokenizer=tokenizer) for split_df in (train_df, dev_df)
)

# Wrap encodings and string labels into torch datasets for the Trainer.
train_dataset = PLMDataset(tokenized_trainset, train_df['label'])
val_dataset = PLMDataset(tokenized_devset, dev_df['label'])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# --- Reproducibility ---
SEED = 42
set_seed(SEED)

# --- Optimisation ---
N_EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 0.01

# Learning-rate warm-up length in optimizer steps. The training cells below
# also reuse this value as the evaluation / checkpoint / logging interval.
WARMUP_STEPS = 391

In [13]:
# Training configuration for the current `model_name`:
#   * evaluate, log and checkpoint every WARMUP_STEPS optimizer steps;
#   * select checkpoints by eval_loss (lower is better) and reload the best one
#     when training ends;
#   * checkpoints are written under "<dir_path><model_name>_res/results".
training_args = TrainingArguments(
    # Forward the notebook-level seed. TrainingArguments has its own `seed`
    # (default 42) that is set at the beginning of training, so without this
    # argument editing SEED would not change the training run.
    seed=SEED,
    report_to="none",
    # Schedule and optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluation, logging and checkpointing share the same step interval.
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    logging_steps=WARMUP_STEPS,
    logging_dir=dir_path+"logs",
    # Best-checkpoint selection.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    output_dir=dir_path + model_name + "_res/results"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_coll,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    # Stop once eval_loss has failed to improve for 4 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

In [14]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8745,0.61244,0.73,0.709178
782,0.5554,0.541131,0.77,0.755749
1173,0.4755,0.532408,0.796667,0.785511
1564,0.4506,0.599164,0.756667,0.740003
1955,0.358,0.705475,0.763333,0.753761
2346,0.351,0.796741,0.746667,0.732461
2737,0.2766,0.87894,0.753333,0.739275


Downloading builder script: 100%|█████████████████████████████████████████| 4.20k/4.20k [00:00<00:00, 4.58MB/s]
Downloading builder script: 100%|█████████████████████████████████████████| 6.79k/6.79k [00:00<00:00, 15.9MB/s]


TrainOutput(global_step=2737, training_loss=0.4773688464431289, metrics={'train_runtime': 1009.3288, 'train_samples_per_second': 30.966, 'train_steps_per_second': 3.874, 'total_flos': 2898570840966144.0, 'train_loss': 0.4773688464431289, 'epoch': 3.5})

In [15]:
# Evaluate on the Trainer's eval_dataset (val_dataset). Because
# load_best_model_at_end=True, this scores the checkpoint with the lowest
# eval_loss rather than the last training step.
trainer.evaluate()

{'eval_loss': 0.5324076414108276,
 'eval_accuracy': 0.7966666666666666,
 'eval_f1': 0.7855108834189486,
 'eval_runtime': 6.1237,
 'eval_samples_per_second': 48.99,
 'eval_steps_per_second': 6.205,
 'epoch': 3.5}

### BERT

In [20]:
# Checkpoint fine-tuned in this section; the same string is later used to
# build the Trainer's output directory.
model_name = "bert-base-uncased"

In [21]:
# Re-seed before instantiating the model: its classification head is newly
# (randomly) initialised, so without this the starting weights would depend on
# how many random numbers earlier cells happened to consume.
set_seed(SEED)
model, tokenizer, data_coll = model_init(model_name)

# Encode both splits with this model's own tokenizer.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
tokenized_devset = tokenization(dev_df, tokenizer=tokenizer)

# Wrap encodings and string labels into torch datasets for the Trainer.
train_dataset = PLMDataset(tokenized_trainset, train_df['label'])
val_dataset = PLMDataset(tokenized_devset, dev_df['label'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Training configuration for the current `model_name`:
#   * evaluate, log and checkpoint every WARMUP_STEPS optimizer steps;
#   * select checkpoints by eval_loss (lower is better) and reload the best one
#     when training ends;
#   * checkpoints are written under "<dir_path><model_name>_res/results".
training_args = TrainingArguments(
    # Forward the notebook-level seed. TrainingArguments has its own `seed`
    # (default 42) that is set at the beginning of training, so without this
    # argument editing SEED would not change the training run.
    seed=SEED,
    report_to="none",
    # Schedule and optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluation, logging and checkpointing share the same step interval.
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    logging_steps=WARMUP_STEPS,
    logging_dir=dir_path+"logs",
    # Best-checkpoint selection.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    output_dir=dir_path + model_name + "_res/results"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_coll,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    # Stop once eval_loss has failed to improve for 4 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

In [23]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8831,0.64064,0.716667,0.696475
782,0.5553,0.553624,0.776667,0.761861
1173,0.4579,0.552142,0.766667,0.753139
1564,0.4482,0.628857,0.753333,0.736799
1955,0.3204,0.673191,0.79,0.776473
2346,0.3245,0.78885,0.753333,0.739483
2737,0.2121,0.905424,0.776667,0.761111


TrainOutput(global_step=2737, training_loss=0.4573457471898406, metrics={'train_runtime': 1946.8589, 'train_samples_per_second': 16.054, 'train_steps_per_second': 2.008, 'total_flos': 5757184693306368.0, 'train_loss': 0.4573457471898406, 'epoch': 3.5})

In [24]:
# Evaluate on the Trainer's eval_dataset (val_dataset). Because
# load_best_model_at_end=True, this scores the checkpoint with the lowest
# eval_loss rather than the last training step.
trainer.evaluate()

{'eval_loss': 0.5521418452262878,
 'eval_accuracy': 0.7666666666666667,
 'eval_f1': 0.7531392880965376,
 'eval_runtime': 9.9522,
 'eval_samples_per_second': 30.144,
 'eval_steps_per_second': 3.818,
 'epoch': 3.5}

### RoBERTa

In [25]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
# before the model for this section is loaded.
torch.cuda.empty_cache()

In [26]:
# Checkpoint fine-tuned in this section; the same string is later used to
# build the Trainer's output directory.
model_name = "roberta-base"

In [27]:
# Re-seed before instantiating the model: its classification head is newly
# (randomly) initialised, so without this the starting weights would depend on
# how many random numbers earlier cells happened to consume.
set_seed(SEED)
model, tokenizer, data_coll = model_init(model_name)

# Encode both splits with this model's own tokenizer.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
tokenized_devset = tokenization(dev_df, tokenizer=tokenizer)

# Wrap encodings and string labels into torch datasets for the Trainer.
train_dataset = PLMDataset(tokenized_trainset, train_df['label'])
val_dataset = PLMDataset(tokenized_devset, dev_df['label'])

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Training configuration for the current `model_name`:
#   * evaluate, log and checkpoint every WARMUP_STEPS optimizer steps;
#   * select checkpoints by eval_loss (lower is better) and reload the best one
#     when training ends;
#   * checkpoints are written under "<dir_path><model_name>_res/results".
training_args = TrainingArguments(
    # Forward the notebook-level seed. TrainingArguments has its own `seed`
    # (default 42) that is set at the beginning of training, so without this
    # argument editing SEED would not change the training run.
    seed=SEED,
    report_to="none",
    # Schedule and optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluation, logging and checkpointing share the same step interval.
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    logging_steps=WARMUP_STEPS,
    logging_dir=dir_path+"logs",
    # Best-checkpoint selection.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    output_dir=dir_path + model_name + "_res/results"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_coll,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    # Stop once eval_loss has failed to improve for 4 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

In [None]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
# NOTE(review): no execution count or output is saved for this cell -- re-run
# it so the notebook records the RoBERTa training log like the other models.
trainer.train()

In [50]:
# Evaluate on the Trainer's eval_dataset (val_dataset).
# NOTE(review): the saved output of this cell has execution count 50, i.e. it
# was produced after the "Best Model" cells (46-49) and not right after the
# training cell above -- it may reflect the reloaded checkpoint's Trainer
# rather than a fresh training run; confirm by re-running in order.
trainer.evaluate()

{'eval_loss': 0.5547690987586975,
 'eval_model_preparation_time': 0.0066,
 'eval_accuracy': 0.81,
 'eval_f1': 0.8007156768624658,
 'eval_runtime': 10.1975,
 'eval_samples_per_second': 29.419,
 'eval_steps_per_second': 3.726}

---

## Best Model

After several experiments and a comparison of the metrics on the development set, the best model is **RoBERTa base**.

In [46]:
# Saved checkpoint selected as the best model.
best_model_path = dir_path + "BestRes/results/checkpoint-782"

# Reload model, tokenizer and collator directly from the checkpoint folder.
model, tokenizer, data_coll = model_init(best_model_path)

# Re-encode both splits (train first, then dev) with the checkpoint's tokenizer.
tokenized_trainset, tokenized_devset = (
    tokenization(split_df, tokenizer=tokenizer) for split_df in (train_df, dev_df)
)

# The training set is rebuilt as well because the Trainer below is constructed
# with both a train and an eval dataset.
train_dataset = PLMDataset(tokenized_trainset, train_df['label'])
val_dataset = PLMDataset(tokenized_devset, dev_df['label'])

In [47]:
# Trainer configuration for the reloaded best checkpoint. It mirrors the
# settings of the training runs above.
training_args = TrainingArguments(
    # Forward the notebook-level seed. TrainingArguments has its own `seed`
    # (default 42) that is set at the beginning of training, so without this
    # argument editing SEED would not change the run.
    seed=SEED,
    report_to="none",
    # Schedule and optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluation, logging and checkpointing share the same step interval.
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    logging_steps=WARMUP_STEPS,
    logging_dir=dir_path+"logs",
    # Best-checkpoint selection.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Explicit output directory: some transformers releases require this
    # argument, and others fall back to a folder relative to the current
    # working directory. The folder that already holds the best checkpoint is
    # used so nothing is written outside `dir_path`.
    output_dir=dir_path + "BestRes/results"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_coll,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    # Stop once eval_loss has failed to improve for 4 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

In [48]:
# Score the reloaded checkpoint on the Trainer's eval_dataset (val_dataset)
# and keep the metrics dict for the summary printed in the next cell.
results = trainer.evaluate()

In [49]:
# Pull the two headline metrics out of the evaluation results.
accuracy, f1_score = results['eval_accuracy'], results['eval_f1']

# One print call, one line per entry, each score rounded to four decimals.
print(
    "RoBERTa base score:",
    f"Accuracy: {accuracy:.4f}",
    f"F1 score: {f1_score:.4f}",
    sep="\n",
)

RoBERTa base score:
Accuracy: 0.8100
F1 score: 0.8007
