In [None]:
import numpy as np
import pandas as pd
from torch.nn.functional import softmax
from datasets import DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Load the "liar" dataset. trust_remote_code=True opts in to executing the
# loading script shipped with the dataset repository, so only use it with a
# source you trust.
dataset = load_dataset("liar", trust_remote_code=True)
# Bare expression: the notebook shows the loaded object's repr as cell output.
dataset

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the model and
# the tokenizer, so the two always come from the same pretrained weights/vocab.
model_path_name = 'distilbert/distilbert-base-uncased'

In [None]:
# Class id -> human-readable label name.
# NOTE(review): presumably mirrors the integer encoding of the dataset's
# `label` column; verify against dataset['train'].features['label'].
idx_labels = {
    0: 'false',
    1: 'half-true',
    2: 'mostly-true',
    3: 'true',
    4: 'barely-true',
    5: 'pants-fire',
}

# Inverse lookup (label name -> class id), derived from the mapping above so
# the two dictionaries cannot drift apart.
labels_idx = {label_name: class_id for class_id, label_name in idx_labels.items()}

In [None]:
# Sequence-classification model built from the pretrained checkpoint, with one
# output per entry of `idx_labels`. The id<->label mappings are handed to
# `from_pretrained` alongside the label count.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path_name,
    num_labels=len(idx_labels),
    id2label=idx_labels,
    label2id=labels_idx,
)

In [None]:
# Initial freezing of the DistilBERT encoder: every parameter under
# `model.distilbert` stops tracking gradients. Parameters that live outside
# `model.distilbert` are left untouched.
#
# To inspect the resulting state, iterate `model.named_parameters()` and print
# each parameter's name together with its `requires_grad` flag.
for encoder_param in model.distilbert.parameters():
    encoder_param.requires_grad_(False)

In [None]:
# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path_name)

In [None]:
def preprocess_func(data):
    """Tokenize the ``statement`` field of a (batched) dataset example.

    Args:
        data: Mapping with a ``'statement'`` entry. When used through
            ``Dataset.map(..., batched=True)`` this is a list of strings.

    Returns:
        The output of the module-level ``tokenizer`` for those statements,
        truncated to at most 128 tokens per statement.

    Padding is intentionally *not* applied here. The notebook builds a
    ``DataCollatorWithPadding`` and passes it to the ``Trainer``, which pads
    each batch only up to its longest sequence. Padding every example to
    ``max_length`` at this stage would make that collator a no-op and spend
    compute on pad tokens for every short statement.
    """
    return tokenizer(data['statement'], truncation=True, max_length=128)

In [None]:
# Raw dataset columns that are not needed once the text is tokenized. Dropping
# them leaves only the label, the input ids and the attention mask.
features = [
    'id', 'statement', 'subject', 'speaker', 'job_title', 'state_info',
    'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
    'context',
]

# Tokenize every split in batches, then drop the unused columns.
tokenized_data = dataset.map(preprocess_func, batched=True).remove_columns(features)

# Have the remaining columns returned as torch tensors when indexed.
tokenized_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

tokenized_data

In [None]:
# Batch collator built around the tokenizer; it pads the examples of a batch
# to a common length and is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores on the last axis; ``labels`` holds the true
            class ids.

    Returns:
        dict with the keys ``'accuracy:'`` and ``'macro_f1 score:'``. The key
        spelling, including the trailing colons, is kept exactly as before so
        anything reading these metric names keeps working.
    """
    logits, labels = eval_pred
    # Softmax is monotonic, so the arg-max of the raw logits is already the
    # predicted class. Working on the logits with NumPy also avoids calling
    # torch's `softmax` with a NumPy-style `axis=` keyword and the undefined
    # bare `argmax`, both of which raised before any metric was produced.
    preds = np.argmax(logits, axis=-1)
    return {
        'accuracy:': accuracy_score(labels, preds),
        'macro_f1 score:': f1_score(labels, preds, average="macro")
    }

In [None]:
# Optimisation hyperparameters: learning rate, per-device batch size, epochs.
lr, batch_size, num_epochs = 2e-4, 16, 5

# Logging, evaluation and checkpointing all run once per epoch, and the best
# checkpoint is reloaded when training ends. No `metric_for_best_model` is
# set, so the library's default selection criterion applies.
training_args = TrainingArguments(
    output_dir="output/distilbert-liar-classifier",
    # optimisation
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # per-epoch schedule
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, the training configuration, the train/validation splits and
# the batching/metric helpers into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    # data
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    # batching and evaluation helpers
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the configuration above. As the last expression of the
# cell, the value returned by `train()` is shown as the cell output.
trainer.train()