# Transformers Text Classification

This notebook shows the training of three models from the BERT family: DistilBERT, BERT-base and RoBERTa-base. The goal is to evaluate these models in order to choose the one with the best performance on the validation set.

---

**ATTENTION!** The training arguments are set so that each trained model is saved in a directory named after its *model_name*!

If you do **NOT** want to **SAVE** them, **check the TrainingArguments parameters** in the section of each examined model, where they are initialized. **Check** those sections to **avoid** an undesired save strategy for logs and evaluation.

---

In [1]:
!pip install evaluate wikidata transformers accelerate



In [2]:
import re
import json
import tqdm
import torch
import evaluate
import requests
import numpy as np
import pandas as pd
import transformers
import concurrent.futures
from bs4 import BeautifulSoup
from wikidata.client import Client
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, Trainer, TrainingArguments, DataCollatorWithPadding, set_seed, EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Dataset

In [4]:
# Working directory of the experiment; every file name below is appended to it,
# so it must keep its trailing slash.
dir_path = "/mnt/c/Users/fede6/Desktop/HW1/"
train_path, dev_path, test_path = "train.csv", "valid.csv", "test_unlabeled.csv"

# One DataFrame per split, read as UTF-8 CSV.
train_df = pd.read_csv(f"{dir_path}{train_path}", encoding="utf-8")
dev_df = pd.read_csv(f"{dir_path}{dev_path}", encoding="utf-8")
test_df = pd.read_csv(f"{dir_path}{test_path}", encoding="utf-8")

In [5]:
def save_txt(filename, path, txt):
    """Write ``txt`` as indented UTF-8 JSON to the file ``path + filename``.

    ``path`` and ``filename`` are concatenated as plain strings, so ``path``
    must already end with a path separator. Non-ASCII characters are written
    verbatim rather than escaped.
    """
    target = path + filename
    with open(target, mode='w', encoding='utf-8') as sink:
        json.dump(txt, sink, indent=2, ensure_ascii=False)

def load_txt(filename, path):
    """Read the UTF-8 JSON file ``path + filename`` and return the parsed object.

    ``path`` and ``filename`` are concatenated as plain strings, so ``path``
    must already end with a path separator.
    """
    source = path + filename
    with open(source, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [6]:
# Module-level Wikidata client; get_wiki_link() calls client.get() on it.
client = Client()

def extract_qid(url):
    """Return the last '/'-separated segment of ``url`` after trimming surrounding whitespace."""
    _, _, tail = url.strip().rpartition("/")
    return tail

def get_wiki_link(qid, lang='en'):
    """Look up the ``{lang}wiki`` sitelink URL of the Wikidata entity ``qid``.

    Returns the URL string, or None when the entity has no sitelink for that
    wiki or when any exception is raised during the lookup (the error is
    printed, not propagated).
    """
    try:
        wd_entity = client.get(qid, load=True)
        site_entry = wd_entity.data.get('sitelinks', {}).get(f'{lang}wiki')
        if not site_entry:
            return None
        return site_entry['url']
    except Exception as e:
        print(f"ERROR retrieving Wikipedia link for {qid}: {e}")
        return None

def get_paragraphs(wikipedia_link, timeout=30):
    """Download a Wikipedia page and return the text of its ``<p>`` elements.

    Args:
        wikipedia_link: URL of the page. A falsy value (None or "") yields
            None immediately, without issuing a request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the calling worker thread indefinitely.

    Returns:
        The non-empty paragraphs joined by blank lines, or None when the link
        is missing, the content container is not found, no paragraph has
        text, or any exception occurs (the error is printed, not propagated).
    """
    # Nothing to fetch: avoid provoking (and printing) a requests exception.
    if not wikipedia_link:
        return None

    try:
        response = requests.get(wikipedia_link, allow_redirects=True, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # Pages without this exact container are treated as having no content.
        content = soup.find('div', class_='mw-content-ltr mw-parser-output')
        if not content:
            return None

        paragraphs = []
        for p in content.find_all('p'):
            text = p.get_text(separator=" ", strip=True)
            text = re.sub(r'\[\s*\d+\s*\]', '', text)  # drop numeric citation markers such as "[ 12 ]"
            text = re.sub(r'\s{2,}', ' ', text)         # collapse runs of whitespace
            if text:
                paragraphs.append(text)

        return "\n\n".join(paragraphs) if paragraphs else None

    except Exception as e:
        print(f"ERROR! Link {wikipedia_link}: {e}")
        return None

def process_item(index, item, df, lang):
    """Resolve one Wikidata item to its Wikipedia text, with a description fallback.

    Args:
        index: Position of the item, returned unchanged so the caller can
            place the result.
        item: Wikidata entity URL, as stored in ``df['item']``.
        df: DataFrame with at least the ``item`` and ``description`` columns.
        lang: Language code forwarded to ``get_wiki_link``.

    Returns:
        ``(index, text)`` where ``text`` is the scraped page text, or the
        item's ``description`` when the link or the content is missing or an
        error occurs.
    """
    def fallback_description():
        # First description whose row matches this item URL.
        return df['description'][df['item'] == item].values[0]

    qid = 'UNKNOWN'  # reported in the error message if extraction itself fails
    try:
        qid = extract_qid(item)
        link = get_wiki_link(qid, lang)

        # Check the link before fetching: there is nothing to download without it.
        if not link:
            print(f"WARNING: missing Wikipedia link for QID {qid}")
            return index, fallback_description()

        paragraph = get_paragraphs(link)
        if not paragraph:
            print(f"WARNING: empty or missing content for {link}")
            return index, fallback_description()

        return index, paragraph

    except Exception as e:
        print(f"ERROR processing item {item} (QID: {qid}): {e}")
        return index, fallback_description()

def text_extraction(df, lang='en', max_workers=16):
    """Run ``process_item`` over every entry of ``df['item']`` in a thread pool.

    Returns a list with one text per row of ``df``, in row order, regardless
    of the order in which the worker threads finish.
    """
    results = [None] * len(df)
    indexed_items = list(enumerate(df['item']))

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(process_item, position, item, df, lang)
            for position, item in indexed_items
        ]

        finished = concurrent.futures.as_completed(pending)
        for done in tqdm.tqdm(finished, total=len(pending)):
            # Each task returns its own position, so completion order does not matter.
            position, text = done.result()
            results[position] = text

    return results

In [7]:
# True: load the previously extracted texts from the JSON files in dir_path.
# False: run text_extraction() on the three splits instead.
EXTRACTED = True

In [9]:
if EXTRACTED:
    # Reuse the cached texts instead of scraping Wikipedia again.
    train_txt = load_txt(filename="train_txts.txt", path=dir_path)
    valid_txt = load_txt(filename="dev_txts.txt",   path=dir_path)
    test_txt = load_txt(filename="test_txts.txt",   path=dir_path)
else:
    train_txt = text_extraction(df=train_df)
    valid_txt = text_extraction(df=dev_df)
    test_txt  = text_extraction(df=test_df)

    # Persist the freshly extracted texts under the file names the branch
    # above reads, so the next run can set EXTRACTED = True.
    save_txt(filename="train_txts.txt", path=dir_path, txt=train_txt)
    save_txt(filename="dev_txts.txt",   path=dir_path, txt=valid_txt)
    save_txt(filename="test_txts.txt",  path=dir_path, txt=test_txt)

# Attach the text of each item to its split.
train_df['paragraph'] = train_txt
dev_df['paragraph'] = valid_txt
test_df['paragraph'] = test_txt

In [None]:
# Preview the first rows of the training split, including the 'paragraph' column.
train_df.head()

Unnamed: 0,item,name,description,type,category,subcategory,label,paragraph
0,http://www.wikidata.org/entity/Q32786,916,2012 film by M. Mohanan,entity,films,film,cultural exclusive,916 is a 2012 Indian Malayalam -language drama...
1,http://www.wikidata.org/entity/Q371,!!!,American dance-punk band from California,entity,music,musical group,cultural representative,!!! ( / tʃ ( ɪ ) k . tʃ ( ɪ ) k . tʃ ( ɪ ) k /...
2,http://www.wikidata.org/entity/Q3729947,¡Soborno!,Mort & Phil comic,entity,comics and anime,comics,cultural representative,¡Soborno! (English: Bribery! ) is a 1977 comic...
3,http://www.wikidata.org/entity/Q158611,+44,American band,entity,music,musical group,cultural representative,+44 (read as Plus Forty-four ) was an American...
4,http://www.wikidata.org/entity/Q280375,1 Monk Street,"building in Monmouth, Wales",entity,architecture,building,cultural exclusive,"1 Monk Street, Monmouth was built as a Working..."


### **Attention**!
The dataset class below also supports the unlabeled test set, so that predictions can be delivered on it: pass **is_labeled=False** for unlabeled data. To use a labeled test set, it is sufficient to set **is_labeled=True**.

In [10]:
# Class name -> integer id used as the classification target.
mapper = {
    'cultural agnostic':       2,
    'cultural representative': 1,
    'cultural exclusive':      0
}


class PLMDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings, optionally with labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence, as returned by the tokenizer.
        df: DataFrame the encodings were built from; its length defines the
            dataset size and, when labeled, its ``label`` column is mapped to
            integer ids through ``mapper``.
        is_labeled: When False no labels are read and items carry no
            ``labels`` entry.
    """

    def __init__(self, encodings, df, is_labeled=True):
        self.is_labeled = is_labeled
        self.encodings = encodings
        self.size = len(df)
        if self.is_labeled:
            self.labels = [mapper[label] for label in df['label']]

    def __getitem__(self, idx):
        # Tensors stay on the CPU: the Trainer moves each collated batch to the
        # model's device, and CPU samples remain compatible with DataLoader
        # workers and pinned memory.
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

        if self.is_labeled:
            item["labels"] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return self.size

### Metrics

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: Pair ``(logits, labels)`` as passed by the Trainer.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        holding plain Python floats.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # The scikit-learn functions are called directly so that no metric script
    # has to be loaded again at every evaluation step.
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro"
    )
    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }

## Model initialization

In [12]:
def model_init(model_name, n_classes=3, padding=True, truncation=True):
    """Load a sequence-classification model with its tokenizer and padding collator.

    ``model_name`` is forwarded to ``from_pretrained`` for both the model and
    the tokenizer; the model gets ``n_classes`` output labels and is moved to
    ``device``. ``padding`` and ``truncation`` are accepted but not used here.

    Returns:
        Tuple ``(model, tokenizer, data_collator)``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=n_classes,
        ignore_mismatched_sizes=True,
        output_attentions=False,
        output_hidden_states=False,
    )
    model = classifier.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer, DataCollatorWithPadding(tokenizer=tokenizer)

def tokenization(df, tokenizer):
    """Tokenize the ``paragraph`` column of ``df`` with padding and truncation enabled."""
    texts = df["paragraph"].to_list()
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

#### Training global parameters

In [13]:
# Hyper-parameters shared by every TrainingArguments cell in this notebook.
SEED = 42
N_EPOCHS = 5
BATCH_SIZE = 8
# NOTE: besides the warm-up length, this value is also passed as eval_steps,
# save_steps and logging_steps in the TrainingArguments cells.
WARMUP_STEPS = 391
WEIGHT_DECAY = 0.01
LEARNING_RATE = 1e-5

# Seed the run for reproducibility.
set_seed(SEED)

### DistilBERT

In [13]:
# Checkpoint id passed to model_init(); also used to build the Trainer's output_dir.
model_name = "distilbert-base-uncased"

In [None]:
# Release the GPU memory cached by PyTorch before loading a model.
torch.cuda.empty_cache()

In [20]:
model, tokenizer, data_coll = model_init(model_name)

# Training split: tokenize the paragraphs, then wrap them together with their labels.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
train_dataset = PLMDataset(tokenized_trainset, df=train_df, is_labeled=True)

# Development split: same pipeline.
tokenized_devset = tokenization(dev_df, tokenizer=tokenizer)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, is_labeled=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **TRAINING ARGUMENTS**

In [16]:
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir=dir_path + model_name + "_res/results",
    logging_dir=dir_path+"logs",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    # Logging, evaluation and checkpointing share the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss, reloaded when training ends.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_coll,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

#### Training & Evaluation

In [17]:
# Fine-tune the model; EarlyStoppingCallback(early_stopping_patience=4) can end
# the run before N_EPOCHS epochs are completed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8816,0.621045,0.726667,0.705515
782,0.5699,0.559214,0.756667,0.744374
1173,0.4794,0.534775,0.776667,0.762704
1564,0.4515,0.600148,0.766667,0.751146
1955,0.3621,0.636,0.78,0.768821
2346,0.3514,0.750408,0.76,0.745849
2737,0.2756,0.81658,0.753333,0.740811


TrainOutput(global_step=2737, training_loss=0.4816564212195578, metrics={'train_runtime': 1059.653, 'train_samples_per_second': 29.496, 'train_steps_per_second': 3.69, 'total_flos': 2898570840966144.0, 'train_loss': 0.4816564212195578, 'epoch': 3.5})

In [None]:
# Evaluate on val_dataset. With load_best_model_at_end=True the Trainer holds the
# checkpoint with the lowest eval_loss at this point, so these are its metrics.
trainer.evaluate()

{'eval_loss': 0.534774661064148,
 'eval_accuracy': 0.7766666666666666,
 'eval_f1': 0.762704101582325,
 'eval_runtime': 5.0213,
 'eval_samples_per_second': 59.745,
 'eval_steps_per_second': 7.568,
 'epoch': 3.5}

### BERT

In [None]:
# Checkpoint id passed to model_init(); also used to build the Trainer's output_dir.
model_name = "bert-base-uncased"

In [None]:
model, tokenizer, data_coll = model_init(model_name)

# Training split: tokenize the paragraphs, then wrap them together with their labels.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
train_dataset = PLMDataset(tokenized_trainset, df=train_df, is_labeled=True)

# Development split: same pipeline.
tokenized_devset = tokenization(dev_df, tokenizer=tokenizer)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, is_labeled=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **TRAINING ARGUMENTS**

In [None]:
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir=dir_path + model_name + "_res/results",
    logging_dir=dir_path+"logs",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    # Logging, evaluation and checkpointing share the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss, reloaded when training ends.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_coll,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

#### Training & Evaluation

In [None]:
# Fine-tune the model; EarlyStoppingCallback(early_stopping_patience=4) can end
# the run before N_EPOCHS epochs are completed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8737,0.654126,0.666667,0.631042
782,0.5609,0.566412,0.763333,0.74707
1173,0.4629,0.61511,0.753333,0.737141
1564,0.4401,0.640821,0.756667,0.740719
1955,0.3221,0.705668,0.763333,0.750575
2346,0.3245,0.793203,0.753333,0.740052


TrainOutput(global_step=2346, training_loss=0.49736401166858984, metrics={'train_runtime': 1744.8959, 'train_samples_per_second': 17.912, 'train_steps_per_second': 2.241, 'total_flos': 4934165922653184.0, 'train_loss': 0.49736401166858984, 'epoch': 3.0})

In [None]:
# Evaluate on val_dataset. With load_best_model_at_end=True the Trainer holds the
# checkpoint with the lowest eval_loss at this point, so these are its metrics.
trainer.evaluate()

{'eval_loss': 0.5664123296737671,
 'eval_accuracy': 0.7633333333333333,
 'eval_f1': 0.747069841162474,
 'eval_runtime': 10.7632,
 'eval_samples_per_second': 27.873,
 'eval_steps_per_second': 3.531,
 'epoch': 3.0}

### RoBERTa

In [None]:
# Release the GPU memory cached by PyTorch before loading the next model.
torch.cuda.empty_cache()

In [None]:
# Checkpoint id passed to model_init(); also used to build the Trainer's output_dir.
model_name = "roberta-base"

In [None]:
model, tokenizer, data_coll = model_init(model_name)

# Training split: tokenize the paragraphs, then wrap them together with their labels.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
train_dataset = PLMDataset(tokenized_trainset, df=train_df, is_labeled=True)

# Development split: same pipeline.
tokenized_devset = tokenization(dev_df, tokenizer=tokenizer)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, is_labeled=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **TRAINING ARGUMENTS**

In [None]:
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir=dir_path + model_name + "_res/results",
    logging_dir=dir_path+"logs",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    # Logging, evaluation and checkpointing share the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss, reloaded when training ends.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_coll,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

#### Training & Evaluation

In [None]:
# Fine-tune the model; EarlyStoppingCallback(early_stopping_patience=4) can end
# the run before N_EPOCHS epochs are completed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8779,0.602486,0.74,0.721386
782,0.5417,0.575793,0.803333,0.794663
1173,0.4671,0.578462,0.78,0.768546
1564,0.4628,0.618061,0.79,0.776604
1955,0.3564,0.73101,0.78,0.771274
2346,0.3608,0.866831,0.75,0.740185


TrainOutput(global_step=2346, training_loss=0.5111339468277121, metrics={'train_runtime': 1766.1796, 'train_samples_per_second': 17.696, 'train_steps_per_second': 2.214, 'total_flos': 4934165922653184.0, 'train_loss': 0.5111339468277121, 'epoch': 3.0})

In [None]:
# Evaluate on val_dataset. With load_best_model_at_end=True the Trainer holds the
# checkpoint with the lowest eval_loss at this point, so these are its metrics.
trainer.evaluate()

{'eval_loss': 0.5757932066917419,
 'eval_accuracy': 0.8033333333333333,
 'eval_f1': 0.7946633866399488,
 'eval_runtime': 9.9552,
 'eval_samples_per_second': 30.135,
 'eval_steps_per_second': 3.817,
 'epoch': 3.0}

After different trials, the RoBERTa-base model showed better performance than the DistilBERT and BERT-base models.  
All the models showed that after about 1 epoch the training loss and the validation loss start to diverge. In order to show this behavior, the models have been trained for at most 5 epochs, and the early stopping callback has been used to interrupt the training when the validation loss does not improve for 4 consecutive evaluations (one every *eval_steps* steps).  
In the next section [**Best Model**], the predictions of the best trained RoBERTa-base model are shown and saved to a CSV file.

---

## Best Model

After several experiments, judging by the metrics on the development set, the best model is the **RoBERTa-base model**. This section presents its evaluation metrics on the validation set and then its predictions on the first elements of the test set. The best model's predictions are saved to a CSV file, delivered as the final result.

In [14]:
# Checkpoint directory of the selected model, loaded through model_init().
best_model_path = dir_path + "BestRes/results/checkpoint-782"

model, tokenizer, data_coll = model_init(best_model_path)

# Training split: tokenize the paragraphs, then wrap them together with their labels.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
train_dataset = PLMDataset(tokenized_trainset, df=train_df, is_labeled=True)

# Development split: same pipeline.
tokenized_devset = tokenization(dev_df, tokenizer=tokenizer)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, is_labeled=True)

#### **TRAINING ARGUMENTS**

In [18]:
training_args = TrainingArguments(
    report_to="none",
    # Optimisation schedule (same values as in the training sections).
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    # Logging and evaluation share the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    # Lower validation loss is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_coll,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

#### Evaluation results

In [19]:
# Evaluate the loaded checkpoint on val_dataset; the returned dict holds the
# compute_metrics() values under 'eval_'-prefixed keys.
results = trainer.evaluate()

In [26]:
# Pull the four validation metrics out of the evaluation dict.
accuracy, precision, recall, f1_score = (
    results['eval_accuracy'],
    results['eval_precision'],
    results['eval_recall'],
    results['eval_f1'],
)

# One metric per line, four decimals each.
print(
    f"RoBERTa base score:",
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1 score:  {f1_score:.4f}",
    sep="\n",
)

RoBERTa base score:
Accuracy:  0.8100
Precision: 0.8030
Recall:    0.7989
F1 score:  0.8007


#### **TEST PHASE**

If a labeled test set is available, set **is_labeled=True** to evaluate the model's performance on it.

In [24]:
# The test split is wrapped with is_labeled=False, so its items carry no "labels" entry.
test_dataset = PLMDataset(tokenization(test_df, tokenizer=tokenizer), df=test_df, is_labeled=False)

**IS_TEST_LABELED** mirrors the dataset's **is_labeled** flag; to evaluate the model on a labeled test set, set the **is_labeled** parameter accordingly when initializing `PLMDataset`.

In [25]:
IS_TEST_LABELED = test_dataset.is_labeled

# Metrics can only be computed when the test split carries labels.
if IS_TEST_LABELED:
    test_results = trainer.evaluate(eval_dataset=test_dataset)

    test_accuracy, test_precision, test_recall, test_f1_score = (
        test_results['eval_accuracy'],
        test_results['eval_precision'],
        test_results['eval_recall'],
        test_results['eval_f1'],
    )

    # One metric per line, four decimals each.
    print(
        f"RoBERTa base score:",
        f"Accuracy:  {test_accuracy:.4f}",
        f"Precision: {test_precision:.4f}",
        f"Recall:    {test_recall:.4f}",
        f"F1 score:  {test_f1_score:.4f}",
        sep="\n",
    )

In [28]:
# Run inference on the test split.
preds_struct = trainer.predict(test_dataset=test_dataset)
# For each example keep the index of the largest score along axis 1.
# NOTE(review): assumes preds_struct.predictions is a 2-D (examples, classes) array - confirm.
predictions = np.argmax(preds_struct.predictions, axis=1)

The predicted labels are mapped back to the corresponding cultural class names in order to provide a more readable table. This mapping can be disabled by setting **REMAP = False**.

In [31]:
REMAP = True

# Identifier columns of the test split plus one prediction per row.
results = test_df[['item', 'name']].assign(predictions=predictions)

if REMAP:
    # Integer class id -> class name.
    remap_dict = {
        0: 'cultural exclusive',
        1: 'cultural representative',
        2: 'cultural agnostic'
    }
    results['predictions'] = results['predictions'].map(remap_dict)

results.head(15)

Unnamed: 0,item,name,predictions
0,http://www.wikidata.org/entity/Q2427430,Northeast Flag Replacement,cultural exclusive
1,http://www.wikidata.org/entity/Q125482,imam,cultural representative
2,http://www.wikidata.org/entity/Q15789,FC Bayern Munich,cultural representative
3,http://www.wikidata.org/entity/Q582496,Fome Zero,cultural exclusive
4,http://www.wikidata.org/entity/Q572811,Anthony Award,cultural exclusive
5,http://www.wikidata.org/entity/Q1866547,Livraria Bertrand,cultural exclusive
6,http://www.wikidata.org/entity/Q19081,prokaryotes,cultural agnostic
7,http://www.wikidata.org/entity/Q474090,narrative poetry,cultural agnostic
8,http://www.wikidata.org/entity/Q1266300,Neue Slowenische Kunst,cultural exclusive
9,http://www.wikidata.org/entity/Q193654,short-track speed skating,cultural agnostic


In [None]:
# Write the prediction table next to the input data.
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column - confirm this matches the expected delivery format.
results.to_csv(dir_path + "RoBERTa_predictions.csv")