# Transformers Text Classification

In [1]:
import concurrent.futures
import json
import os
import re

import evaluate
import numpy as np
import pandas as pd
import requests
import torch
import tqdm
import transformers
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, Trainer, TrainingArguments, DataCollatorWithPadding, set_seed, EarlyStoppingCallback
from wikidata.client import Client

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the GPU when PyTorch reports one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

### Dataset

In [3]:
# Directory holding the CSV splits, cached texts, logs and checkpoints.
# Set the HW1_DIR environment variable to point somewhere else; the original
# location stays the default. os.path.join(..., "") guarantees the trailing
# separator that the `dir_path + filename` concatenations in this notebook rely on.
dir_path = os.path.join(os.environ.get("HW1_DIR", "/mnt/c/Users/fede6/Desktop/HW1/"), "")
train_path = "train.csv"
dev_path = "valid.csv"
test_path = "test_unlabeled.csv"

# Load the three splits.
train_df = pd.read_csv(dir_path + train_path, encoding='utf-8')
dev_df = pd.read_csv(dir_path + dev_path, encoding='utf-8')
test_df = pd.read_csv(dir_path + test_path, encoding='utf-8')

In [4]:
def save_txt(filename, path, txt):
    """Serialize `txt` as pretty-printed UTF-8 JSON into the file `path + filename`.

    `path` is concatenated with `filename` as-is, so it must already end with
    a path separator. Non-ASCII characters are written unescaped.
    """
    target = path + filename
    serialized = json.dumps(txt, ensure_ascii=False, indent=2)
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(serialized)

def load_txt(filename, path):
    """Read the UTF-8 JSON file `path + filename` and return the decoded object.

    `path` is concatenated with `filename` as-is, so it must already end with
    a path separator.
    """
    source = path + filename
    with open(source, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)

In [5]:
# Module-level Wikidata client; get_wiki_link uses it to fetch entities.
client = Client()

def extract_qid(url):
    """Return the last '/'-separated segment of `url` (the QID of an entity URL).

    Surrounding whitespace is stripped first.
    """
    trimmed = url.strip()
    return trimmed.rsplit("/", 1)[-1]

def get_wiki_link(qid, lang='en'):
    """Look up the `<lang>wiki` sitelink URL of the Wikidata entity `qid`.

    Returns the URL string, or None when the entity has no sitelink for that
    language or when anything goes wrong (the error is printed, not raised).
    """
    try:
        sitelinks = client.get(qid, load=True).data.get('sitelinks', {})
        page_info = sitelinks.get(f'{lang}wiki')
        if not page_info:
            return None
        return page_info['url']
    except Exception as e:
        # Best effort: report the failure and let the caller fall back.
        print(f"ERROR retrieving Wikipedia link for {qid}: {e}")
        return None

def get_paragraphs(wikipedia_link, timeout=30):
    """Download a Wikipedia page and return the text of its paragraphs.

    Args:
        wikipedia_link: URL of the page. A falsy value (None or "") returns
            None immediately without issuing a request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread forever.

    Returns:
        The non-empty `<p>` texts of the main content div joined by blank
        lines, or None when the page has no such div, has no non-empty
        paragraph, or the request/parsing fails (the error is printed).
    """
    if not wikipedia_link:
        return None

    try:
        response = requests.get(wikipedia_link, allow_redirects=True, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('div', class_='mw-content-ltr mw-parser-output')
        if not content:
            return None

        paragraphs = []
        for p in content.find_all('p'):
            text = p.get_text(separator=" ", strip=True)
            # Drop numeric citation markers such as "[ 12 ]".
            text = re.sub(r'\[\s*\d+\s*\]', '', text)
            # Collapse runs of whitespace left behind by the removals.
            text = re.sub(r'\s{2,}', ' ', text)
            if text:
                paragraphs.append(text)

        return "\n\n".join(paragraphs) if paragraphs else None

    except Exception as e:
        # Best effort: report the failure and let the caller fall back.
        print(f"ERROR! Link {wikipedia_link}: {e}")
        return None

def process_item(index, item, df, lang):
    """Resolve one Wikidata item to its Wikipedia text, falling back to its description.

    Args:
        index: Position of the item in the caller's result list; returned unchanged.
        item: Wikidata entity URL as stored in df['item'].
        df: DataFrame with 'item' and 'description' columns.
        lang: Wikipedia language code passed to get_wiki_link.

    Returns:
        (index, text), where text is the page's paragraphs or, when the link or
        the content is missing or an error occurs, the description of the first
        row of `df` whose 'item' equals `item`.
    """
    # Placeholder shown in the error message if extract_qid itself fails.
    qid = 'UNKNOWN'

    def fallback():
        return index, df['description'][df['item'] == item].values[0]

    try:
        qid = extract_qid(item)
        link = get_wiki_link(qid, lang)

        # Check the link BEFORE fetching: requesting a missing link can only fail.
        if not link:
            print(f"WARNING: missing Wikipedia link for QID {qid}")
            return fallback()

        paragraph = get_paragraphs(link)
        if not paragraph:
            print(f"WARNING: empty or missing content for {link}")
            return fallback()

        return index, paragraph

    except Exception as e:
        print(f"ERROR processing item {item} (QID: {qid}): {e}")
        return fallback()

def text_extraction(df, lang='en', max_workers=16):
    """Run process_item over every entry of df['item'] in a thread pool.

    Returns a list with one text per row, in the same order as the rows of
    `df`, regardless of the order in which the workers finish.
    """
    results = [None] * len(df)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for idx, item in enumerate(df['item']):
            futures[executor.submit(process_item, idx, item, df, lang)] = idx

        completed = concurrent.futures.as_completed(futures)
        for done in tqdm.tqdm(completed, total=len(futures)):
            # Each worker returns its own position, so completion order is irrelevant.
            position, paragraph = done.result()
            results[position] = paragraph

    return results

In [6]:
# True: load previously extracted paragraphs from disk.
# False: extract them again with text_extraction (see the next cell).
EXTRACTED = True

In [7]:
if EXTRACTED:
    # Reuse the texts cached by a previous extraction run.
    train_txt = load_txt(filename="train_txts.txt", path=dir_path)
    valid_txt = load_txt(filename="dev_txts.txt",   path=dir_path)
    test_txt = load_txt(filename="test_txts.txt",   path=dir_path)
else:
    train_txt = text_extraction(df=train_df)
    valid_txt = text_extraction(df=dev_df)
    test_txt  = text_extraction(df=test_df)

    # Persist the freshly extracted texts under the file names the branch above
    # reads; otherwise EXTRACTED = True has nothing to load on the next run.
    save_txt(filename="train_txts.txt", path=dir_path, txt=train_txt)
    save_txt(filename="dev_txts.txt",   path=dir_path, txt=valid_txt)
    save_txt(filename="test_txts.txt",  path=dir_path, txt=test_txt)

# Attach the texts to their splits as the model input column.
train_df['paragraph'] = train_txt
dev_df['paragraph'] = valid_txt
test_df['paragraph'] = test_txt

In [8]:
# Preview the first rows of the training split, including the 'paragraph' column.
train_df.head()

Unnamed: 0,item,name,description,type,category,subcategory,label,paragraph
0,http://www.wikidata.org/entity/Q32786,916,2012 film by M. Mohanan,entity,films,film,cultural exclusive,916 is a 2012 Indian Malayalam -language drama...
1,http://www.wikidata.org/entity/Q371,!!!,American dance-punk band from California,entity,music,musical group,cultural representative,!!! ( / tʃ ( ɪ ) k . tʃ ( ɪ ) k . tʃ ( ɪ ) k /...
2,http://www.wikidata.org/entity/Q3729947,¡Soborno!,Mort & Phil comic,entity,comics and anime,comics,cultural representative,¡Soborno! (English: Bribery! ) is a 1977 comic...
3,http://www.wikidata.org/entity/Q158611,+44,American band,entity,music,musical group,cultural representative,+44 (read as Plus Forty-four ) was an American...
4,http://www.wikidata.org/entity/Q280375,1 Monk Street,"building in Monmouth, Wales",entity,architecture,building,cultural exclusive,"1 Monk Street, Monmouth was built as a Working..."


### **Attention**!
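The unlabeled test set has no labels, so predictions on it must be produced from a dataset that carries no labels. To build such a dataset, pass **isTest=True** to `PLMDataset`; with the default **isTest=False** each item also carries its label, as needed for training and evaluation.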

In [9]:
# Label name -> integer class id used as the classification target.
mapper = {
    'cultural agnostic':       2,
    'cultural representative': 1,
    'cultural exclusive':      0
}

class PLMDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence
            (anything with `.items()` whose values are indexable by position).
        df: DataFrame aligned row-by-row with `encodings`. Its 'label' column
            is read through `mapper` unless `isTest` is truthy.
        isTest: When truthy, no labels are stored or returned.

    Items are returned as CPU tensors. The Trainer moves each collated batch
    to the model's device itself; returning CUDA tensors from a dataset is
    redundant and is incompatible with pinned memory and worker processes.
    """

    def __init__(self, encodings, df, isTest=False):
        self.isTest = isTest
        self.encodings = encodings
        self.size = len(df)
        # Same truthiness test as __getitem__, so both always agree.
        if not self.isTest:
            # Raises KeyError for a label that is not in `mapper`.
            self.labels = [mapper[label] for label in df['label']]

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if not self.isTest:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return self.size

### Metrics

In [10]:
# Metrics already loaded by compute_metrics, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Load an `evaluate` metric on first use and reuse it afterwards."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for a Trainer evaluation.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        {"accuracy": ..., "f1": ...}

    The metrics are loaded once and cached: this function runs at every
    evaluation step, and reloading them each time repeats the metric lookup
    for no benefit.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="macro")["f1"]
    return {"accuracy": accuracy, "f1": f1}

## Model initialization

In [11]:
def model_init(model_name, n_classes=3, padding=True, truncation=True):
    """Build the classifier, tokenizer and padding collator for `model_name`.

    The sequence-classification model is loaded with `n_classes` output labels
    and moved to the global `device`. `padding` and `truncation` are accepted
    but not used by this function.

    Returns:
        (model, tokenizer, data_collator)
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=n_classes,
        ignore_mismatched_sizes=True,
        output_attentions=False,
        output_hidden_states=False,
    )
    classifier = classifier.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return classifier, tokenizer, DataCollatorWithPadding(tokenizer=tokenizer)

def tokenization(df, tokenizer, padding=True, truncation=True, max_length=None):
    """Tokenize the 'paragraph' column of `df`.

    Args:
        df: DataFrame with a 'paragraph' column.
        tokenizer: Callable tokenizer; receives the list of texts plus the
            keyword arguments below.
        padding, truncation, max_length: Forwarded to the tokenizer. The
            defaults reproduce the previous hard-coded behaviour.

    Returns:
        Whatever the tokenizer returns for the list of texts.

    Missing paragraphs (None/NaN) are replaced by empty strings so a single
    missing text does not make the whole tokenizer call fail.
    """
    texts = df["paragraph"].fillna("").astype(str).to_list()
    return tokenizer(texts, padding=padding, truncation=truncation, max_length=max_length)

#### Training global parameters

In [12]:
# Hyperparameters shared by every training run in this notebook.
SEED = 42
N_EPOCHS = 5             # upper bound; the runs also use an early stopping callback
BATCH_SIZE = 8           # per-device training batch size
WARMUP_STEPS = 391       # also reused as the eval/save/logging step interval
WEIGHT_DECAY = 0.01
LEARNING_RATE = 1e-5

# Seed the random number generators for reproducibility.
set_seed(SEED)

### DistilBERT

In [13]:
# Checkpoint name passed to model_init; also used to build the training output_dir.
model_name = "distilbert-base-uncased"

In [14]:
# Release cached GPU memory before loading the next model.
torch.cuda.empty_cache()

In [15]:
# Load the model, its tokenizer and the padding collator for `model_name`.
model, tokenizer, data_coll = model_init(model_name)

# Tokenize the 'paragraph' column of the train and dev splits.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
tokenized_devset =   tokenization(dev_df, tokenizer=tokenizer)

# Pair the encodings with their labels (isTest=False keeps the labels).
train_dataset = PLMDataset(tokenized_trainset, df=train_df, isTest=False)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, isTest=False)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker.
    output_dir=dir_path + model_name + "_res/results",
    logging_dir=dir_path + "logs",
    report_to="none",
    # Optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Log, evaluate and checkpoint on the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss; it is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_coll,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 4 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

In [17]:
# Fine-tune DistilBERT; the table below reports loss, accuracy and F1 every 391 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8816,0.620792,0.726667,0.705515
782,0.57,0.559818,0.753333,0.741703
1173,0.4793,0.534733,0.766667,0.752723
1564,0.4514,0.600848,0.763333,0.748466
1955,0.3618,0.636548,0.78,0.768821
2346,0.3511,0.752649,0.76,0.745714
2737,0.2753,0.817434,0.753333,0.740811


TrainOutput(global_step=2737, training_loss=0.481494403207742, metrics={'train_runtime': 1063.3355, 'train_samples_per_second': 29.393, 'train_steps_per_second': 3.677, 'total_flos': 2898570840966144.0, 'train_loss': 0.481494403207742, 'epoch': 3.5})

In [18]:
# Evaluate on val_dataset (the eval_dataset given to the Trainer).
# NOTE(review): the reported eval_loss equals the step-1173 row of the training
# table, i.e. presumably the best checkpoint restored by load_best_model_at_end.
trainer.evaluate()

{'eval_loss': 0.5347334742546082,
 'eval_accuracy': 0.7666666666666667,
 'eval_f1': 0.7527233115468409,
 'eval_runtime': 6.7134,
 'eval_samples_per_second': 44.687,
 'eval_steps_per_second': 5.66,
 'epoch': 3.5}

### BERT

In [19]:
# Checkpoint name passed to model_init; also used to build the training output_dir.
model_name = "bert-base-uncased"

In [20]:
# Load the model, its tokenizer and the padding collator for `model_name`.
model, tokenizer, data_coll = model_init(model_name)

# Re-tokenize both splits with this model's own tokenizer.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
tokenized_devset =   tokenization(dev_df, tokenizer=tokenizer)

# Pair the encodings with their labels (isTest=False keeps the labels).
train_dataset = PLMDataset(tokenized_trainset, df=train_df, isTest=False)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, isTest=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker.
    output_dir=dir_path + model_name + "_res/results",
    logging_dir=dir_path + "logs",
    report_to="none",
    # Optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Log, evaluate and checkpoint on the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss; it is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_coll,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 4 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

In [22]:
# Fine-tune BERT; the table below reports loss, accuracy and F1 every 391 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8737,0.654126,0.666667,0.631042
782,0.5609,0.566412,0.763333,0.74707
1173,0.4629,0.61511,0.753333,0.737141
1564,0.4401,0.640821,0.756667,0.740719
1955,0.3221,0.705668,0.763333,0.750575
2346,0.3245,0.793203,0.753333,0.740052


TrainOutput(global_step=2346, training_loss=0.49736401166858984, metrics={'train_runtime': 1744.8959, 'train_samples_per_second': 17.912, 'train_steps_per_second': 2.241, 'total_flos': 4934165922653184.0, 'train_loss': 0.49736401166858984, 'epoch': 3.0})

In [23]:
# Evaluate on val_dataset (the eval_dataset given to the Trainer).
# NOTE(review): the reported eval_loss equals the step-782 row of the training
# table, i.e. presumably the best checkpoint restored by load_best_model_at_end.
trainer.evaluate()

{'eval_loss': 0.5664123296737671,
 'eval_accuracy': 0.7633333333333333,
 'eval_f1': 0.747069841162474,
 'eval_runtime': 10.7632,
 'eval_samples_per_second': 27.873,
 'eval_steps_per_second': 3.531,
 'epoch': 3.0}

## RoBERTa

In [24]:
# Release cached GPU memory before loading the next model.
torch.cuda.empty_cache()

In [25]:
# Checkpoint name passed to model_init; also used to build the training output_dir.
model_name = "roberta-base"

In [30]:
# Load the model, its tokenizer and the padding collator for `model_name`.
model, tokenizer, data_coll = model_init(model_name)

# Re-tokenize both splits with this model's own tokenizer.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
tokenized_devset =   tokenization(dev_df, tokenizer=tokenizer)

# Pair the encodings with their labels (isTest=False keeps the labels).
train_dataset = PLMDataset(tokenized_trainset, df=train_df, isTest=False)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, isTest=False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker.
    output_dir=dir_path + model_name + "_res/results",
    logging_dir=dir_path + "logs",
    report_to="none",
    # Optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Log, evaluate and checkpoint on the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss; it is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_coll,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 4 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

In [32]:
# Fine-tune RoBERTa; the table below reports loss, accuracy and F1 every 391 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
391,0.8779,0.602486,0.74,0.721386
782,0.5417,0.575793,0.803333,0.794663
1173,0.4671,0.578462,0.78,0.768546
1564,0.4628,0.618061,0.79,0.776604
1955,0.3564,0.73101,0.78,0.771274
2346,0.3608,0.866831,0.75,0.740185


TrainOutput(global_step=2346, training_loss=0.5111339468277121, metrics={'train_runtime': 1766.1796, 'train_samples_per_second': 17.696, 'train_steps_per_second': 2.214, 'total_flos': 4934165922653184.0, 'train_loss': 0.5111339468277121, 'epoch': 3.0})

In [33]:
# Evaluate on val_dataset (the eval_dataset given to the Trainer).
# NOTE(review): the reported eval_loss equals the step-782 row of the training
# table, i.e. presumably the best checkpoint restored by load_best_model_at_end.
trainer.evaluate()

{'eval_loss': 0.5757932066917419,
 'eval_accuracy': 0.8033333333333333,
 'eval_f1': 0.7946633866399488,
 'eval_runtime': 9.9552,
 'eval_samples_per_second': 30.135,
 'eval_steps_per_second': 3.817,
 'epoch': 3.0}

### Test phase

After different trials, the RoBERTa-base model showed better performance than the DistilBERT and BERT-base models.  
All the models showed that after about 1 epoch the training loss and the validation loss start to diverge. In order to show this behavior, the models have been trained for at most 5 epochs, and the early stopping callback has been used to interrupt the training when the validation loss has not improved for 4 consecutive evaluations (*eval_steps*).  
In this section, the predictions on the first elements of the test set made by the last training experiment of RoBERTa-base are shown, while in the next section [**Best Model**] the predictions of the best trained RoBERTa-base model are shown and saved as a CSV file.

In [34]:
# Unlabeled test split: isTest=True builds the dataset without labels.
test_encodings = tokenization(test_df, tokenizer=tokenizer)
test_dataset = PLMDataset(test_encodings, df=test_df, isTest=True)

# Predicted class id = index of the largest logit of each row.
preds_struct = trainer.predict(test_dataset=test_dataset)
predictions = np.argmax(preds_struct.predictions, axis=1)

In [35]:
# Class id -> label name (inverse of `mapper`).
remap_dict = {
    0: 'cultural exclusive',
    1: 'cultural representative',
    2: 'cultural agnostic'
}

# One row per test item: identifier, name and predicted class id.
results = pd.DataFrame({
    'item': test_df['item'],
    'name': test_df['name'],
    'predictions': predictions,
})
results['predictions'] = results['predictions'].map(remap_dict)
results.to_csv(dir_path + "RoBERTa_predictions.csv")

results.head(15)

Unnamed: 0,item,name,predictions
0,http://www.wikidata.org/entity/Q2427430,Northeast Flag Replacement,cultural exclusive
1,http://www.wikidata.org/entity/Q125482,imam,cultural representative
2,http://www.wikidata.org/entity/Q15789,FC Bayern Munich,cultural representative
3,http://www.wikidata.org/entity/Q582496,Fome Zero,cultural exclusive
4,http://www.wikidata.org/entity/Q572811,Anthony Award,cultural exclusive
5,http://www.wikidata.org/entity/Q1866547,Livraria Bertrand,cultural exclusive
6,http://www.wikidata.org/entity/Q19081,prokaryotes,cultural agnostic
7,http://www.wikidata.org/entity/Q474090,narrative poetry,cultural agnostic
8,http://www.wikidata.org/entity/Q1266300,Neue Slowenische Kunst,cultural exclusive
9,http://www.wikidata.org/entity/Q193654,short-track speed skating,cultural agnostic


---

## Best Model

After several experiments, observing the metrics on the development set, the best model is the **RoBERTa-base model**. Here the evaluation metrics on the validation set are presented, followed by the predictions on the first elements of the test set. The best model's predictions are saved to a CSV file, delivered as the final result.

In [36]:
# Saved checkpoint (step 782) loaded in place of a hub model name.
best_model_path = dir_path + "BestRes/results/checkpoint-782"

# Load the model, tokenizer and padding collator from the checkpoint directory.
model, tokenizer, data_coll = model_init(best_model_path)

# Tokenize both splits with the checkpoint's tokenizer.
tokenized_trainset = tokenization(train_df, tokenizer=tokenizer)
tokenized_devset =   tokenization(dev_df, tokenizer=tokenizer)

# Pair the encodings with their labels (isTest=False keeps the labels).
train_dataset = PLMDataset(tokenized_trainset, df=train_df, isTest=False)
val_dataset = PLMDataset(tokenized_devset, df=dev_df, isTest=False)

In [37]:
# This Trainer is only used for evaluate() and predict() in the cells below;
# the arguments mirror the training runs.
# NOTE(review): unlike the training cells, no output_dir is passed here.
training_args = TrainingArguments(
    # Logging; no external experiment tracker.
    logging_dir=dir_path + "logs",
    report_to="none",
    # Optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    # Log, evaluate and checkpoint on the same step interval.
    logging_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=WARMUP_STEPS,
    save_strategy="steps",
    save_steps=WARMUP_STEPS,
    save_only_model=True,
    # Best checkpoint = lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_coll,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 4 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

In [38]:
# Validation metrics of the loaded checkpoint; the next cell reads
# 'eval_accuracy' and 'eval_f1' from this dict.
results = trainer.evaluate()

In [39]:
# Pull the two headline metrics out of the evaluation dict.
accuracy, f1_score = results['eval_accuracy'], results['eval_f1']

# Report both with four decimals.
print("RoBERTa base score:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 score: {f1_score:.4f}")

RoBERTa base score:
Accuracy: 0.8100
F1 score: 0.8007


In [40]:
# Unlabeled test split: isTest=True builds the dataset without labels.
test_dataset = PLMDataset(tokenization(test_df, tokenizer=tokenizer), df=test_df, isTest=True)

In [41]:
# Predicted class id = index of the largest logit of each row.
preds_struct = trainer.predict(test_dataset=test_dataset)
predictions = np.argmax(preds_struct.predictions, axis=1)

The predicted label ids have been mapped to the corresponding cultural class names in order to provide a more readable table. This mapping can be easily removed.

In [42]:
# Class id -> label name (inverse of `mapper`).
remap_dict = {
    0: 'cultural exclusive',
    1: 'cultural representative',
    2: 'cultural agnostic'
}

# One row per test item: identifier, name and predicted class id.
results = pd.DataFrame({
    'item': test_df['item'],
    'name': test_df['name'],
    'predictions': predictions,
})
results['predictions'] = results['predictions'].map(remap_dict)

results.head(15)

Unnamed: 0,item,name,predictions
0,http://www.wikidata.org/entity/Q2427430,Northeast Flag Replacement,cultural exclusive
1,http://www.wikidata.org/entity/Q125482,imam,cultural representative
2,http://www.wikidata.org/entity/Q15789,FC Bayern Munich,cultural representative
3,http://www.wikidata.org/entity/Q582496,Fome Zero,cultural exclusive
4,http://www.wikidata.org/entity/Q572811,Anthony Award,cultural exclusive
5,http://www.wikidata.org/entity/Q1866547,Livraria Bertrand,cultural exclusive
6,http://www.wikidata.org/entity/Q19081,prokaryotes,cultural agnostic
7,http://www.wikidata.org/entity/Q474090,narrative poetry,cultural agnostic
8,http://www.wikidata.org/entity/Q1266300,Neue Slowenische Kunst,cultural exclusive
9,http://www.wikidata.org/entity/Q193654,short-track speed skating,cultural agnostic


In [None]:
# Write the best model's predictions to CSV.
# NOTE(review): same file name as the export in the earlier "Test phase" cell,
# so that earlier file is overwritten.
results.to_csv(dir_path + "RoBERTa_predictions.csv")