In [None]:
# Shell escape: print the NVIDIA GPU status for this runtime.
!nvidia-smi

# Bert baseline for POLAR

## Introduction

In this part of the starter notebook, we will take you through the process of all three Subtasks.

## Subtask 1 - Polarization detection

This is a binary classification to determine whether a post contains polarized content (Polarized or Not Polarized).

In [None]:
# Colab-only: mount Google Drive at /content/drive so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the project folder on the mounted Drive; the relative CSV paths
# used in the cells below are resolved against this directory.
%cd /content/drive/MyDrive/NLP

# Setup commands kept for reference (currently disabled):
# ! pip install optuna
# !unzip dev_phase.zip

/content/drive/MyDrive/NLP


## Imports

In [None]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
import os
import matplotlib.pyplot as plt

In [None]:
# !pip install wandb



In [None]:
import wandb

# Initialise wandb in "disabled" mode so that no run is logged for this notebook
wandb.init(mode="disabled")

  | |_| | '_ \/ _` / _` |  _/ -_)


## Data Import

The training data consists of a short text and binary labels

The data is structured as a CSV file with the following fields:
- id: a unique identifier for the sample
- text: a sentence or short text
- polarization: 1 if the text is polarized, 0 if it is not

The text data is the same in all three subtask folders; each folder contains only the labels for its own subtask.

In [None]:
# Load the labelled training data and the dev data (used as the test set) for subtask 1
train_full = pd.read_csv('subtask1/train/swa.csv')
# train_full = pd.read_csv('amh.csv')

test = pd.read_csv('subtask1/dev/swa.csv')
# test = pd.read_csv('amh.csv')

# Hold out 20% of the labelled data for validation.
# `stratify` keeps the polarized / not-polarized ratio equal in both parts;
# `random_state` is what makes the split identical on every run.
train, val = train_test_split(
    train_full,
    test_size=0.2,
    stratify=train_full['polarization'],
    random_state=42,
)

train.head()

Unnamed: 0,id,text,polarization
4924,swa_4dbbc16b7dae9a56d93edb1c0c3d6ab0,nilifikiria kwamba ikiwa majembe la kutombwa p...,1
176,swa_fafbf63b41ba35b340cbe73ec0bdb303,why do other kenyans hateon kikuyus we are all...,1
2246,swa_7f781ed0b362f427a1b7ed3ba5f52a08,watu wa muislamu Nyakundi wamejaa kwa menshens...,1
4082,swa_1654f1c787372c3445fe707f24149785,free and fair elect zikiwa held wanjigi ata ku...,0
5986,swa_5284848dcbb2d6b0774923f0a4038c38,kukutana juu ili niweze molly whoop yo bitch k...,0


# Dataset
-  Create a pytorch class for handling data
-  Wrap the raw texts and labels in a format that Hugging Face's Trainer can use for training and evaluation

In [None]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    """Labelled dataset for single-label (binary) polarization classification.

    Each item is tokenized on access, without padding, so that a padding
    collator can pad every batch dynamically.

    Args:
        texts: sequence of raw texts, indexable by integer position.
        labels: sequence of integer class ids, aligned with ``texts``.
        tokenizer: callable tokenizer returning a mapping of ``(1, seq_len)`` tensors
            when called with ``return_tensors='pt'``.
        max_length: truncation length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``labels`` tensor (torch.long)."""
        raw_text = self.texts[idx]
        # Missing values (NaN / None) become an empty string and any other
        # non-string value is stringified, so the tokenizer always receives a str.
        text = str(raw_text) if pd.notna(raw_text) else ""
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove the
        # sequence axis when the encoding is one token long, yielding a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


class PolarizationTestDataset(torch.utils.data.Dataset):
    """Unlabelled dataset used for inference: yields tokenizer outputs only.

    Args:
        texts: sequence of raw texts, indexable by integer position.
        tokenizer: callable tokenizer returning a mapping of ``(1, seq_len)`` tensors
            when called with ``return_tensors='pt'``.
        max_length: truncation length passed to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` with the batch axis removed."""
        raw_text = self.texts[idx]
        # Missing values (NaN / None) become an empty string and any other
        # non-string value is stringified, so the tokenizer always receives a str.
        text = str(raw_text) if pd.notna(raw_text) else ""
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a one-token sequence into a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        return item

Now, we'll tokenize the text data and create the datasets using the `FacebookAI/xlm-roberta-base` tokenizer.

In [None]:
# Tokenizer for the same checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/xlm-roberta-base')

# Wrap each split; the labelled splits use the `polarization` column as target
train_dataset = PolarizationDataset(
    texts=train['text'].tolist(),
    labels=train['polarization'].tolist(),
    tokenizer=tokenizer,
)
val_dataset = PolarizationDataset(
    texts=val['text'].tolist(),
    labels=val['polarization'].tolist(),
    tokenizer=tokenizer,
)
test_dataset = PolarizationTestDataset(
    texts=test['text'].tolist(),
    tokenizer=tokenizer,
)

Next, we'll load the pre-trained `FacebookAI/xlm-roberta-base` model for sequence classification. Since this is a binary classification task (Polarized/Not Polarized), we set `num_labels=2`.

In [None]:
# Sequence-classification model with a two-way output head (num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(
    'FacebookAI/xlm-roberta-base',
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now, we'll define the training arguments and the evaluation metric. We'll use macro F1 score for evaluation.

In [None]:
# Define metrics function
def compute_metrics(p):
    """Score an evaluation pass with macro-averaged F1.

    Reads ``p.predictions`` (one score per class for each row) and
    ``p.label_ids`` (the gold class ids) and returns ``{'f1_macro': value}``.
    """
    gold = p.label_ids
    # Highest-scoring class per row is the predicted label
    predicted = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(gold, predicted, average='macro')
    return {'f1_macro': macro_f1}

# Define training arguments
training_args = TrainingArguments(
        output_dir="/content/output",
        num_train_epochs=10,
        learning_rate=3e-5,
        lr_scheduler_type="linear",
        per_device_train_batch_size=64,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Log once per epoch. The previous step-based interval (1000 steps) was
        # larger than the whole run, so the training loss column only ever
        # showed "No log".
        logging_strategy="epoch",
        disable_tqdm=False,
        # Keep the checkpoint with the best validation macro-F1 and reload it
        # when training finishes.
        metric_for_best_model="f1_macro",
        load_best_model_at_end=True,
        greater_is_better=True,
        save_total_limit=1,
        optim="adamw_torch" # Explicitly specify the PyTorch AdamW optimizer
    )

Finally, we'll initialize the `Trainer` and start training.

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Hugging Face Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

# Train the model
trainer.train()

# Snapshot the training logs BEFORE the final evaluate() call: evaluate() appends
# one more "eval_*" entry to log_history (the reloaded best checkpoint, stamped
# with the last epoch), which would otherwise appear as a spurious extra point
# at the end of the per-epoch curves below.
train_logs = list(trainer.state.log_history)

# Evaluate the (best) model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

# Create plots folder if it doesn't exist
plot_dir = "plots"
os.makedirs(plot_dir, exist_ok=True)

# Collect validation loss and f1-macro, one entry per evaluation epoch
eval_steps = []
eval_loss = []
eval_f1 = []

for entry in train_logs:
    if "eval_loss" in entry:
        eval_steps.append(entry.get("epoch", len(eval_steps)+1))
        eval_loss.append(entry["eval_loss"])
        eval_f1.append(entry["eval_f1_macro"])

# --- Plot Validation Loss ---
plt.figure()
plt.plot(eval_steps, eval_loss, marker='o')
plt.xlabel("Epoch")
plt.ylabel("Validation Loss")
plt.title("Validation Loss per Epoch for Swa")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{plot_dir}/validation_loss_swa.pdf")
plt.close()

# --- Plot F1-Macro ---
plt.figure()
plt.plot(eval_steps, eval_f1, marker='o')
plt.xlabel("Epoch")
plt.ylabel("F1-Macro")
plt.title("F1-Macro per Epoch for Swa")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{plot_dir}/f1_macro_swa.pdf")
plt.close()

# Report the file names that were actually written above
print("Saved validation_loss_swa.pdf and f1_macro_swa.pdf in 'plots/' folder.")


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.53669,0.751247
2,No log,0.458667,0.777107
3,No log,0.464277,0.78699
4,No log,0.492475,0.757082
5,No log,0.478044,0.801081
6,No log,0.515939,0.785205
7,No log,0.604954,0.765335
8,No log,0.635666,0.744161
9,No log,0.64043,0.766193
10,No log,0.635782,0.776638


Macro F1 score on validation set: 0.8010808234143627
Saved validation_loss.pdf and f1_macro.pdf in 'plots/' folder.


In [None]:
# Save the Trainer's model to the 'yori_yori_swa' directory (relative to the working directory)
trainer.save_model('yori_yori_swa')

In [None]:
import os
import zipfile

# Predict on the test set and keep the highest-scoring class for each row
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Create a new dataframe with only id and predicted polarization
results_df = pd.DataFrame({
    'id': test['id'],
    'polarization': predicted_labels
})

# Create the folder if it doesn't exist
os.makedirs('subtask_1', exist_ok=True)

# Save to CSV in the folder
csv_path = 'subtask_1/pred_swa.csv'
results_df.to_csv(csv_path, index=False)

print(f"Saved predictions to {csv_path}")
print(results_df.head())

# Compress the folder
zip_filename = 'subtask_1.zip'
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the folder and add all files
    for root, dirs, files in os.walk('subtask_1'):
        for file in files:
            file_path = os.path.join(root, file)
            # file_path is already relative to the working directory and starts
            # with 'subtask_1/', so using it as the archive name keeps the folder
            # layout (including nested sub-folders) instead of flattening it.
            zipf.write(file_path, arcname=file_path)

print(f"Created compressed file: {zip_filename}")

Saved predictions to subtask_1/pred_swa.csv
                                     id  polarization
0  swa_a5748df181277341143f7da4175add4a             1
1  swa_2df0d42f9b49ea2e4fb006b2e6604e6d             1
2  swa_3718757514005767302b7220b08e409d             1
3  swa_9fa3337a35cce723d60c06056d422330             1
4  swa_5c39ac8ef70345e9e3c21a47f8769bc0             1
Created compressed file: subtask_1.zip


# Subtask 2: Polarization Type Classification
Multi-label classification to identify the target of polarization as one of the following categories: Gender/Sexual, Political, Religious, Racial/Ethnic, or Other.
For this task we will load the data for subtask 2.

In [None]:
# Load the labelled training data and the dev data (used as the test set) for subtask 2
train_full = pd.read_csv('subtask2/train/swa.csv')
# train_full = pd.read_csv('amh.csv')

test = pd.read_csv('subtask2/dev/swa.csv')
# test = pd.read_csv('amh.csv')

# Split the labelled data into train (80%) and validation (20%); fixed seed for reproducibility
train, val = train_test_split(train_full, test_size=0.2, random_state=42)

train.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
3875,swa_1e08a561fc1d7881fffb4822639b357f,kenya iuzw mtu apew share yake ama waturudishi...,0,0,0,0,0
1768,swa_b436d75df7d5cc4847f2c973740e49d0,bana rt mamzee especially hawa hapa twitter rt...,0,1,0,0,0
3250,swa_71b7ef26027e30d7f0e3025b5afb45ff,uliwaibia nani hao majembe la kutombwa pussy m...,0,0,0,0,0
6574,swa_fb121178c35ba35b1fafc735704824fa,Woi Museveni will kill us hio kamba hata ya ba...,0,1,0,0,1
2815,swa_1352a5d0c27ff8f79314abf4597a5046,mt hulipwa na supremacist mweupe kila wakati h...,0,0,0,0,0


In [None]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    """Labelled dataset for multi-label classification.

    Each item is tokenized on access, without padding, so that a padding
    collator can pad every batch dynamically.

    Args:
        texts: sequence of raw texts, indexable by integer position.
        labels: sequence of per-row label vectors (one 0/1 entry per label),
            aligned with ``texts``.
        tokenizer: callable tokenizer returning a mapping of ``(1, seq_len)`` tensors
            when called with ``return_tensors='pt'``.
        max_length: truncation length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a float ``labels`` vector."""
        raw_text = self.texts[idx]
        # Missing values (NaN / None) become an empty string and any other
        # non-string value is stringified, so the tokenizer always receives a str.
        text = str(raw_text) if pd.notna(raw_text) else ""
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove the
        # sequence axis when the encoding is one token long, yielding a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # Multi-label targets are stored as floats (not class ids)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
class PolarizationTestDataset(torch.utils.data.Dataset):
    """Inference-time dataset: tokenizes one text per item and returns no labels."""

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` with the batch axis removed."""
        raw = self.texts[idx]
        # Missing values are tokenized as the empty string; everything else is stringified
        if pd.notna(raw):
            text = str(raw)
        else:
            text = ""

        tokenizer_kwargs = dict(
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoding = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer returns (1, seq_len) tensors; strip the leading axis only
        return {name: tensor.squeeze(0) for name, tensor in encoding.items()}

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('Davlan/afro-xlmr-large')

# Single definition of the label columns. Their order here is the order of the
# label vector given to the model, so output i of the model corresponds to label_cols[i].
label_cols = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']

# Create train, validation and test datasets for the multi-label task
train_dataset = PolarizationDataset(train['text'].tolist(), train[label_cols].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[label_cols].values.tolist(), tokenizer)
# dev_dataset is built from the same validation split as val_dataset
dev_dataset = PolarizationDataset(val['text'].tolist(), val[label_cols].values.tolist(), tokenizer)
test_dataset = PolarizationTestDataset(test['text'].tolist(), tokenizer)

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Multi-label sequence-classification model with one output per label (5 labels)
model = AutoModelForSequenceClassification.from_pretrained(
    'Davlan/afro-xlmr-large',
    num_labels=5,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics_multilabel(p):
    """Score a multi-label evaluation pass with macro-averaged F1.

    ``p.predictions`` holds one logit per label for each row and
    ``p.label_ids`` the gold 0/1 label matrix. A label counts as predicted
    when its sigmoid probability is above 0.4.
    """
    logits = torch.from_numpy(p.predictions)

    # logits -> probabilities -> 0/1 decisions
    predicted = (torch.sigmoid(logits).numpy() > 0.4).astype(int)

    return {'f1_macro': f1_score(p.label_ids, predicted, average='macro')}



# -------------------------------------------------
# 3) TRAINING ARGUMENTS (Stable + Prevent Overfitting)
# -------------------------------------------------
training_args = TrainingArguments(
    # Own checkpoint folder for this subtask, so checkpoint rotation never
    # touches (or is confused by) checkpoints written by the other runs.
    output_dir="/content/output/subtask2",
    num_train_epochs=5,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    # The large checkpoint ran out of GPU memory at batch size 16.
    # 8 x 2 accumulation steps keeps the effective batch size at 16, and
    # gradient checkpointing trades extra compute for lower activation memory.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Limit the number of checkpoints kept on disk (the best one is retained)
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=100,
    disable_tqdm=False
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# --- Train the model ---
trainer.train()

# --- Snapshot the training logs BEFORE the final evaluate() call ---
# evaluate() appends one more "eval_*" entry to log_history (the reloaded best
# checkpoint, stamped with the last epoch), which would otherwise appear as a
# spurious extra point at the end of the per-epoch curves below.
train_logs = list(trainer.state.log_history)

# --- Evaluate the (best) model on the validation set ---
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

# --- Create plots folder ---
plot_dir = "plots"
os.makedirs(plot_dir, exist_ok=True)

# Collect validation loss and F1-Macro, one entry per evaluation epoch
eval_steps = []
eval_loss = []
eval_f1 = []

for entry in train_logs:
    if "eval_loss" in entry:
        eval_steps.append(entry.get("epoch", len(eval_steps)+1))
        eval_loss.append(entry["eval_loss"])
        eval_f1.append(entry["eval_f1_macro"])

# --- Plot Validation Loss ---
plt.figure()
plt.plot(eval_steps, eval_loss, marker='o', color='blue')
plt.xlabel("Epoch")
plt.ylabel("Validation Loss")
plt.title("Validation Loss per Epoch for Subtask 2")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{plot_dir}/validation_loss_subtask2_swa.pdf")
plt.close()

# --- Plot F1-Macro ---
plt.figure()
plt.plot(eval_steps, eval_f1, marker='o', color='green')
plt.xlabel("Epoch")
plt.ylabel("F1-Macro")
plt.title("F1-Macro per Epoch for Subtask 2")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{plot_dir}/f1_macro_subtask2_swa.pdf")
plt.close()

print("Saved validation_loss_subtask2_swa.pdf and f1_macro_subtask2_swa.pdf in 'plots/' folder.")


A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 402.12 MiB is free. Process 25070 has 14.35 GiB memory in use. Of the allocated memory 11.94 GiB is allocated by PyTorch, and 2.27 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the fine-tuned subtask-2 model. The suffix names the language of the
# training data loaded for this subtask (swa), not a left-over from another run.
trainer.save_model('subtask_2.swa')

KeyboardInterrupt: 

In [None]:
# Label columns, in the order of the model's outputs
label_cols = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
banner = "="*50

# --- Validation metrics ---
print("\n" + banner)
print("Validation Results:")
print(banner)
eval_results = trainer.evaluate()
for metric_name, metric_value in eval_results.items():
    if not metric_name.startswith('eval_'):
        continue
    print(f"{metric_name}: {metric_value:.4f}")

# --- Test-set predictions (the test set has no labels) ---
print("\n" + banner)
print("Generating predictions on test set...")
print(banner)
test_predictions = trainer.predict(test_dataset)
test_logits = torch.from_numpy(test_predictions.predictions)
test_probs = torch.sigmoid(test_logits).numpy()
# A label is predicted when its probability exceeds 0.4
test_preds = (test_probs > 0.4).astype(int)

# Copy of the test frame with the predicted label columns filled in
submission = test.copy()
submission[label_cols] = test_preds

print(f"\nPredictions shape: {test_preds.shape}")
print(f"Sample predictions:\n{submission[['text'] + label_cols].head()}")


Validation Results:


eval_loss: 0.2258
eval_f1_macro: 0.2286
eval_runtime: 2.9685
eval_samples_per_second: 217.2820
eval_steps_per_second: 13.8120

Generating predictions on test set...

Predictions shape: (160, 5)
Sample predictions:
                                                text  gender/sexual  \
0                   God is with Ukraine and Zelensky              0   
1  4 Dems, 2 Republicans Luzerne County Council s...              0   
2  Abuse Survivor Recounts Her Struggles at YWCA ...              0   
3    After Rwanda, another deportation camp disaster              0   
4  Another plea in Trump election interference probe              0   

   political  religious  racial/ethnic  other  
0          0          0              0      0  
1          0          0              0      0  
2          0          0              0      0  
3          0          0              0      0  
4          0          0              0      0  


In [None]:
import os
import zipfile

# Order of the label vector the model was trained on (see the dataset-creation
# cell): output column i of the model is model_label_order[i].
model_label_order = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
# Column order written to the submission file
submission_cols = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other']
# Language of the data loaded for this subtask; used in the output file names
lang_code = 'swa'

predictions = trainer.predict(test_dataset)

# Multi-label: turn logits into per-label probabilities with a sigmoid
predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions))

# Apply the 0.4 threshold (same value as the validation metric) to get 0/1 predictions
predicted_labels = (predicted_probs > 0.4).int().numpy()

# Pick each submission column by label NAME, not by position: the submission
# order differs from the training order, so positional indexing would write
# e.g. the gender/sexual predictions into the 'political' column.
column_of = {name: i for i, name in enumerate(model_label_order)}
results_df = pd.DataFrame({
    'id': test['id'],
    **{name: predicted_labels[:, column_of[name]] for name in submission_cols},
})

# Create the folder if it doesn't exist
os.makedirs('subtask_2', exist_ok=True)

# Save to CSV in the folder
csv_path = f'subtask_2/pred_{lang_code}.csv'
results_df.to_csv(csv_path, index=False)

print(f"Saved predictions to {csv_path}")
print(results_df.head())

# Compress the folder
zip_filename = f'subtask_2_{lang_code}.zip'
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the folder and add all files
    for root, dirs, files in os.walk('subtask_2'):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to zip with relative path
            arcname = os.path.join(os.path.basename(root), file)
            zipf.write(file_path, arcname)

print(f"Created compressed file: {zip_filename}")


Saved predictions to subtask_2/pred_eng.csv
                                     id  political  racial/ethnic  religious  \
0  eng_f66ca14d60851371f9720aaf4ccd9b58          0              0          0   
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb          0              0          0   
2  eng_95770ff547ea5e48b0be00f385986483          0              0          0   
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf          0              0          0   
4  eng_07781aa88e61e7c0a996abd1e5ea3a20          0              0          0   

   gender/sexual  other  
0              0      0  
1              0      0  
2              0      0  
3              0      0  
4              0      0  
Created compressed file: subtask_2_eng.zip


# Subtask 3: Manifestation Identification
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



In [None]:
# Load the labelled training data and the dev data (used as the test set) for subtask 3
train_full = pd.read_csv('subtask3/train/eng.csv')

test = pd.read_csv('subtask3/dev/eng.csv')
# test = pd.read_csv('amh.csv')

# Split the labelled data into train (80%) and validation (20%); fixed seed for reproducibility
train, val = train_test_split(train_full, test_size=0.2, random_state=42)

train.head()

Unnamed: 0,id,text,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation
3000,eng_427cea503f2e3a2b4608d26fa87a55f0,The IDF needs some B52s.,0,0,0,0,0,0
366,eng_98eb4278a5fb9f249149a899b8f2c4e7,Fascinating life journey leads me to ANRAurora...,0,0,0,0,0,0
1965,eng_0d9c500e39edc99af8c61c45db8825a2,Lazy woke excuse to justify sjw practices,0,1,0,0,0,0
29,eng_69ebe50510087fb8c06cd12283a88ef8,5 takeaways on Republicans first impeachment h...,0,0,0,0,0,0
2689,eng_787ec0b4ddda46bbb5a0bf507a31d484,And yet Dems are supposed to be the breeders o...,1,1,0,1,0,0


In [None]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    """Labelled dataset for multi-label classification.

    Each item is tokenized on access, without padding, so that a padding
    collator can pad every batch dynamically.

    Args:
        texts: sequence of raw texts, indexable by integer position.
        labels: sequence of per-row label vectors (one 0/1 entry per label),
            aligned with ``texts``.
        tokenizer: callable tokenizer returning a mapping of ``(1, seq_len)`` tensors
            when called with ``return_tensors='pt'``.
        max_length: truncation length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a float ``labels`` vector."""
        raw_text = self.texts[idx]
        # Missing values (NaN / None) become an empty string and any other
        # non-string value is stringified, so the tokenizer always receives a str.
        text = str(raw_text) if pd.notna(raw_text) else ""
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove the
        # sequence axis when the encoding is one token long, yielding a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # Multi-label targets are stored as floats (not class ids)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

class PolarizationTestDataset(torch.utils.data.Dataset):
    """Inference-time dataset: tokenizes one text per item and returns no labels."""

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` with the batch axis removed."""
        raw = self.texts[idx]
        # Missing values are tokenized as the empty string; everything else is stringified
        if pd.notna(raw):
            text = str(raw)
        else:
            text = ""

        tokenizer_kwargs = dict(
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoding = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer returns (1, seq_len) tensors; strip the leading axis only
        return {name: tensor.squeeze(0) for name, tensor in encoding.items()}

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-hate-latest')

# Single definition of the label columns. Their order here is the order of the
# label vector given to the model, so output i of the model corresponds to
# manifestation_cols[i].
manifestation_cols = [
    'vilification',
    'extreme_language',
    'stereotype',
    'invalidation',
    'lack_of_empathy',
    'dehumanization',
]

# Create train, validation and test datasets for the multi-label task
train_dataset = PolarizationDataset(train['text'].tolist(), train[manifestation_cols].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[manifestation_cols].values.tolist(), tokenizer)
test_dataset = PolarizationTestDataset(test['text'].tolist(), tokenizer)

tokenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Multi-label sequence-classification model with one output per label (6 labels).
# ignore_mismatched_sizes lets the checkpoint's existing head be replaced by a
# freshly initialised 6-way head.
model = AutoModelForSequenceClassification.from_pretrained(
    'cardiffnlp/twitter-roberta-base-hate-latest',
    num_labels=6,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define training arguments
training_args = TrainingArguments(
    # Own checkpoint folder for this subtask, so checkpoint rotation never
    # touches (or is confused by) checkpoints written by the other runs.
    output_dir="/content/output/subtask3",
    num_train_epochs=5,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Limit the number of checkpoints kept on disk (the best one is retained)
    save_total_limit=1,
    # Reload the checkpoint with the best validation macro-F1 after training
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=100,
    disable_tqdm=False
)

# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    """Score a multi-label evaluation pass with macro-averaged F1.

    ``p.predictions`` holds one logit per label for each row and
    ``p.label_ids`` the gold 0/1 label matrix. A label counts as predicted
    when its sigmoid probability is above 0.5.
    """
    logits = torch.from_numpy(p.predictions)
    # logits -> probabilities -> 0/1 decisions
    predicted = (torch.sigmoid(logits) > 0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, predicted, average='macro')
    return {'f1_macro': macro_f1}

In [None]:
# Build the Trainer for subtask 3: dynamic padding per batch, macro-F1 as metric
padding_collator = DataCollatorWithPadding(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,
)

# Fine-tune
trainer.train()

# Score the resulting model on the validation split
eval_results = trainer.evaluate()
val_macro_f1 = eval_results['eval_f1_macro']
print(f"Macro F1 score on validation set for Subtask 3: {val_macro_f1}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.4229,0.349565,0.172188
2,0.3263,0.33739,0.321641
3,0.2968,0.367246,0.355992
4,0.2423,0.366762,0.410967
5,0.2154,0.380164,0.397768


Macro F1 score on validation set for Subtask 3: 0.4109668239091586


In [None]:
import os
import zipfile

predictions = trainer.predict(test_dataset)

# Multi-label: turn logits into per-label probabilities with a sigmoid
predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions))

# Apply the 0.3 threshold to get 0/1 predictions.
# NOTE(review): validation F1 and best-checkpoint selection use 0.5
# (compute_metrics_multilabel) - confirm that 0.3 is intended here.
predicted_labels = (predicted_probs > 0.3).int().numpy()

# The 6 output columns follow the label order used when building the training
# dataset: vilification, extreme_language, stereotype, invalidation,
# lack_of_empathy, dehumanization.
results_df = pd.DataFrame({
    'id': test['id'],
    'vilification': predicted_labels[:, 0],
    'extreme_language': predicted_labels[:, 1],
    'stereotype': predicted_labels[:, 2],
    'invalidation': predicted_labels[:, 3],
    'lack_of_empathy': predicted_labels[:, 4],
    'dehumanization': predicted_labels[:, 5]
})

# Create the folder if it doesn't exist
os.makedirs('subtask_3', exist_ok=True)

# Save to CSV in the folder
csv_path = 'subtask_3/pred_eng.csv'
results_df.to_csv(csv_path, index=False)

print(f"Saved predictions to {csv_path}")
print(results_df.head())

# Compress the folder
zip_filename = 'subtask_3.zip'
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the subtask 3 folder (previously this walked 'subtask_2',
    # so the archive contained the wrong subtask's predictions)
    for root, dirs, files in os.walk('subtask_3'):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to zip with relative path
            arcname = os.path.join(os.path.basename(root), file)
            zipf.write(file_path, arcname)

print(f"Created compressed file: {zip_filename}")


Saved predictions to subtask_3/pred_eng.csv
                                     id  vilification  extreme_language  \
0  eng_f66ca14d60851371f9720aaf4ccd9b58             0                 0   
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb             0                 0   
2  eng_95770ff547ea5e48b0be00f385986483             0                 0   
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf             1                 1   
4  eng_07781aa88e61e7c0a996abd1e5ea3a20             0                 0   

   stereotype  invalidation  lack_of_empathy  dehumanization  
0           0             0                0               0  
1           0             0                0               0  
2           0             0                0               0  
3           0             1                0               0  
4           0             0                0               0  
Created compressed file: subtask_3.zip
