## Subtask 1 - Polarization detection

This is a binary classification task: determine whether a post contains polarized content (Polarized or Not Polarized).

## Imports

In [1]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [2]:
import random

import numpy as np

SEED = 42

# Seed every random number generator used in this notebook with the same value,
# in this order: Python, NumPy, PyTorch (CPU), PyTorch (current GPU),
# PyTorch (all GPUs, for multi-GPU machines).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# cuDNN settings for repeatable runs: turn off benchmark mode and
# request deterministic behavior.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
import wandb

# wandb logging is not wanted for this notebook, so initialise it in "disabled" mode.
WANDB_MODE = "disabled"
wandb.init(mode=WANDB_MODE)

In [4]:
# Input files for subtask 1: the `swa` training split and the file of
# back-translated examples used to augment it.
TRAIN_CSV = 'train/swa.csv'
BACKTRANSLATED_CSV = 'swa_backtranslated.csv'

train = pd.read_csv(TRAIN_CSV)
train_plus = pd.read_csv(BACKTRANSLATED_CSV)

# Preview the first rows of the training split.
train.head()

Unnamed: 0,id,text,polarization
0,swa_53de6a7a4d0123b5755da79d8d97a82f,uwizi rt kenyan rao akishinda nitachinja kuku ...,1
1,swa_ee2533cb334df97236ea2bcfda0d6823,wakikuyu ndio wako na manyumba za kukodeshwa t...,1
2,swa_1dd81b5985840a55b1ab292aa65d11a8,wakikuyu ni wezi power hungry and this time we...,1
3,swa_18589adc3945e20c5e5c61e10245fad1,wakikuyu sijui shida yenu ni nini kuogopa rail...,1
4,swa_aee76fc4cd1c6c6c09e19ba5ddd3901a,wakikuyu walisogwa hwakuumbwa,1


In [5]:
class PolarizationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label.

    Args:
        texts: Sequence of raw strings (list, tuple, pandas Series, ...).
            It is materialised into a list so items are always looked up by
            position, whatever index the input carried.
        labels: Sequence of class ids aligned with ``texts``. Each value is
            cast to ``int`` so that float-typed labels (e.g. ``1.0``) do not
            end up as a float label tensor after collation.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=max_length)`` and
            returning a mapping that supports item assignment.
        max_length: Truncation length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(texts)
        self.labels = [int(label) for label in labels]
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return the encoding with a ``'label'`` entry."""
        # No return_tensors / padding here: the data collator batches and pads.
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
        )
        # The key is 'label' (singular); the collator used with this dataset
        # is expected to handle it.
        encoding['label'] = self.labels[idx]
        return encoding

In [6]:
# Load the tokenizer.
# MODEL_NAME is the checkpoint identifier; it is also the name passed when the
# classification model is loaded, so tokenizer and model stay in sync.
# NOTE(review): the data files are named `swa`; the trailing comment keeps
# xlm-roberta-base as an alternative — confirm which checkpoint is intended.
MODEL_NAME = 'distilroberta-base' #  alternative: xlm-roberta-base

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
# Number of rows drawn at random from the original training split.
TRAIN_SAMPLE_SIZE = 5000
train_to_use = train.sample(n=TRAIN_SAMPLE_SIZE)

In [8]:
# Training texts and labels start out as the sampled original rows;
# back-translated rows are added in a later cell.
train_data = train_to_use['text'].tolist()
train_polarization = train_to_use['polarization'].tolist()

print("Total number of data before adding the back-translation : ", len(train_data))

Total number of data before adding the back-translation :  5000


In [9]:
# Number of rows drawn at random from the back-translated file.
BACKTRANSLATED_SAMPLE_SIZE = 1000
back_tranlated_sampled = train_plus.sample(n=BACKTRANSLATED_SAMPLE_SIZE)

In [10]:
# Sanity check: (rows, columns) of the back-translated sample.
back_tranlated_sampled.shape

(1000, 4)

In [11]:
# Split the back-translated sample: its last VAL_SIZE rows form the validation
# set, and every row before them is added to the training data.
# NOTE(review): validation examples come only from back-translated text. If the
# sentences they were derived from are also present in `train_to_use`, the
# validation score may be optimistic — confirm against how the file was built.
VAL_SIZE = 300

back_translated = back_tranlated_sampled['text'].tolist()
back_translated_labels = back_tranlated_sampled['polarization'].tolist()

add_to_train = back_translated[:-VAL_SIZE]
labels_train = back_translated_labels[:-VAL_SIZE]

val = back_translated[-VAL_SIZE:]
labels_val = back_translated_labels[-VAL_SIZE:]
labels_val_to_int = [int(elt) for elt in labels_val]

# Rebuild the training lists from their sources rather than extending them in
# place, so re-running this cell cannot append the back-translated rows twice.
train_data = train_to_use['text'].tolist() + add_to_train
train_polarization = train_to_use['polarization'].tolist() + labels_train

print("Length of the data after adding new dataset :", len(train_data) )

Length of the data after adding new dataset : 5700


In [12]:
# Create the datasets.
# Both use the same truncation length so training and validation inputs are
# tokenized identically. The validation set receives the integer-cast labels
# (`labels_val_to_int`), which were computed earlier but previously unused.
train_dataset = PolarizationDataset(train_data, train_polarization, tokenizer, max_length=128)
val_dataset = PolarizationDataset(val, labels_val_to_int, tokenizer, max_length=128)

## Loading the model

In [13]:
# Two output classes: Polarized and Not Polarized.
NUM_LABELS = 2

# Load the checkpoint with a sequence-classification head sized for NUM_LABELS.
# To load only from local files, add local_files_only=True (requires the model
# to be downloaded already).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Defining the training arguments

In [14]:
def compute_metrics(p):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max is
            taken along axis 1) and ``label_ids`` (reference labels).

    Returns:
        dict with a single key, ``'f1_macro'``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted_classes, average='macro')
    return {'f1_macro': macro_f1}

# Training configuration, grouped by purpose.
training_args = TrainingArguments(
    # Where the Trainer writes its files (current directory).
    output_dir="./",
    # Optimisation.
    learning_rate=1e-5,
    num_train_epochs=4,
    # Batch sizes per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    # Evaluate at the end of every epoch; do not save checkpoints.
    eval_strategy="epoch",
    save_strategy="no",
    # Progress reporting.
    logging_steps=100,
    disable_tqdm=False,
)


In [15]:
# Initialize the Trainer with the model, datasets, metric function and a
# collator that pads each batch dynamically.
trainer = Trainer(
    model=model,                         # the instantiated Hugging Face Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

# Train the model
trainer.train()

# Evaluate the trained model on the validation set (eval_dataset above) and
# report the metric returned by compute_metrics, which the results dict
# exposes under the 'eval_' prefix.
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

Epoch,Training Loss,Validation Loss,F1 Macro
1,0.6467,0.579089,0.709739
2,0.5241,0.531535,0.72607
3,0.4564,0.462634,0.769875
4,0.4315,0.450419,0.779755


Macro F1 score on validation set: 0.7797552836484983


In [16]:
# Predict on the dev split. The file must provide 'id' and 'text' columns;
# no label column is read here.
dev = pd.read_csv('dev/swa.csv')
dev_original = dev.copy()  # untouched copy used to build the submission frame

# Truncate exactly as during training (PolarizationDataset uses max_length=128)
# so the model sees inputs of the same form at inference time.
DEV_MAX_LENGTH = 128

dev_texts = dev['text'].tolist()
# No padding here: the Trainer's DataCollatorWithPadding pads each batch to its
# own longest sequence instead of padding the whole split to one global length.
dev_encodings = tokenizer(dev_texts, truncation=True, max_length=DEV_MAX_LENGTH)


class CustomDataset(torch.utils.data.Dataset):
    """Label-free dataset over pre-tokenized encodings.

    Args:
        encodings: Mapping from feature name (e.g. ``'input_ids'``) to a list
            holding one entry per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as plain (unpadded) lists."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples."""
        return len(self.encodings['input_ids'])


dev_dataset = CustomDataset(dev_encodings)

# Make predictions
predictions = trainer.predict(dev_dataset)

# Predicted class = index of the highest score in each row
predicted_labels = np.argmax(predictions.predictions, axis=1)
assert len(predicted_labels) == len(dev_original), "expected one prediction per dev row"

# Submission frame: original ids paired with the predicted class
results = pd.DataFrame({
    'id': dev_original['id'],
    'polarization': predicted_labels
})

# Display first few rows
print(results.head())

# Save the predictions
results.to_csv('pred_swa.csv', index=False)

                                     id  polarization
0  swa_a5748df181277341143f7da4175add4a             1
1  swa_2df0d42f9b49ea2e4fb006b2e6604e6d             1
2  swa_3718757514005767302b7220b08e409d             1
3  swa_9fa3337a35cce723d60c06056d422330             1
4  swa_5c39ac8ef70345e9e3c21a47f8769bc0             1
