## Subtask 1 - Polarization detection

This is a binary classification to determine whether a post contains polarized content (Polarized or Not Polarized).

## Imports

In [1]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [2]:
import numpy as np
import random

SEED = 42

# Python
random.seed(SEED)

# Numpy
np.random.seed(SEED)

# PyTorch (CPU)
torch.manual_seed(SEED)

# PyTorch (GPU)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # if you have multiple GPUs

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
import wandb

# Disable wandb logging for this script
wandb.init(mode="disabled")

## Data Import

The training data consists of a short text and binary labels

The data is structured as a CSV file with the following fields:
- id: a unique identifier for the sample
- text: a sentence or short text
- polarization:  1 text is polarized, 0 text is not polarized

The data is in all three subtask folders the same but only containing the labels for the specific task.

In [4]:
# Load the training and validation data for subtask 1

train = pd.read_csv('train/eng.csv')
train_plus = pd.read_csv('eng_BT_full.csv')

# dev = pd.read_csv('dev/swa.csv')
# val = pd.read_csv('train/swa.csv')
train.head()

Unnamed: 0,id,text,polarization
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0


# Dataset
-  Create a pytorch class for handling data
-  Wrapping the raw texts and labels into a format that Huggingfaceâ€™s Trainer can use for training and evaluation

In [5]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    self.labels=labels
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]

    # Remove return_tensors='pt' - let DataCollator handle it
    encoding = self.tokenizer(
        text,
        truncation=True,
        max_length=self.max_length
    )

    encoding['label'] = label  # Use 'label' not 'labels'
    return encoding

Now, we'll tokenize the text data and create the datasets using `bert-base-uncased` as the tokenizer.

In [6]:
# Load the tokenizer
MODEL_NAME = 'distilroberta-base' #  xlm-roberta-base

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Beginning the data processing through Back-translation

In [7]:
train_data = []
train_polarization = []


train_data.extend(train['text'].tolist())
train_polarization.extend(train['polarization'].tolist())

print("Total number of data before adding the back-translation : ", len(train_data))

Total number of data before adding the back-translation :  3222


## Handling the translated data
All the subtasks use `eng_BT_full.csv`. We process it conveniently for each subtask.

In [10]:
train_plus.drop(columns=['text', 'political', 'racial/ethnic', 'religious', 'gender/sexual', 'other'])

train_plus['polarization'] = train['polarization']
train_plus['text'] = train_plus['back_translated']
train_plus.drop(columns=['back_translated'])

back_tranlated_sampled = train_plus.sample(n=1000)

In [11]:
back_tranlated_sampled.shape

(1000, 9)

In [12]:
back_translated = back_tranlated_sampled['text'].tolist()
back_translated_labels = back_tranlated_sampled['polarization'].tolist()

add_to_train = back_translated[:-300]
labels_train = back_translated_labels[:-300]

val = back_translated[-300:]
labels_val = back_translated_labels[-300:]
labels_val_to_int = [int(elt) for elt in labels_val]

# Adding the new dataset into the training set
# train_data.extend(add_to_train)
# train_polarization.extend(labels_train)

print("Length of the data after adding new dataset :", len(train_data) )

Length of the data after adding new dataset : 3222


In [13]:
# Create datasets

train_dataset = PolarizationDataset(train_data, train_polarization, tokenizer)
val_dataset = PolarizationDataset(val, labels_val, tokenizer)

## Loading the model

In [14]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    ignore_mismatched_sizes=True,
    # local_files_only=True # Only if the model is dowmloaded locally
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Defining training arugments

In [15]:
# Define metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
        output_dir=f"./",
        num_train_epochs=4,
        learning_rate=1e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        disable_tqdm=False
    )


Finally, we'll initialize the `Trainer` and start training.

In [16]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

Epoch,Training Loss,Validation Loss,F1 Macro
1,0.5651,0.467124,0.724265
2,0.432,0.400196,0.797175
3,0.381,0.402797,0.796196
4,0.3537,0.38395,0.810152


Macro F1 score on validation set: 0.8101524984179375


In [None]:
eval_results

## Process the output dataset into a .csv

In [17]:
# Assuming your dev dataset has columns like 'sentence' and maybe 'label'
# First, keep the original dev dataframe intact
dev = pd.read_csv('dev/eng.csv')
dev_original = dev.copy()

# Tokenize the sentences
dev_texts = dev['text'].tolist()  # This creates a list of texts
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=512)

# Create a dataset object for dev (without labels)
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

dev_dataset = CustomDataset(dev_encodings)

# Make predictions
predictions = trainer.predict(dev_dataset)

# Extract predicted labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Create the final dataframe with idx, sentence, prediction
results = pd.DataFrame({
    'id': dev_original['id'],  # Get 'id' from the original dataframe, not the list
    'polarization': predicted_labels
})

# Display first few rows
print(results.head())

# Save to CSV if needed
results.to_csv('pred_eng.csv', index=False)

                                     id  polarization
0  eng_f66ca14d60851371f9720aaf4ccd9b58             0
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb             0
2  eng_95770ff547ea5e48b0be00f385986483             0
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf             0
4  eng_07781aa88e61e7c0a996abd1e5ea3a20             0
