# Bert baseline for POLAR

## Introduction

In this part of the starter notebook, we will take you through the process of all three Subtasks.

## Subtask 1 - Polarization detection

This is a binary classification to determine whether a post contains polarized content (Polarized or Not Polarized).

In [1]:
# Install gdown if needed
# Replace the ID below with your actual file ID from the Drive link
# (The ID is the long string of random characters in the URL)
file_id = '1Cvdkk_AZQzM5rJYhV4Nq-8bXvJrF8t4z'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'dev_phase.zip'

!gdown {url} -O {output}

!unzip {output}

# Delete __MACOSX directory (if exists) and the dev_phase.zip file (cleanup)
import os
import shutil

if os.path.exists("__MACOSX"):
    shutil.rmtree("__MACOSX")

if os.path.exists("dev_phase.zip"):
    os.remove("dev_phase.zip")

Downloading...
From: https://drive.google.com/uc?id=1Cvdkk_AZQzM5rJYhV4Nq-8bXvJrF8t4z
To: /content/dev_phase.zip
100% 10.1M/10.1M [00:00<00:00, 25.8MB/s]
Archive:  dev_phase.zip
   creating: dev_phase/
  inflating: __MACOSX/._dev_phase    
   creating: dev_phase/subtask2/
  inflating: __MACOSX/dev_phase/._subtask2  
   creating: dev_phase/subtask3/
  inflating: __MACOSX/dev_phase/._subtask3  
  inflating: dev_phase/.DS_Store     
  inflating: __MACOSX/dev_phase/._.DS_Store  
   creating: dev_phase/subtask1/
  inflating: __MACOSX/dev_phase/._subtask1  
  inflating: dev_phase/subtask2/.DS_Store  
  inflating: __MACOSX/dev_phase/subtask2/._.DS_Store  
   creating: dev_phase/subtask2/train/
  inflating: __MACOSX/dev_phase/subtask2/._train  
   creating: dev_phase/subtask2/dev/
  inflating: __MACOSX/dev_phase/subtask2/._dev  
  inflating: dev_phase/subtask3/.DS_Store  
  inflating: __MACOSX/dev_phase/subtask3/._.DS_Store  
   creating: dev_phase/subtask3/train/
  inflating: __MACOSX/dev_pha

## Imports

In [2]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [3]:
import wandb

# Disable wandb logging for this script
wandb.init(mode="disabled")

## Data Import

The training data consists of a short text and binary labels

The data is structured as a CSV file with the following fields:
- id: a unique identifier for the sample
- text: a sentence or short text
- polarization:  1 text is polarized, 0 text is not polarized

The data is in all three subtask folders the same but only containing the labels for the specific task.

In [4]:
# Load the training and validation data for subtask 1

train = pd.read_csv('./dev_phase/subtask1/train/arb.csv')
val = pd.read_csv('./dev_phase/subtask1/dev/arb.csv')

train.head()

Unnamed: 0,id,text,polarization
0,arb_a2a60c8b4af3389e842d8ec31afb0eea,ÿßÿ≠ŸÑÿßŸÖ ÿßŸÜÿ™Ÿä ŸàŸÜÿπÿßŸÑŸä ŸàŸÖŸÜŸà ÿßŸÜÿ™Ÿä ÿ≠ÿ™Ÿâ ÿ™ŸÇŸäŸÖŸäŸÜ ÿßŸÑŸÅŸÜÿßŸÜŸä...,1
1,arb_6723e56a672674a6c1d9b28b213c4a05,Ÿàÿ±Ÿá ÿßŸÑŸÉŸàÿßŸÑŸäÿ≥ ÿ™ŸÜŸäÿ¨ÿ¨ ŸÖŸÜ Ÿàÿ±Ÿá ÿ®ÿπŸäÿ± ÿµÿ∑ŸÜÿßÿπŸä ÿπŸÑŸâ ŸÅŸÉÿ±ÿ©...,1
2,arb_b0365d606edeee38ae6c025b1ca33e96,.ÿÆÿÆÿÆÿÆ ÿßŸÑŸÖŸÑŸÉŸá ÿßÿ≠ŸÑÿßŸÖ ŸÅŸäŸáÿß ÿ¥ÿ∞Ÿàÿ∞ ÿ¥ŸÜŸà ŸáŸÑ ÿ®Ÿàÿ≥ ŸàÿßŸÑÿØŸÑÿπ...,1
3,arb_858c0ee684049ba6f416a6cecb0b0761,ÿßŸÑŸÑŸá ŸäÿÆÿ≤Ÿä ÿßÿ≠ŸÑÿßŸÖ ŸáŸä ŸàÿßŸÑÿ®ÿ±ŸÜÿßŸÖÿ¨ ÿßŸÑÿÆÿßŸäÿ≥ ÿßŸÑŸä ŸÉŸÑŸá ŸÖÿµÿÆÿ±Ÿá,1
4,arb_bdafc73afd0bc2cd2badae2a089446b9,ŸÉÿ≥ ÿßŸÖ ÿßÿ≠ŸÑÿßŸÖ ÿßŸÑŸä ŸÖÿßÿ±ÿ®ÿ™Ÿáÿß Ÿàÿ¥ ŸÖŸÑŸÉŸá ŸáŸáŸáŸá ŸÖÿ™ÿ≥ÿ™ÿßŸáŸÑ ŸÖ...,1


# Dataset
-  Create a pytorch class for handling data
-  Wrapping the raw texts and labels into a format that Huggingface‚Äôs Trainer can use for training and evaluation

In [5]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    self.labels=labels
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    text=self.texts[idx]
    label=self.labels[idx]
    encoding=self.tokenizer(text,truncation=True,padding=False,max_length=self.max_length,return_tensors='pt')

    # Ensure consistent tensor conversion for all items
    item = {key: encoding[key].squeeze() for key in encoding.keys()}
    item['labels'] = torch.tensor(label, dtype=torch.long)
    return item

Now, we'll tokenize the text data and create the datasets using `bert-base-uncased` as the tokenizer.

In [16]:
from sklearn.model_selection import train_test_split
# Load the tokenizer
model_name = ['bert-base-uncased', "UBC-NLP/MARBERTv2"]
tokenizer = AutoTokenizer.from_pretrained(model_name[0])

texts_train, texts_val, labels_train, labels_val = train_test_split(
    train['text'].tolist(),
    train['polarization'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=train['polarization'].tolist(),  # if labels are imbalanced
)

train_dataset = PolarizationDataset(texts_train, labels_train, tokenizer)
val_dataset = PolarizationDataset(texts_val, labels_val, tokenizer)
test_dataset = PolarizationDataset(val['text'].tolist(), val['polarization'].tolist(), tokenizer)

Next, we'll load the pre-trained `bert-base-uncased` model for sequence classification. Since this is a binary classification task (Polarized/Not Polarized), we set `num_labels=2`.

In [20]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name[0], num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now, we'll define the training arguments and the evaluation metric. We'll use macro F1 score for evaluation.

In [21]:
# Define metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
        output_dir=f"./",
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        disable_tqdm=False
    )


Finally, we'll initialize the `Trainer` and start training.

In [22]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated ü§ó Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=train_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.628892,0.627409
2,No log,0.534294,0.730066
3,0.607800,0.518725,0.738102


Macro F1 score on validation set: 0.738102495098396


In [None]:
# Run predictions on the test set and fill the labels array with predicted testues
test_texts = test['text'].tolist()
labels = []

# Tokenize and predict in batches to avoid memory issues
for i in range(0, len(test_texts), 64):
    batch_texts = test_texts[i:i+64]
    inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        preds = outputs.logits.argmax(dim=1).cpu().numpy()
        labels.extend(preds.tolist())

# save it into CSV format
test_results = pd.DataFrame({
    'id': test['id'].tolist(),
    'polarization': labels
})
test_results.to_csv('ar_bert_based_uncased.csv', index=False)

: 

: 

In [20]:
# print the results row by row in csv format
for index, row in val_results.iterrows():
    print(f"{row['id']},{row['polarization']}")


arb_67be47e5216d7bee41e17484e619f4e6,1
arb_272322e5b265e177613d685e5619e402,1
arb_d1ec38dd0ec5d7a4fe28ef8317fc96c1,1
arb_fad75310b17c124d98ebc514189ec033,1
arb_95caf70cec5bf00c94c35cf7af2a0ab5,0
arb_ac108c1ecf5071892c61abd253847b15,1
arb_adaaa6d482119e65ce337ee224674e70,1
arb_2794b08cac6cc9394a68c51cfc436243,1
arb_19dd96c989323c9e950a2c3ab9c285be,1
arb_f2bd638d9d9fc7a617130ff2b198b562,1
arb_f992bf7776b854d4f7f8475aebf80f49,1
arb_0b5ac70e86926f5e84cad94028864a37,0
arb_8ababf95f952e2425c2df1033192dac0,1
arb_06cd19aac6cc52e394a22d7d1dd58efc,1
arb_12eeeb8d2fa2d04be2ed9830d5f36ce9,1
arb_5bc23bacf9a161cd0f99719c70681a81,1
arb_9ee7c931ab1ecd655533042d8301f6bb,1
arb_bb7c40559f3a7ca1ecdd7dd7c136198f,1
arb_5d394c0cce56675e2fc36a0590b47ed7,1
arb_0704305e8313650e672563a2d073384f,1
arb_e56b759d14fd70506e01cf971315453d,1
arb_d286a2aac63432acef285a4799041f55,1
arb_5a9f322a530e85cd640f21af5c6bae42,1
arb_133c4737c4e04a8991fc7c219106b4e4,0
arb_f03fd416ab63cbb8f9f92a020fdd46be,1
arb_b8fbf2253ba4b8a83829b

# Subtask 2: Polarization Type Classification
Multi-label classification to identify the target of polarization as one of the following categories: Gender/Sexual, Political, Religious, Racial/Ethnic, or Other.
For this task we will load the data for subtask 2.

In [15]:
train = pd.read_csv('./dev_phase/subtask2/train/eng.csv')
val = pd.read_csv('./dev_phase/subtask2/train/eng.csv')
train.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0


In [16]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Ensure consistent tensor conversion for all items
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        # CHANGE THIS LINE: Use torch.float instead of torch.long for multi-label classification
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item


In [17]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create train and Test dataset for multilabel
train_dataset = PolarizationDataset(train['text'].tolist(), train[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)
dev_dataset = PolarizationDataset(val['text'].tolist(), val[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)


In [18]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5, problem_type="multi_label_classification") # 5 labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Compute macro F1 score
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False
)

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

# Subtask 3: Manifestation Identification
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



In [None]:
train = pd.read_csv('subtask3/train/eng.csv')
val = pd.read_csv('subtask3/train/eng.csv')

train.head()

In [None]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Ensure consistent tensor conversion for all items
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        # CHANGE THIS LINE: Use torch.float instead of torch.long for multi-label classification
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create train and Test dataset for multilabel
train_dataset = PolarizationDataset(train['text'].tolist(), train[['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']].values.tolist(), tokenizer)

In [None]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6, problem_type="multi_label_classification") # use 6 labels

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False
)

# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Compute macro F1 score
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 3: {eval_results['eval_f1_macro']}")