In [1]:
# Upgrade datasets and transformers to support NumPy 2.0+
!pip install --upgrade datasets transformers
import numpy as np  # Use the default NumPy 2.0+ version
# Install dependencies for Section 5
!pip install nltk

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m121.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers, datasets
  Attempting uninstall: fsspec
    Found existing installation:

In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [3]:
# Section 1: Import Libraries
import numpy as np
import pandas as pd
import torch
import random
import os
import json
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import warnings
import zipfile
import re
import emoji
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
warnings.filterwarnings("ignore")
print("Libraries imported successfully!")

Libraries imported successfully!


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Section 2: Set Random Seed for Reproducibility
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # If using GPU
    os.environ['PYTHONHASHSEED'] = str(seed_value)

# Use a single seed value of 123
seed = 123
set_seed(seed)
print(f"Random seed set to {seed} for reproducibility.")

Random seed set to 123 for reproducibility.


In [6]:
# Section 3: Importing the Datasets
df_train = pd.read_csv('A_train.csv')
df_val= pd.read_csv('A_val.csv')
df_test = pd.read_csv('A_test.csv')

print("Training Dataset:")
print(df_train.head())

print("\nVal Dataset:")
print(df_val.head())
print("\nTest Dataset:")
print(df_test.head())

Training Dataset:
      index                                               text  label
0  1001.png  transgirls who grow boobs but keep their cock ...      0
1  1005.png  realistic and wholesome representation of tran...      0
2  1008.png  united lgbt united sponsored you can color you...      0
3  1009.png  neolib politician i'm going to fuck my cat and...      1
4  1010.png  dad i'm gay i love you no matter what i also s...      1

Val Dataset:
      index                                               text
0  1003.png  soon available on every women's bathroom say t...
1  1024.png  sides gay fries fries but they're g ya know 6 ...
2  1045.png  me doesn't want to talk about dating sex with ...
3  1046.png  ferb trans rights says aren't you a little you...
4  1080.png  bethesda tr ethesda tr modle east bethesda rus...

Test Dataset:
      index                                               text
0  1002.png  imagine only being able to date this meme was ...
1  1011.png  every company du

In [7]:
# Section 4: Checking the Hate and Non-Hate Ratio
class_counts = df_train['label'].value_counts()
print("Class distribution in the training set:")
print(f"No Hate (0): {class_counts.get(0, 0)} samples")
print(f"Hate (1): {class_counts.get(1, 0)} samples")
print(f"Percentage No Hate (0): {(class_counts.get(0, 0) / len(df_train)) * 100:.2f}%")
print(f"Percentage Hate (1): {(class_counts.get(1, 0) / len(df_train)) * 100:.2f}%")

Class distribution in the training set:
No Hate (0): 2065 samples
Hate (1): 1985 samples
Percentage No Hate (0): 50.99%
Percentage Hate (1): 49.01%


In [8]:
# Section 5: Use of Text Cleaning (No Stopwords Removal, Less Aggressive)
def clean_text(text):
    text = text.lower()
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = emoji.replace_emoji(text, replace='')
    # Keep digits and reduce character repetition less aggressively
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)  # e.g., loooove -> looove
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_train['text'] = df_train['text'].apply(clean_text)
df_val['text'] = df_val['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

print("\nTraining dataset after cleaning:")
print(df_train.head())
print("\nVal dataset after cleaning:")
print(df_val.head())
print("\nTest dataset after cleaning:")
print(df_test.head())


Training dataset after cleaning:
      index                                               text  label
0  1001.png  transgirls who grow boobs but keep their cock ...      0
1  1005.png  realistic and wholesome representation of tran...      0
2  1008.png  united lgbt united sponsored you can color you...      0
3  1009.png  neolib politician im going to fuck my cat and ...      1
4  1010.png  dad im gay i love you no matter what i also su...      1

Val dataset after cleaning:
      index                                               text
0  1003.png  soon available on every women is bathroom say ...
1  1024.png  sides gay fries fries but they are g ya know 6...
2  1045.png  me does not want to talk about dating sex with...
3  1046.png  ferb trans rights says are not you a little yo...
4  1080.png  bethesda tr ethesda tr modle east bethesda rus...

Test dataset after cleaning:
      index                                               text
0  1002.png  imagine only being able to date t

In [10]:
# Section 6: Tokenization and Fine-Tuning with Transformer
def initialize_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.config.hidden_dropout_prob = 0.2
    model.config.attention_probs_dropout_prob = 0.2
    return tokenizer, model

# Switch model to hateBERT
selected_model = 'GroNLP/hateBERT'

# Initialize tokenizer and model
tokenizer, model = initialize_model(selected_model)

# Define tokenize_function after tokenizer is initialized
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Split training data into 80% train and 20% validation
train_data, val_data = train_test_split(df_train, test_size=0.2, stratify=df_train['label'], random_state=seed)
dataset_train = Dataset.from_pandas(train_data[['text', 'label']])
dataset_val_internal = Dataset.from_pandas(val_data[['text', 'label']])
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val_internal = dataset_val_internal.map(tokenize_function, batched=True)
tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_val_internal.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Handle test dataset
dataset_test = Dataset.from_pandas(df_test[['text']])
tokenized_test = dataset_test.map(tokenize_function, batched=True)
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask'])


tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

In [12]:
#Section 7 : Train the model

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds)
    }

batch_size = 16
total_steps = (len(dataset_train) // batch_size) * 10
warmup_steps = int(0.1 * total_steps)
print(f"Total steps: {total_steps}, Warmup steps: {warmup_steps}")

training_args = TrainingArguments(
    output_dir=f'./results/{selected_model}_seed{seed}',
    report_to="none",
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    learning_rate=1e-5,
    logging_dir=f'./logs/seed{seed}',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
)

# Custom Trainer without class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss()  # No weights
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val_internal,
    compute_metrics=compute_metrics,
)

trainer.train()
eval_results = trainer.evaluate()
print(f"\nEvaluation Results on Internal Validation Set (Seed {seed}, Model: {selected_model}):")
print(eval_results)



Total steps: 2020, Warmup steps: 202


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3543,0.660194,0.712346,0.742541,0.661417,0.846348
2,0.3523,0.668118,0.695062,0.678804,0.701613,0.657431
3,0.2234,0.807207,0.693827,0.683673,0.692506,0.675063
4,0.157,0.979358,0.709877,0.718563,0.684932,0.755668
5,0.0906,1.243413,0.696296,0.705742,0.671982,0.743073
6,0.0654,1.50732,0.696296,0.697044,0.681928,0.712846
7,0.0744,1.642968,0.702469,0.705739,0.684834,0.72796
8,0.0394,1.72086,0.697531,0.699387,0.681818,0.717884
9,0.0285,1.763054,0.706173,0.711864,0.685315,0.740554
10,0.0207,1.787773,0.701235,0.702703,0.685851,0.720403



Evaluation Results on Internal Validation Set (Seed 123, Model: GroNLP/hateBERT):
{'eval_loss': 0.6601942181587219, 'eval_accuracy': 0.7123456790123457, 'eval_f1': 0.7425414364640884, 'eval_precision': 0.6614173228346457, 'eval_recall': 0.8463476070528967, 'eval_runtime': 1.4559, 'eval_samples_per_second': 556.368, 'eval_steps_per_second': 35.031, 'epoch': 10.0}


In [13]:
#Section 8: prediction and json format convert for shared task
# Predict on test set (A_test.csv)
predictions = trainer.predict(tokenized_test)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Format test predictions
test_predictions = [{"index": str(idx), "prediction": int(pred)} for idx, pred in zip(df_test['index'], pred_labels)]
print("\nTest Predictions (First 5):")
for pred in test_predictions[:5]:
    print(pred)

# Save test predictions to submission.json
with open('submission.json', 'w') as f:
    for pred in test_predictions:
        f.write(f'{{"index": "{pred["index"]}", "prediction": {pred["prediction"]}}}\n')

print("\nTest predictions saved to 'submission.json'")

# Create a zip file containing submission.json
with zipfile.ZipFile('ref.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('submission.json')

print("\nZip file 'ref.zip' created with submission.json")


Test Predictions (First 5):
{'index': '1002.png', 'prediction': 0}
{'index': '1011.png', 'prediction': 1}
{'index': '1040.png', 'prediction': 1}
{'index': '1055.png', 'prediction': 1}
{'index': '1115.png', 'prediction': 0}

Test predictions saved to 'submission.json'

Zip file 'ref.zip' created with submission.json
