In [None]:
# Upgrade datasets and transformers to support NumPy 2.0+
!pip install --upgrade datasets transformers
import numpy as np  # Use the default NumPy 2.0+ version
# Install dependencies for Section 5
!pip install nltk

In [None]:
!pip install emoji

In [None]:
# Section 1: Import Libraries
import numpy as np
import pandas as pd
import torch
import random
import os
import json
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import warnings
import zipfile
import re
import emoji
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
warnings.filterwarnings("ignore")
print("Libraries imported successfully!")

In [None]:
# Section 2: Set Random Seed for Reproducibility
def set_seed(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic kernels so
    that GPU runs with the same seed are repeatable.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # Ignored when no GPU is present
    # Without these flags cuDNN may auto-tune and pick non-deterministic
    # kernels, so two runs with the same seed could still diverge on GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: hash randomization of the *current* interpreter is fixed at
    # start-up; this value is only picked up by processes spawned later.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

# Use a single seed value of 123
seed = 123
set_seed(seed)
print(f"Random seed set to {seed} for reproducibility.")

In [None]:
# Section 3: Importing the Datasets
# Read the three splits (train / val / test) from the working directory.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('A_train.csv', 'A_val.csv', 'A_test.csv')
)

# Preview the first rows of each split.
for _header, _frame in (
    ("Training Dataset:", df_train),
    ("\nVal Dataset:", df_val),
    ("\nTest Dataset:", df_test),
):
    print(_header)
    print(_frame.head())

In [None]:
# Section 4: Checking the Hate and Non-Hate Ratio
# Count how many training rows carry each label (a missing label counts as 0).
class_counts = df_train['label'].value_counts()
n_rows = len(df_train)
n_no_hate = class_counts.get(0, 0)
n_hate = class_counts.get(1, 0)

print("Class distribution in the training set:")
print(f"No Hate (0): {n_no_hate} samples")
print(f"Hate (1): {n_hate} samples")
print(f"Percentage No Hate (0): {(n_no_hate / n_rows) * 100:.2f}%")
print(f"Percentage Hate (1): {(n_hate / n_rows) * 100:.2f}%")

In [None]:
# Section 5: Use of Text Cleaning (No Stopwords Removal, Less Aggressive)
def clean_text(text):
    """Normalize one raw post before tokenization.

    Steps, in order: lower-case; expand contractions; drop URLs, @mentions,
    #hashtags and emoji; cap runs of a repeated character at three; keep only
    alphanumeric characters and whitespace; collapse whitespace.

    Args:
        text: Raw post. Non-string values (e.g. the NaN pandas produces for an
            empty CSV cell, or None) are treated as an empty post.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # An empty CSV cell arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Typographic apostrophes would otherwise slip past the contraction rules
    # and "don’t" would end up as "dont".
    text = text.replace("\u2019", "'")

    # Irregular contractions must be expanded before the generic "n't" rule,
    # which would turn "can't" into "ca not" and "won't" into "wo not".
    text = re.sub(r"\bwon't\b", "will not", text)
    text = re.sub(r"\bcan't\b", "can not", text)
    text = re.sub(r"\bshan't\b", "shall not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'s", " is", text)

    # "http\S+" already covers https links.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = emoji.replace_emoji(text, replace='')

    # Keep digits and reduce character repetition less aggressively
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)  # e.g., loooove -> looove
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean the text column of every split (train, then val, then test).
for _frame in (df_train, df_val, df_test):
    _frame['text'] = _frame['text'].apply(clean_text)

# Preview each split after cleaning.
for _header, _frame in (
    ("\nTraining dataset after cleaning:", df_train),
    ("\nVal dataset after cleaning:", df_val),
    ("\nTest dataset after cleaning:", df_test),
):
    print(_header)
    print(_frame.head())

In [None]:
# Section 6: Tokenization and Fine-Tuning with Transformer
def initialize_model(model_name, num_labels=2, dropout=0.2):
    """Load the tokenizer and a sequence-classification model for a checkpoint.

    The dropout rates are handed to ``from_pretrained`` so that they are part
    of the configuration the network is built from. Assigning them to
    ``model.config`` after loading would not change the dropout layers, which
    have already been constructed at that point.

    Args:
        model_name: Hub id or local path of the pretrained checkpoint.
        num_labels: Number of output classes of the classification head.
        dropout: Rate used for both ``hidden_dropout_prob`` and
            ``attention_probs_dropout_prob``. These are BERT-style config
            names; NOTE(review): confirm them before switching to an
            architecture whose config names its dropout fields differently.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        hidden_dropout_prob=dropout,
        attention_probs_dropout_prob=dropout,
    )
    return tokenizer, model

# Checkpoint to fine-tune: hateBERT
selected_model = 'GroNLP/hateBERT'

# Load the tokenizer/model pair for the selected checkpoint
tokenizer, model = initialize_model(selected_model)

# Defined after the tokenizer exists because it reads it as a global
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, padded/truncated to 128 tokens."""
    encoded = tokenizer(
        examples['text'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Hold out a stratified 20% of the training data as an internal validation set
train_data, val_data = train_test_split(
    df_train,
    test_size=0.2,
    random_state=seed,
    stratify=df_train['label'],
)

# Wrap the frames as datasets; the test split carries text only
labelled_columns = ['text', 'label']
dataset_train = Dataset.from_pandas(train_data[labelled_columns])
dataset_val_internal = Dataset.from_pandas(val_data[labelled_columns])
dataset_test = Dataset.from_pandas(df_test[['text']])

# Tokenize each split in batches
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val_internal = dataset_val_internal.map(tokenize_function, batched=True)
tokenized_test = dataset_test.map(tokenize_function, batched=True)

# Expose only the model inputs (plus the label where present) as torch tensors
for _labelled_split in (tokenized_train, tokenized_val_internal):
    _labelled_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask'])


In [None]:
# Section 7: Train the model

def compute_metrics(pred):
    """Score a batch of predictions against its reference labels.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores, one row per example).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``, each
        computed by the scikit-learn function of the same name with its
        default settings.
    """
    y_true = pred.label_ids
    # The predicted class is the highest-scoring column of each row.
    y_pred = np.argmax(pred.predictions, axis=1)
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}

batch_size = 16
num_epochs = 10  # Single source of truth for the schedule and the Trainer

# The final, partially filled batch of an epoch is still an optimizer step,
# so round up. Assumes one device and no gradient accumulation.
steps_per_epoch = -(-len(dataset_train) // batch_size)
total_steps = steps_per_epoch * num_epochs
warmup_steps = int(0.1 * total_steps)  # Warm up over the first 10% of training
print(f"Total steps: {total_steps}, Warmup steps: {warmup_steps}")

training_args = TrainingArguments(
    output_dir=f'./results/{selected_model}_seed{seed}',
    report_to="none",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    learning_rate=1e-5,
    logging_dir=f'./logs/seed{seed}',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The Trainer reseeds from this argument; without it the run would use the
    # library default instead of the seed named in output_dir/logging_dir.
    seed=seed,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # elsewhere so the notebook still runs on CPU.
    fp16=torch.cuda.is_available(),
)

# Custom Trainer without class weights
class CustomTrainer(Trainer):
    """Trainer whose loss is plain, unweighted cross-entropy over the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the cross-entropy loss.

        Args:
            model: Model to call with ``inputs`` as keyword arguments.
            inputs: Batch mapping; its ``"labels"`` entry holds the targets.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` if ``return_outputs``.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        n_classes = self.model.config.num_labels
        # Flatten to (examples, classes) logits vs. (examples,) targets.
        flat_logits = outputs.get("logits").view(-1, n_classes)
        criterion = torch.nn.CrossEntropyLoss()  # No class weights
        loss = criterion(flat_logits, targets.view(-1))
        if return_outputs:
            return (loss, outputs)
        return loss

# Wire model, data splits and metrics into the trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val_internal,
)

# Fine-tune, then score the resulting model on the internal validation split.
trainer.train()
eval_results = trainer.evaluate()
print(
    f"\nEvaluation Results on Internal Validation Set (Seed {seed}, Model: {selected_model}):",
    eval_results,
    sep="\n",
)



In [None]:
# Section 8: Predict on the test set and write the submission in the shared task's JSON format
# Run inference on the test split (A_test.csv) and take the top-scoring class
predictions = trainer.predict(tokenized_test)
pred_labels = np.argmax(predictions.predictions, axis=1)

# One record per test row: the row's index as a string, the class as an int
test_predictions = []
for idx, label in zip(df_test['index'], pred_labels):
    test_predictions.append({"index": str(idx), "prediction": int(label)})

print("\nTest Predictions (First 5):")
for record in test_predictions[:5]:
    print(record)

# Save test predictions to submission.json, one JSON object per line.
with open('submission.json', 'w', encoding='utf-8') as f:
    for pred in test_predictions:
        # json.dumps escapes quotes, backslashes and control characters, so
        # every line stays valid JSON whatever the index value contains.
        record = {"index": pred["index"], "prediction": pred["prediction"]}
        f.write(json.dumps(record) + "\n")

print("\nTest predictions saved to 'submission.json'")

# Bundle submission.json into a deflate-compressed archive named ref.zip
archive = zipfile.ZipFile('ref.zip', mode='w', compression=zipfile.ZIP_DEFLATED)
with archive:
    archive.write('submission.json')

print("\nZip file 'ref.zip' created with submission.json")