In [None]:
# install required libraries
!pip install torch torchvision torchaudio
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install evaluate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Using cached nvidia_curand_cu12

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    ConfusionMatrixDisplay
)

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
import random
from typing import List, Dict, Tuple
import logging

In [None]:
# set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Passes the same seed to Python's ``random`` module, NumPy's global
    generator, ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed forwarded to every seeding function (default 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every RNG before anything stochastic runs.
set_seed(42)

# Notebook-wide logger at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# On CUDA, report the name and total memory of GPU 0.
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


# Candidate checkpoints keyed by a short alias. Each row below is
# (alias, Hugging Face hub name, human-readable description).
MODEL_CONFIGS = {
    alias: {"name": hub_name, "description": blurb}
    for alias, hub_name, blurb in (
        ("afro_xlmr", "Davlan/afro-xlmr-large", "AfroXLMR - Specialized for African languages"),
        ("xlm_roberta", "xlm-roberta-base", "XLM-RoBERTa - General multilingual model"),
        ("multilingual_bert", "bert-base-multilingual-cased", "Multilingual BERT"),
    )
}

# List the aliases together with their descriptions.
print("\nAvailable models for comparison:")
for key, config in MODEL_CONFIGS.items():
    print(f"- {key}: {config['description']}")

# Class names listed in label-id order: a name's position is its integer id.
LABEL_MAP = {
    label_name: label_id
    for label_id, label_name in enumerate(("neutral", "mildly_offensive", "hate_speech"))
}

# Reverse lookup: integer id -> class name.
ID_TO_LABEL = dict(zip(LABEL_MAP.values(), LABEL_MAP.keys()))

print(f"\nLabel mapping: {LABEL_MAP}")

# Configuration constants
# MAX_LENGTH is the pad/truncate length handed to convert_df_format when the
# train and test datasets are built.
MAX_LENGTH = 128  # Reduced max length
# BATCH_SIZE is only printed below; the TrainingArguments cell sets its own
# per-device batch size (8) and gradient accumulation.
BATCH_SIZE = 16 # Original batch size, will be overridden by training_args
LEARNING_RATE = 2e-5
# NOTE(review): NUM_EPOCHS and WARMUP_STEPS are not referenced by any other code
# visible in this notebook; the TrainingArguments cell uses num_train_epochs=1
# and max_steps=100, which is smaller than WARMUP_STEPS.
NUM_EPOCHS = 3
WARMUP_STEPS = 500

# Echo the values above so they appear in the recorded output.
print(f"\nDefault configuration:")
print(f"Max length: {MAX_LENGTH}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_EPOCHS}")

def check_gpu_memory():
    """Print and return the amount of memory currently free on GPU 0.

    The figure comes from ``torch.cuda.mem_get_info``, which asks the CUDA
    driver for the device's free memory. Subtracting
    ``torch.cuda.memory_allocated`` from the total would ignore memory that
    PyTorch has reserved but not handed out, and memory held by other
    processes, and would therefore over-report what can still be allocated.

    Returns:
        Free device memory in bytes, or ``None`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return None
    memory_free, _total = torch.cuda.mem_get_info(0)
    print(f"Available GPU memory: {memory_free / 1024**3:.1f} GB")
    return memory_free

def clear_gpu_cache():
    """Empty PyTorch's CUDA cache and report it; do nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    print("GPU cache cleared")

# Empty the CUDA cache, then report free GPU memory. Because the second call is
# the cell's last expression, its return value is also shown as the cell output.
clear_gpu_cache()
check_gpu_memory()

Using device: cuda
GPU: Tesla T4
GPU Memory: 14.7 GB

Available models for comparison:
- afro_xlmr: AfroXLMR - Specialized for African languages
- xlm_roberta: XLM-RoBERTa - General multilingual model
- multilingual_bert: Multilingual BERT

Label mapping: {'neutral': 0, 'mildly_offensive': 1, 'hate_speech': 2}

Default configuration:
Max length: 128
Batch size: 16
Learning rate: 2e-05
Epochs: 3
GPU cache cleared
Available GPU memory: 5.8 GB


6182719488

In [None]:
from transformers import pipeline

# Quick sanity check of masked-token prediction with multilingual BERT.
# device=-1 keeps this throwaway demo on the CPU: left on the default device
# the pipeline is placed on cuda:0 and stays resident there (it is still
# referenced by `unmasker`), taking GPU memory away from the fine-tuning run.
unmasker = pipeline('fill-mask', model='bert-base-multilingual-uncased', device=-1)
unmasker("Hello I'm a [MASK] model.")

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Device set to use cuda:0


[{'score': 0.15077334642410278,
  'token': 11397,
  'token_str': 'top',
  'sequence': "hello i ' m a top model."},
 {'score': 0.13075214624404907,
  'token': 23589,
  'token_str': 'fashion',
  'sequence': "hello i ' m a fashion model."},
 {'score': 0.03627222031354904,
  'token': 12050,
  'token_str': 'good',
  'sequence': "hello i ' m a good model."},
 {'score': 0.03595409914851189,
  'token': 10246,
  'token_str': 'new',
  'sequence': "hello i ' m a new model."},
 {'score': 0.028642697259783745,
  'token': 11838,
  'token_str': 'great',
  'sequence': "hello i ' m a great model."}]

In [None]:
from transformers import BertTokenizer, BertModel

# Encode one English sentence with multilingual BERT and run it through the
# bare encoder to look at the raw model output.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained("bert-base-multilingual-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

# Inference only: no_grad avoids building an autograd graph for this pass.
with torch.no_grad():
    output = model(**encoded_input)

In [None]:
# Show the encoding returned by the tokenizer call in the previous cell.
print(encoded_input)

{'input_ids': tensor([[  101, 35829, 10525, 10151, 11318, 14059, 10855,   112,   146, 11531,
           119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
from transformers import AutoModel, AutoTokenizer, BertModel

# Tokenizer and encoder must come from the same checkpoint. Feeding AfroXLMR
# token ids to the multilingual BERT model left over from the earlier cell
# looks the ids up in an unrelated vocabulary, so its output is meaningless.
afro_xlmr_checkpoint = 'Davlan/afro-xlmr-large'
tokenizer = AutoTokenizer.from_pretrained(afro_xlmr_checkpoint)
model = AutoModel.from_pretrained(afro_xlmr_checkpoint)
text = "mofe lowo bi moronkola"
encoded_input = tokenizer(text, return_tensors='pt')

# Inference only: no_grad avoids building an autograd graph for this pass.
with torch.no_grad():
    output = model(**encoded_input)

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
  # Show the encoding returned by the tokenizer call in the previous cell.
  print (encoded_input)

{'input_ids': tensor([[    0,   931,  2242, 27226,    31,   333,  2993,   191, 24004,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Load the labelled dataset; the code below reads its `text` and `class` columns.
# NOTE(review): absolute path under /content with a " (1)" suffix - presumably a
# Colab upload of a re-downloaded file; confirm the location before re-running.
df = pd.read_csv("/content/multilingual_hate_speech_dataset (1).csv")
df

Unnamed: 0,class,text,language
0,1,Guy you are wicked 🤣🤣\nEven a mad woman ge...,english
1,2,What will happen to the tribalists? the nep...,english
2,0,Lmaooo them get mind dey share these clown st...,english
3,0,: Ohun tó ṣẹlẹ̀ sí ọ̀rẹ́ ọ̀rẹ́ mi kan ní òpópó...,yoruba
4,1,Twitter m bụ nnọọ a bitch,igbo
...,...,...,...
27143,2,Amma 'yan uwanka Fulani suna cikin daji suna n...,hausa
27144,2,The worst set of politicians in Nigeria today...,english
27145,2,Yammacin duniya suna nuna cewa suna goyon baya...,hausa
27146,2,A yaushe ne Black Lives Matter za ta yi magana...,hausa


In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer of the AfroXLMR checkpoint; it is reused later to build the datasets.
tokenizer = AutoTokenizer.from_pretrained('Davlan/afro-xlmr-large')

# Encode the first row as a smoke test of the tokenizer on real data.
text = df['text'][0]
encoded_input = tokenizer(text, return_tensors='pt')

# Count sub-word tokens for every row. str() guards against non-string cells.
# A comprehension with its own variable keeps `text` and `encoded_input`
# bound to the sample above instead of overwriting them with the last row and
# an integer count.
all_text = df['text']
all_text_length = [len(tokenizer.tokenize(str(sample))) for sample in all_text]

# find the median
median_length = np.median(all_text_length)
print(f"Median text length: {median_length}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1181 > 512). Running this sequence through the model will result in indexing errors


Median text length: 35.0


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80/20). Stratifying on the
# label keeps the class proportions the same in both partitions.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['class']
)

# Only the last expression of a cell is displayed, so put both class
# distributions side by side in a single table.
pd.DataFrame({
    'train': train_df['class'].value_counts(),
    'test': test_df['class'].value_counts(),
})
#train_df.shape
#test_df.shape



Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,1961
1,1765
2,1704


In [None]:
# Load the configuration of the AfroXLMR checkpoint to inspect its architecture
# (hidden size, number of layers, vocabulary size, ...).
# NOTE: this rebinds `config`, which was also the loop variable used when
# listing MODEL_CONFIGS.
config = AutoConfig.from_pretrained('Davlan/afro-xlmr-large')
config

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float16",
  "transformers_version": "4.55.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

In [None]:
# NOTE(review): the next cell defines `convert_df_format` again and shadows this
# definition; consider keeping only one of the two.
def convert_df_format(texts, label, tokenizer, max_length):
  """Tokenize a list of texts and bundle them with their labels.

  The tokenizer is called on the whole list at once (batch encoding); every
  sequence is padded or truncated to ``max_length`` tokens.

  Args:
    texts: List of strings to encode.
    label: Sequence of integer class ids, one per text.
    tokenizer: Callable tokenizer returning a mapping with ``input_ids`` and
      ``attention_mask`` tensors.
    max_length: Pad/truncate length in tokens.

  Returns:
    A ``TensorDataset`` whose items are ``(input_ids, attention_mask, label)``.
  """
  inputs = tokenizer(texts, add_special_tokens=True, padding='max_length', truncation=True,
                     max_length=max_length, return_tensors='pt', return_attention_mask=True,
                     return_token_type_ids=False)
  labels = torch.tensor(label, dtype=torch.long)
  return TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

In [None]:
def convert_df_format(texts, label, tokenizer, max_length):
  """Batch-tokenize ``texts`` and pair the encodings with ``label``.

  Args:
    texts: List of strings to encode.
    label: Sequence of integer class ids, one per text.
    tokenizer: Callable tokenizer returning a mapping with ``input_ids`` and
      ``attention_mask`` tensors.
    max_length: Length every sequence is padded or truncated to.

  Returns:
    A ``TensorDataset`` whose items are ``(input_ids, attention_mask, label)``.
  """
  encode_options = dict(
      padding='max_length',
      add_special_tokens=True,
      truncation=True,
      max_length=max_length,
      return_attention_mask=True,
      return_token_type_ids=False,
      return_tensors='pt',
  )
  encoded = tokenizer(texts, **encode_options)
  label_tensor = torch.tensor(label, dtype=torch.long)
  return TensorDataset(encoded['input_ids'], encoded['attention_mask'], label_tensor)

In [None]:
# The tokenizer expects strings, so convert every cell of the text column to str.
train_df['text'] = train_df['text'].apply(str)
test_df['text'] = test_df['text'].apply(str)

In [None]:
# Plain Python lists of the texts, in row order, for the tokenizer.
train_df_texts, test_df_texts = train_df['text'].tolist(), test_df['text'].tolist()

In [None]:
# Matching lists of class ids, in the same row order as the text lists.
train_df_labels, test_df_labels = train_df['class'].tolist(), test_df['class'].tolist()

In [None]:
# Tokenize both partitions to MAX_LENGTH tokens and wrap them as TensorDatasets.
train_dataset = convert_df_format(
    texts=train_df_texts, label=train_df_labels, tokenizer=tokenizer, max_length=MAX_LENGTH
)
test_dataset = convert_df_format(
    texts=test_df_texts, label=test_df_labels, tokenizer=tokenizer, max_length=MAX_LENGTH
)

# **ASSIGNMENT**

In [None]:
class DictDataset:
    """Present a dataset of 3-tuples as a dataset of dictionaries.

    Each item of the wrapped dataset is unpacked as
    ``(input_ids, attention_mask, labels)`` and returned as a dict with those
    three keys.
    """

    def __init__(self, tensor_dataset):
        # Keep a reference to the wrapped dataset; nothing is copied.
        self.tensor_dataset = tensor_dataset

    def __len__(self):
        # Same number of items as the wrapped dataset.
        return len(self.tensor_dataset)

    def __getitem__(self, idx):
        # Unpacking fails unless the item has exactly three elements.
        ids, mask, target = self.tensor_dataset[idx]
        return dict(input_ids=ids, attention_mask=mask, labels=target)

# Wrap both TensorDatasets so that each item is a dict instead of a tuple.
train_dataset_dict, test_dataset_dict = DictDataset(train_dataset), DictDataset(test_dataset)

# Spot-check the first training item to confirm the conversion.
data_sample = train_dataset_dict[0]
print(f"data sample type: {type(data_sample)}")

data sample type: <class 'dict'>


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import sys
sys.stdout.flush()

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``; the predicted class
            is the argmax of the logits over the last axis.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (weighted-average F1).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


# Load model
# The number of classes and the id<->label maps come from LABEL_MAP, so the
# saved model config carries readable class names instead of LABEL_0..LABEL_2.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CONFIGS["xlm_roberta"]["name"],
    num_labels=len(LABEL_MAP),
    id2label=ID_TO_LABEL,
    label2id=LABEL_MAP,
)

# Training setup
training_args = TrainingArguments(
    output_dir="./hate-speech-model", # folder that receives checkpoints and other training artifacts
    num_train_epochs=1,              # superseded by max_steps below, which takes precedence when positive
    per_device_train_batch_size=8,   # Reduced batch size
    per_device_eval_batch_size=8,    # Reduced batch size
    gradient_accumulation_steps=4,   # Accumulate gradients over 4 steps (effective batch size = 8 * 4 = 32)
    fp16=torch.cuda.is_available(),  # Mixed precision on GPU to lower memory use (earlier runs hit CUDA OOM)
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=25,
    save_steps=25,
    max_steps=100,
    disable_tqdm=False,              # Enable progress bars
    logging_steps=10,                # Log every 10 steps
    logging_strategy="steps",        # Log at regular intervals
    dataloader_num_workers=0,        # Avoid multiprocessing issues
    load_best_model_at_end=True,     # Load best model after training
    report_to="none",                # No external trackers: avoids the interactive wandb login prompt
)

# NOTE(review): the held-out test split is used as the evaluation set here, so it
# also drives best-checkpoint selection; consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_dict,  # dict-style wrapper around the tokenized training set
    eval_dataset=test_dataset_dict,    # dict-style wrapper around the tokenized test set
    compute_metrics=compute_metrics
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 824.12 MiB is free. Process 10156 has 13.93 GiB memory in use. Of the allocated memory 8.98 GiB is allocated by PyTorch, and 4.82 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# since we set load_best_model_at_end=True, we can save the best model to a dir
# The best checkpoint is only loaded by trainer.train(). This cell sits before
# the training cells, so warn loudly when it runs before any optimisation step
# instead of silently saving an un-fine-tuned model as the "final" one.
if trainer.state.global_step == 0:
    logger.warning(
        "trainer.train() has not completed any step yet; "
        "the weights being saved have not been fine-tuned."
    )

name = "./final-model"
trainer.save_model(name)
tokenizer.save_pretrained(name)

In [None]:
def model_test(model_path, text):
    """Classify one text with a saved sequence-classification checkpoint.

    The model and tokenizer are loaded from ``model_path`` on every call, moved
    to the GPU when one is available, and run on ``text`` truncated to 256
    tokens. The prediction is printed and returned.

    Args:
        model_path: Directory or name accepted by ``from_pretrained``.
        text: The text to classify.

    Returns:
        Tuple ``(predicted_label, confidence)``: the class name looked up in
        ``ID_TO_LABEL`` and the softmax probability of that class.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device).eval()

    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256)
    batch = {key: tensor.to(device) for key, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**batch).logits
        probabilities = logits.softmax(dim=-1)
        predicted_class = probabilities.argmax(dim=-1).item()
        confidence = probabilities[0, predicted_class].item()

    predicted_label = ID_TO_LABEL[predicted_class]

    print(f"Text: '{text}'")
    print(f"Label: {predicted_class} = Prediction: {predicted_label} (confidence: {confidence:.4f})")

    return predicted_label, confidence

In [None]:
# Hand-written probe sentences in English, Yoruba, Igbo and Hausa. The trailing
# comment on each line is the label the author expects (with a translation for
# the non-English sentences); these labels are not checked by the code.
sample_texts = [
        # English examples
        "This movie is absolutely amazing and wonderful!",  # Neutral
        "The film was okay, nothing too special",  # Neutral
        "I hate this terrible garbage, complete waste of time",  # Mildly Offensive
        "You stupid idiots don't deserve to live, go kill yourselves",  # Hate Speech

        # Yoruba examples
        "Fiimu yi dara pupo, mo feran re",  # Neutral (This movie is very good, I love it)
        "Fiimu yi ko dara to bee, emi ko feran re",  # Mildly Offensive (This movie is not that good, I don't like it)
        "Yin lo ni awon omo ale, yin ko ye nkan rara",  # Hate Speech (You are bastards, you are completely worthless)
        "Mo wo fiimu yi, o dara lopolopo",  # Neutral (I watched this movie, it's very good)

        # Igbo examples
        "Ihe nkiri a maka nke oma, enwere m mmasị na ya",  # Neutral (This movie is very good, I like it)
        "Ihe nkiri a adịghị mma nke ukwuu",  # Mildly Offensive (This movie is not very good)
        "Unu niile bụ ndị nzuzu na-enweghị isi",  # Hate Speech (You are all stupid and senseless people)
        "Ihe nkiri a dị mma ma ọ bụghị ihe pụrụ iche",  # Neutral (This movie is good but nothing special)

        # Hausa examples
        "Wannan fim yana da kyau sosai, ina son shi",  # Neutral (This movie is very good, I like it)
        "Wannan fim bai yi kyau ba sosai",  # Mildly Offensive (This movie is not very good)
        "Ku duk marasa hankali ne ba ku da amfani",  # Hate Speech (You are all senseless and useless)
        "Na kalli wannan fim, yana da kyau amma ba musamman ba"  # Neutral (I watched this movie, it's good but not special)
    ]

# Classify every probe sentence with the saved model and print a separator
# after each one. model_test loads the checkpoint from "./final-model" on every
# call, so that directory must already exist when this cell runs.
print("Testing examples:")
for text in sample_texts:
    model_test("./final-model", text)
    print("-" * 30)

In [None]:
# Train: this might take some time
# Runs the fine-tuning loop of the `trainer` built in the setup cell, using the
# settings in `training_args`.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33matinukeaabson[0m ([33matinukeaabson-pan-atlantic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 656.12 MiB is free. Process 10156 has 14.10 GiB memory in use. Of the allocated memory 8.98 GiB is allocated by PyTorch, and 4.98 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Train: this might take some time
# NOTE(review): this cell repeats the training call of the previous cell -
# presumably a retry after the CUDA out-of-memory error; consider removing one.
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 824.12 MiB is free. Process 10156 has 13.93 GiB memory in use. Of the allocated memory 8.98 GiB is allocated by PyTorch, and 4.82 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)