# Environment Setup

In [None]:
!pip install transformers datasets scikit-learn matplotlib seaborn torch
!pip install evaluate --quiet
!pip install optuna
!pip install fsspec==2025.3.2 scikit-learn==1.6.1
!pip install joblib

# Environment Setup
import torch
import time
import wandb
import numpy as np
import optuna
from joblib import Parallel, delayed
import itertools
import copy
import os
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    EvalPrediction
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage
from google.colab import drive

torch.cuda.empty_cache()




Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Environment Setup
import torch
import time
import wandb
import numpy as np
import optuna
from joblib import Parallel, delayed
import itertools
import copy
import os
from typing import Dict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    EvalPrediction
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage
from google.colab import drive

torch.cuda.empty_cache()

In [None]:
# Initialize NVML for GPU power monitoring
try:
    # NVML is optional: when it cannot be initialised we simply skip power metrics.
    nvmlInit()
    power_monitoring_available = True
except Exception:
    # Catch Exception (not a bare ``except``) so KeyboardInterrupt / SystemExit
    # still propagate while any NVML failure only disables power monitoring.
    power_monitoring_available = False
    print("NVML not available - power metrics will be skipped")

class EfficiencyMetrics:
    """Collects per-step wall-clock time, GPU memory and GPU power readings.

    Call ``start_timer()`` when a step begins and ``record_train_step()`` when it
    ends; ``get_average_metrics()`` summarises everything recorded so far.
    """

    def __init__(self):
        # ``None`` means no timer has been started yet.
        self.start_time = None
        # Raw samples, one list entry per recorded step.
        self.metrics = {
            'train_time': [],
            'eval_time': [],
            'gpu_memory': [],
            'gpu_power': []
        }

    def start_timer(self):
        """Mark the beginning of a step."""
        self.start_time = time.time()

    def record_train_step(self):
        """Record the time elapsed since the timer was (re)started, plus GPU stats.

        Does nothing when ``start_timer()`` has never been called.
        """
        if self.start_time is None:
            return

        elapsed = time.time() - self.start_time
        self.metrics['train_time'].append(elapsed)

        # GPU memory tracking (peak allocation so far, converted from bytes to GiB)
        if torch.cuda.is_available():
            self.metrics['gpu_memory'].append(torch.cuda.max_memory_allocated() / 1024**3)

            # GPU power tracking; the reading is divided by 1000 before being
            # stored under the ``*_watts`` metric.
            if power_monitoring_available:
                handle = nvmlDeviceGetHandleByIndex(torch.cuda.current_device())
                power = nvmlDeviceGetPowerUsage(handle) / 1000
                self.metrics['gpu_power'].append(power)

        # Restart the timer so consecutive calls measure consecutive intervals.
        self.start_time = time.time()

    def get_average_metrics(self):
        """Return summary statistics as plain floats.

        Every field falls back to ``0.0`` when no sample has been recorded, so
        the result is always finite (``np.mean([])`` would otherwise yield NaN
        and emit a RuntimeWarning).
        """
        train_times = self.metrics['train_time']
        gpu_memory = self.metrics['gpu_memory']
        gpu_power = self.metrics['gpu_power']
        return {
            'avg_train_time_per_step': float(np.mean(train_times)) if train_times else 0.0,
            'max_gpu_memory_usage_gb': float(np.max(gpu_memory)) if gpu_memory else 0.0,
            'avg_gpu_power_watts': float(np.mean(gpu_power)) if gpu_power else 0.0
        }

class EfficiencyCallback(TrainerCallback):
    """Trainer callback that times every step and periodically logs the summary to W&B.

    Args:
        metrics: An ``EfficiencyMetrics`` instance that accumulates the samples.
    """

    def __init__(self, metrics):
        self.metrics = metrics

    def on_step_begin(self, args, state, control, **kwargs):
        self.metrics.start_timer()

    def on_step_end(self, args, state, control, **kwargs):
        self.metrics.record_train_step()

        # ``logging_steps`` may be 0/None or a fractional ratio; only a positive
        # whole-step interval is meaningful for the modulo below.
        interval = args.logging_steps
        if not interval or interval < 1:
            return
        if state.global_step % int(interval) != 0:
            return

        # Only log when a W&B run is active (same guard the inference cell uses).
        if wandb.run is not None:
            wandb.log(self.metrics.get_average_metrics())


# 1. Load and Preprocess Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import re

# Emotion labels constant
EMOTION_LABELS = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]

def clean_text(text):
    """Lower-case ``text`` and drop everything except ASCII letters, digits and whitespace.

    Missing values (``None`` or a float NaN, e.g. an empty dataframe cell) yield
    ``""`` instead of raising ``AttributeError``; other non-string values are
    converted with ``str()`` first. Leading/trailing whitespace is stripped.

    NOTE(review): the ASCII-only character class also removes apostrophes and
    accented letters - confirm that is intended for the Xhosa/Zulu/Swahili text.
    """
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"[^a-zA-Z0-9\s]", "", str(text).lower()).strip()

def load_and_preprocess_data(language):
    """Read the parquet splits for ``language`` and clean their ``text`` column.

    Returns ``(train_df, test_df)``. Note that the frame returned as
    ``train_df`` is read from the ``dev-00000-of-00001.parquet`` file.
    """
    language_dir = f'/content/drive/Shareddrives/COS 760 Group 13 Project/Data/{language.capitalize()}'

    frames = []
    for split_file in ('dev-00000-of-00001.parquet', 'test-00000-of-00001.parquet'):
        frame = pd.read_parquet(f'{language_dir}/{split_file}')
        # Normalise the raw text with the shared cleaning routine.
        frame['text'] = frame['text'].apply(clean_text)
        frames.append(frame)

    train_df, test_df = frames
    return train_df, test_df

def preprocess_for_model(df, language):
    """Convert a dataframe into the plain-list dict used for tokenisation.

    The result has three parallel lists: ``text``, ``labels`` (one float row per
    example, columns ordered as ``EMOTION_LABELS``) and ``language`` (the tag
    repeated once per row).
    """
    texts = df['text'].tolist()
    label_rows = df[EMOTION_LABELS].astype(float).values.tolist()
    language_tags = [language for _ in range(len(df))]
    return {'text': texts, 'labels': label_rows, 'language': language_tags}


os.makedirs('/content/drive/Shareddrives/COS 760 Group 13 Project/Parameters', exist_ok=True)

def save_parameters(model_name, language, params):
    """Append one row of hyperparameters to the per-model/per-language CSV file.

    The file is created on first use; later calls re-read it, add the new row
    at the end and rewrite the whole file without the index column.
    """
    short_name = model_name.split("/")[-1]
    file_path = f'/content/drive/Shareddrives/COS 760 Group 13 Project/Parameters/{short_name}_{language}_params.csv'

    # ``params`` is merged last so its keys win on a name clash.
    record = {'model': model_name, 'language': language}
    record.update(params)
    frames = [pd.DataFrame([record])]

    # Keep previously saved rows ahead of the new one.
    if os.path.exists(file_path):
        frames.insert(0, pd.read_csv(file_path))

    combined = pd.concat(frames) if len(frames) > 1 else frames[0]
    combined.to_csv(file_path, index=False)

Mounted at /content/drive


# 2. Model Training Class

In [None]:
from torch.utils.data import Dataset

class EmotionDataset(Dataset):
    """Torch dataset over an encoding dict (field name -> per-example values).

    ``labels`` is optional so the same class also serves label-free inference
    encodings, matching the later definitions of this class in the notebook.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # Labels are stored as floats; skip the cast when the encodings carry none.
        if 'labels' in item:
            item['labels'] = item['labels'].float()
        return item

class EmotionClassifier:
    """Wraps a pretrained sequence-classification model for multi-label emotion training.

    The instance owns the tokenizer, the model (loaded with
    ``problem_type="multi_label_classification"``), the freezing state and an
    ``EfficiencyMetrics`` collector used during training. At construction every
    parameter except the classifier head is frozen.

    Args:
        model_name: Model id/path passed to ``from_pretrained``.
        language: Language tag; used to build output paths and run names.
        freeze_layers: Ordered layer names (``'encoder.layer.<i>'``,
            ``'embeddings'``, ``'pooler'``) that ``unfreeze_next_layers`` may
            release, in that order. ``None``/empty means nothing is released.
        hyperparams: Optional dict; the keys read are ``dropout``, ``lr``,
            ``batch_size``, ``weight_decay``, ``warmup_steps`` and ``grad_clip``.
    """

    def __init__(self, model_name, language, freeze_layers=None, hyperparams=None):
        self.model_name = model_name
        self.language = language
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Use hyperparameters
        self.hyperparams = hyperparams or {}
        print("======================================================================================")
        print(f"✅ Best params for SENT {self.hyperparams}")
        print("======================================================================================")

        # The same dropout value is applied to hidden states and attention weights.
        dropout = self.hyperparams.get('dropout', 0.1)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(EMOTION_LABELS),
            problem_type="multi_label_classification",
            hidden_dropout_prob=dropout,
            attention_probs_dropout_prob=dropout
        )

        # Initialize freezing parameters
        self.freeze_layers = freeze_layers if freeze_layers is not None else []
        self.current_unfrozen_layers = []
        self._apply_freezing()
        self.metrics = EfficiencyMetrics()

    def _apply_freezing(self):
        """Freeze everything, then re-enable the classifier head and released layers."""
        # Freeze all parameters first
        for param in self.model.parameters():
            param.requires_grad = False

        # Unfreeze classifier head
        for param in self.model.classifier.parameters():
            param.requires_grad = True

        # Unfreeze specified layers (names that cannot be resolved are skipped)
        for layer_name in self.current_unfrozen_layers:
            layer = self._get_layer_by_name(layer_name)
            if layer is not None:
                for param in layer.parameters():
                    param.requires_grad = True

    def _get_layer_by_name(self, layer_name):
        """Resolve a layer name to a sub-module, or ``None`` when it does not exist."""
        # Handle different model architectures
        if hasattr(self.model, 'roberta'):
            model = self.model.roberta
        elif hasattr(self.model, 'bert'):
            model = self.model.bert
        else:
            model = self.model.base_model

        # Get layer by name
        if layer_name.startswith('encoder.layer.'):
            layer_num = int(layer_name.split('.')[-1])
            layers = model.encoder.layer
            # A requested index beyond the backbone's depth means "no such layer";
            # returning None lets the caller skip it instead of hitting IndexError.
            if 0 <= layer_num < len(layers):
                return layers[layer_num]
            return None
        elif layer_name == 'embeddings':
            return model.embeddings
        elif layer_name == 'pooler':
            return model.pooler if hasattr(model, 'pooler') else None
        return None

    def unfreeze_next_layers(self, num_layers=1):
        """Release the next ``num_layers`` entries of ``freeze_layers`` (in list order)."""
        if not self.freeze_layers:
            return

        # Get layers that are still frozen
        remaining_frozen = [l for l in self.freeze_layers if l not in self.current_unfrozen_layers]

        # Unfreeze next N layers
        layers_to_unfreeze = remaining_frozen[:num_layers]
        self.current_unfrozen_layers.extend(layers_to_unfreeze)

        # Apply the new freezing configuration
        self._apply_freezing()
        print(f"Unfroze layers: {layers_to_unfreeze}")

    def tokenize_data(self, texts, labels):
        """Tokenize ``texts`` (truncated to 512 tokens, padded) and attach ``labels``."""
        encodings = self.tokenizer(texts, truncation=True, padding=True, max_length=512)
        encodings['labels'] = labels
        return encodings

    def compute_metrics(self, eval_pred):
        """Adapter for ``Trainer``: unpack ``(predictions, labels)`` and score them."""
        predictions, labels = eval_pred
        return self.multi_label_metrics(predictions, labels)

    def multi_label_metrics(self, predictions, labels, threshold=0.3):
        """Score raw logits against multi-hot labels.

        Logits go through a sigmoid; a label counts as predicted when its
        probability is ``>= threshold``. Returns micro/macro F1, micro ROC AUC
        (computed on the probabilities) and Hamming loss.
        """
        probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
        preds = (probs >= threshold).astype(float)
        true = labels

        # zero_division=0 keeps the score for empty classes at 0 without a warning.
        f1_micro = f1_score(true, preds, average='micro', zero_division=0)
        f1_macro = f1_score(true, preds, average='macro', zero_division=0)
        roc_auc = roc_auc_score(true, probs, average='micro')
        hamming = hamming_loss(true, preds)

        return {
            'f1_micro': f1_micro,
            'f1_macro': f1_macro,
            'roc_auc': roc_auc,
            'hamming_loss': hamming
        }

    def get_training_args(self):
        """Build the ``TrainingArguments`` for the final training run.

        Checkpoints and logs go under the shared-drive ``Models`` folder, in a
        sub-folder named ``<model short name>_<language>``.
        """
        short_name = self.model_name.split('/')[-1]
        return TrainingArguments(
            output_dir=f"/content/drive/Shareddrives/COS 760 Group 13 Project/Models/{short_name}_{self.language}",
            eval_strategy="epoch",
            save_strategy="epoch",
            save_steps=1000,
            save_total_limit=3,
            learning_rate=self.hyperparams.get('lr', 2e-5),
            per_device_train_batch_size=self.hyperparams.get('batch_size', 8),
            per_device_eval_batch_size=self.hyperparams.get('batch_size', 8),
            num_train_epochs=3,
            weight_decay=self.hyperparams.get('weight_decay', 0.01),
            warmup_steps=self.hyperparams.get('warmup_steps', 0),
            max_grad_norm=self.hyperparams.get('grad_clip', 1.0),
            load_best_model_at_end=True,
            metric_for_best_model='roc_auc',
            logging_dir=f'/content/drive/Shareddrives/COS 760 Group 13 Project/Models/{short_name}_{self.language}/logs',
            logging_steps=100,
            run_name=f"{short_name}_{self.language}_run"
        )

    def train(self, train_dataset, eval_dataset):
        """Train the model, log efficiency metrics and save model + tokenizer.

        Returns the trainer so callers can run further evaluation.

        NOTE(review): ``GradualUnfreezingTrainer`` looks for
        ``unfreeze_next_layers`` on the model object it is given; that method is
        defined on this classifier, not on the transformers model - confirm the
        unfreeze schedule actually triggers.
        """
        training_args = self.get_training_args()

        # Report to W&B and log more frequently than the default arguments do.
        training_args.report_to = ['wandb']
        training_args.logging_steps = 10

        # Define unfreezing schedule: epoch index -> number of layers to release
        unfreeze_schedule = {
            1: 2,
            2: 2,
        }

        # The efficiency callback is registered exactly once. Registering it a
        # second time records every step twice; the second sample is ~0 seconds
        # and roughly halves the reported average step time.
        trainer = GradualUnfreezingTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self.compute_metrics,
            unfreeze_schedule=unfreeze_schedule,
            callbacks=[EfficiencyCallback(self.metrics)]
        )

        trainer.train()

        # Log final metrics
        final_metrics = self.metrics.get_average_metrics()
        print("\nTraining Efficiency Metrics:")
        for k, v in final_metrics.items():
            print(f"{k}: {v:.4f}")
        if wandb.run is not None:
            wandb.log({"final_" + k: v for k, v in final_metrics.items()})

        # Save model
        trainer.save_model(f"{training_args.output_dir}/final_model")
        self.tokenizer.save_pretrained(f"{training_args.output_dir}/final_model")

        return trainer

# Hyperparameter Grid Search Trainer

In [None]:
class GridSearchTrainer:
    """Exhaustive grid search over training hyperparameters.

    Every combination in ``param_grid`` trains a fresh model on
    ``train_dataset`` and is scored by ROC AUC on ``eval_dataset``.

    Args:
        model_name: Model id passed to ``from_pretrained`` for every trial.
        language: Language tag (stored; not read by the methods of this class).
        tokenizer: Tokenizer handed to the ``Trainer``.
        train_dataset / eval_dataset: Datasets used for every trial.
        param_grid: Mapping of hyperparameter name -> list of candidate values.
            ``train_and_evaluate`` reads ``dropout``, ``batch_size``, ``lr``,
            ``epochs``, ``weight_decay``, ``warmup_steps`` and ``grad_clip``.
    """

    def __init__(self, model_name, language, tokenizer, train_dataset, eval_dataset, param_grid):
        self.model_name = model_name
        self.language = language
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.param_grid = param_grid

    def compute_metrics(self, eval_pred):
        """Adapter for ``Trainer``: unpack ``(predictions, labels)`` and score them."""
        predictions, labels = eval_pred
        return self.multi_label_metrics(predictions, labels)

    def multi_label_metrics(self, predictions, labels, threshold=0.3):
        """Score raw logits against multi-hot labels.

        Logits go through a sigmoid; a label counts as predicted when its
        probability is ``>= threshold``. Returns micro/macro F1, micro ROC AUC
        (computed on the probabilities) and Hamming loss.
        """
        probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
        preds = (probs >= threshold).astype(float)
        true = labels

        # zero_division=0 keeps the score for empty classes at 0 without a warning.
        f1_micro = f1_score(true, preds, average='micro', zero_division=0)
        f1_macro = f1_score(true, preds, average='macro', zero_division=0)
        roc_auc = roc_auc_score(true, probs, average='micro')
        hamming = hamming_loss(true, preds)

        return {
            'f1_micro': f1_micro,
            'f1_macro': f1_macro,
            'roc_auc': roc_auc,
            'hamming_loss': hamming
        }

    def train_and_evaluate(self, params):
        """Train one model with ``params`` and return ``(params, eval ROC AUC)``."""
        print(f"🔍 Trying: {params}")
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=len(EMOTION_LABELS),
            problem_type="multi_label_classification",
            hidden_dropout_prob=params['dropout'],
            attention_probs_dropout_prob=params['dropout']
        )

        training_args = TrainingArguments(
            output_dir="./tmp_output",
            per_device_train_batch_size=params['batch_size'],
            per_device_eval_batch_size=params['batch_size'],
            learning_rate=params['lr'],
            num_train_epochs=params['epochs'],
            weight_decay=params['weight_decay'],
            warmup_steps=params['warmup_steps'],
            max_grad_norm=params['grad_clip'],
            eval_strategy="epoch",
            # Trial models are discarded right after scoring, so do not write
            # checkpoints for every combination into ./tmp_output.
            save_strategy="no",
            disable_tqdm=True,
            report_to="none",
            metric_for_best_model='roc_auc',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics
        )

        # ACTUALLY TRAIN THE MODEL
        trainer.train()
        metrics = trainer.evaluate()

        # Drop the references and release cached GPU memory before the next trial.
        del model
        del trainer
        torch.cuda.empty_cache()

        return params, metrics['eval_roc_auc']

    def run(self, n_jobs=1):
        """Evaluate every combination sequentially and return the best parameter dict.

        Args:
            n_jobs: Accepted for interface compatibility only; trials always run
                one after another in this process.

        Raises:
            ValueError: If the grid is empty or a hyperparameter has no candidates.
        """
        if not self.param_grid:
            raise ValueError("param_grid must contain at least one hyperparameter")

        keys, values = zip(*self.param_grid.items())
        param_combinations = [dict(zip(keys, combo)) for combo in itertools.product(*values)]
        if not param_combinations:
            raise ValueError("every hyperparameter in param_grid needs at least one candidate value")

        results = [self.train_and_evaluate(params) for params in param_combinations]

        best_params, best_score = max(results, key=lambda result: result[1])
        print(f"✅ Best Params: {best_params} | ROC AUC: {best_score:.4f}")
        return best_params

# Hyperparameter Tuning

In [None]:
def tune_hyperparameters(model_name, language, train_data, val_data):
    """Grid-search hyperparameters for one model/language pair.

    ``train_data`` and ``val_data`` are dicts with ``text`` and ``labels``
    lists. The winning parameter dict is saved via ``save_parameters`` and
    returned.
    """
    print(f"🚀 Starting hyperparameter tuning for {model_name} on {language}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _build_dataset(split):
        # Tokenize one split and attach its labels.
        encoded = tokenizer(split['text'], truncation=True, padding=True, max_length=512)
        encoded['labels'] = split['labels']
        return EmotionDataset(encoded)

    train_dataset = _build_dataset(train_data)
    val_dataset = _build_dataset(val_data)

    # Reduced parameter grid for efficiency
    param_grid = dict(
        lr=[1e-5, 3e-5],
        batch_size=[8, 16],
        dropout=[0.1, 0.3],
        epochs=[2],
        weight_decay=[0.01, 0.1],
        warmup_steps=[0, 100],
        grad_clip=[1.0],
    )

    tuner = GridSearchTrainer(
        model_name=model_name,
        language=language,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        param_grid=param_grid,
    )

    best_params = tuner.run(n_jobs=2)
    save_parameters(model_name, language, best_params)
    print(f"✅ Best params for {model_name}-{language}: {best_params}")
    return best_params

# Gradual Unfreezing Trainer

In [None]:
class GradualUnfreezingTrainer(Trainer):
    """``Trainer`` that can release frozen layers at scheduled epochs.

    Args:
        unfreeze_schedule: Mapping of zero-based epoch index -> number of layers
            to release when that epoch starts. Each entry fires at most once.

    The current epoch is read from ``self.state.epoch`` inside ``training_step``
    rather than by overriding the private ``_maybe_log_save_evaluate`` hook:
    that hook is called with positional arguments, so an override that declares
    its own parameter order can receive shifted values (wrong ``epoch``/``model``).

    NOTE(review): unfreezing only happens when the ``model`` passed to
    ``training_step`` itself exposes ``unfreeze_next_layers``. Also, the
    optimizer is created before any layer is released - confirm that newly
    unfrozen parameters are part of its parameter groups and actually updated.
    """

    def __init__(self, *args, unfreeze_schedule=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.unfreeze_schedule = unfreeze_schedule or {}
        self._current_epoch = 0
        # Epochs whose schedule entry has already been applied.
        self._applied_unfreeze_epochs = set()

    def training_step(self, model, inputs, num_items_in_batch=None):
        """Apply any pending unfreeze for the current epoch, then run the normal step."""
        # ``state.epoch`` is fractional during an epoch; its floor is the epoch index.
        self._current_epoch = int(self.state.epoch or 0)
        epoch = self._current_epoch

        if epoch in self.unfreeze_schedule and epoch not in self._applied_unfreeze_epochs:
            self._applied_unfreeze_epochs.add(epoch)
            num_layers = self.unfreeze_schedule[epoch]
            if hasattr(model, 'unfreeze_next_layers'):
                model.unfreeze_next_layers(num_layers)
                self.log({"unfrozen_layers": num_layers})

        # Forward ``num_items_in_batch`` when the caller supplied it, while
        # staying compatible with a parent ``training_step`` that lacks it.
        if num_items_in_batch is None:
            return super().training_step(model, inputs)
        return super().training_step(model, inputs, num_items_in_batch)

    def log(self, logs, iterator=None):
        """Log ``logs``, storing ``unfrozen_layers`` as a string."""
        if "unfrozen_layers" in logs:
            logs["unfrozen_layers"] = str(logs["unfrozen_layers"])
        # Pass the optional second positional argument through instead of dropping it.
        if iterator is None:
            super().log(logs)
        else:
            super().log(logs, iterator)

# 3. Train Execution

In [None]:
def execute_training():
    """Tune, train and evaluate every model on every language.

    For each language the dev split is divided 80/20 into train/validation.
    Hyperparameters are tuned on that split, the final model is trained with the
    validation split as its in-training evaluation set (so best-checkpoint
    selection never sees the test data), and the test split is scored once at
    the end.

    Returns:
        dict mapping ``"<model short name>_<language>"`` to the fitted trainer.
    """
    MODELS = ['castorini/afriberta_large','xlm-roberta-base', 'UBC-NLP/serengeti-E250']
    LANGUAGES = ['xhosa','zulu', 'swahili']
    trainers = {}

    for language in LANGUAGES:
        # Load data
        train_df, test_df = load_and_preprocess_data(language)

        # Split train into train/validation (80/20)
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

        # Prepare data
        train_data = preprocess_for_model(train_df, language)
        val_data = preprocess_for_model(val_df, language)
        test_data = preprocess_for_model(test_df, language)

        for model_name in MODELS:
            print(f"\n🔁 Processing: {model_name} | {language.upper()}")

            # STEP 1: Hyperparameter tuning
            best_params = tune_hyperparameters(model_name, language, train_data, val_data)

            # Define freezing layers
            # NOTE(review): the layer counts below are hard-coded - confirm they
            # match the depth of each backbone.
            if 'roberta' in model_name.lower() or 'afriberta' in model_name.lower():
                freeze_layers = [f'encoder.layer.{i}' for i in range(12)] + ['embeddings']
            elif 'serengeti' in model_name.lower():
                freeze_layers = [f'encoder.layer.{i}' for i in range(6)] + ['embeddings']
            else:
                freeze_layers = None

            print("======================================================================================")
            print(f"✅ Best params for returned {model_name}-{language}: {best_params}")
            print("======================================================================================")
            # STEP 2: Train final model with best params and freezing
            classifier = EmotionClassifier(
                model_name,
                language,
                freeze_layers=freeze_layers,
                hyperparams=best_params
            )

            # Tokenize data
            train_encodings = classifier.tokenize_data(train_data['text'], train_data['labels'])
            val_encodings = classifier.tokenize_data(val_data['text'], val_data['labels'])
            test_encodings = classifier.tokenize_data(test_data['text'], test_data['labels'])

            # Create datasets
            train_dataset = EmotionDataset(train_encodings)
            val_dataset = EmotionDataset(val_encodings)
            test_dataset = EmotionDataset(test_encodings)

            # Train with the validation split driving checkpoint selection, then
            # score the held-out test split exactly once.
            trainer = classifier.train(train_dataset, val_dataset)
            results = trainer.evaluate(eval_dataset=test_dataset)

            print(f"\nResults for {model_name} on {language}:")
            for k, v in results.items():
                if k != 'eval_runtime':
                    print(f"{k}: {v:.4f}")

            trainers[f"{model_name.split('/')[-1]}_{language}"] = trainer

            # Close the W&B run so the next model/language pair gets its own run
            # instead of appending to this one.
            if wandb.run is not None:
                wandb.finish()

    return trainers

In [None]:
 execute_training()

# Population of Models and Tokenizers

In [None]:

# Root folder that holds the fine-tuned checkpoints.
MODELS_DIR = '/content/drive/Shareddrives/COS 760 Group 13 Project/Models'

# One entry per (model, language) pair, keyed "<model>_<language>". Building the
# paths from the key guarantees each key loads its own checkpoint; the
# hand-written table pointed 'xlm-roberta-base_swahili' at the xhosa checkpoint.
model_configs = {
    f'{model_id}_{lang}': f'{MODELS_DIR}/{model_id}_{lang}/final_model'
    for model_id in ('afriberta_large', 'xlm-roberta-base', 'serengeti-E250')
    for lang in ('zulu', 'xhosa', 'swahili')
}
tokenizers = {}
models = {}
for key, model_path in model_configs.items():
    print(f"Loading model from {model_path} for {key}...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=6,
        problem_type="multi_label_classification"
    )
    tokenizers[key] = tokenizer
    models[key] = model


Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/afriberta_large_zulu/final_model for afriberta_large_zulu...
Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/afriberta_large_xhosa/final_model for afriberta_large_xhosa...
Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/afriberta_large_swahili/final_model for afriberta_large_swahili...
Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/xlm-roberta-base_zulu/final_model for xlm-roberta-base_zulu...
Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/xlm-roberta-base_xhosa/final_model for xlm-roberta-base_xhosa...
Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/xlm-roberta-base_xhosa/final_model for xlm-roberta-base_swahili...
Loading model from /content/drive/Shareddrives/COS 760 Group 13 Project/Models/serengeti-E250_zulu/final_model for serengeti-E250_zulu...
Lo

In [None]:
import gc
import torch
import numpy as np
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


class EmotionDataset(Dataset):
    """Torch dataset over an encoding dict (field name -> per-example values)."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one tensor per field; the optional ``labels`` field is cast to float.
        sample = {}
        for field, column in self.encodings.items():
            tensor = torch.tensor(column[idx])
            sample[field] = tensor.float() if field == 'labels' else tensor
        return sample

EMOTION_LABELS = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]

def preprocess_for_evaluation(df):
    """Return the dataframe's texts and float label rows as plain lists.

    Label columns are taken in ``EMOTION_LABELS`` order.
    """
    texts = df['text'].tolist()
    label_rows = df[EMOTION_LABELS].astype(float).values.tolist()
    return {'text': texts, 'labels': label_rows}

def clean_text(text):
    """Lower-case ``text`` and drop everything except ASCII letters, digits and whitespace.

    Missing values (``None`` or a float NaN, e.g. an empty dataframe cell) yield
    ``""`` instead of raising ``AttributeError``; other non-string values are
    converted with ``str()`` first. Leading/trailing whitespace is stripped.
    """
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"[^a-zA-Z0-9\s]", "", str(text).lower()).strip()

def get_predictions(model, tokenizer, dataset, device, threshold=0.2):
    """Run ``model`` over ``dataset`` and return ``(predictions, true_labels)`` arrays.

    Batches of 16 are moved to ``device``; only ``input_ids`` and
    ``attention_mask`` are fed to the model. A label is predicted when its
    sigmoid probability is ``>= threshold``. ``tokenizer`` is accepted but not
    used by this function.
    """
    model.eval()
    pred_rows, label_rows = [], []

    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=16):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            model_inputs = {
                name: on_device[name]
                for name in ('input_ids', 'attention_mask')
                if name in on_device
            }
            probabilities = torch.sigmoid(model(**model_inputs).logits)

            pred_rows.extend((probabilities >= threshold).int().cpu().numpy())
            label_rows.extend(on_device['labels'].cpu().numpy())

    return np.array(pred_rows), np.array(label_rows)



# Prefer the GPU when one is visible to torch.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Test-split parquet file for each language, keyed by lower-case language name.
DATA_PATHS = {
    lang_key: f"/content/drive/Shared drives/COS 760 Group 13 Project/Data/{lang_key.capitalize()}/test-00000-of-00001.parquet"
    for lang_key in ("zulu", "xhosa", "swahili")
}

test_dfs = {}
test_processed_data = {}
for lang, path in DATA_PATHS.items():
    print(f"Loading test data for {lang}...")
    df = pd.read_parquet(path)
    df['text'] = df['text'].apply(clean_text)
    # Any emotion column absent from the file is added as all zeros.
    for label in EMOTION_LABELS:
        if label in df.columns:
            continue
        df[label] = 0
    test_dfs[lang] = df
    test_processed_data[lang] = preprocess_for_evaluation(df)
    print(f"Loaded and preprocessed test data for {lang}. Example labels: {test_processed_data[lang]['labels'][0]}")

# Decision threshold applied to the sigmoid probabilities; stored with each result.
PREDICTION_THRESHOLD = 0.2

results = {}

for model_key in models:
    print(f"\nGenerating predictions for {model_key}...")

    try:
        model = models[model_key].to(device)
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise
        print("CUDA out of memory error encountered while loading the model.")
        print("Trying to clear cache and retry...")
        gc.collect()
        torch.cuda.empty_cache()

        try:
            model = models[model_key].to(device)
        except RuntimeError:
            print("Still out of memory after clearing cache. Skipping model:", model_key)
            # Release whatever part of the model already reached the device.
            models[model_key].to('cpu')
            torch.cuda.empty_cache()
            continue

    tokenizer = tokenizers[model_key]
    # Keys look like "<model>_<language>"; the language is the last segment.
    lang = model_key.split("_")[-1]

    test_encodings = tokenizer(
        test_processed_data[lang]['text'],
        truncation=True,
        padding=True,
        max_length=512
    )
    test_encodings['labels'] = test_processed_data[lang]['labels']
    test_dataset = EmotionDataset(test_encodings)

    preds, truths = get_predictions(model, tokenizer, test_dataset, device, threshold=PREDICTION_THRESHOLD)

    results[model_key] = {
        'true': truths,
        'pred': preds,
        'threshold': PREDICTION_THRESHOLD
    }
    print(f"Finished generating predictions for {model_key}. Stored {len(truths)} samples.")

    # ``Module.to`` moves the module in place, so ``models[model_key]`` is this
    # same object and would keep its weights on the GPU; move it back to the CPU
    # before dropping the local name, otherwise memory accumulates per model.
    model.to('cpu')
    del model
    gc.collect()
    torch.cuda.empty_cache()

Using device: cuda
Loading test data for zulu...
Loaded and preprocessed test data for zulu. Example labels: [0.0, 1.0, 0.0, 0.0, 1.0, 0.0]
Loading test data for xhosa...
Loaded and preprocessed test data for xhosa. Example labels: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
Loading test data for swahili...
Loaded and preprocessed test data for swahili. Example labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Generating predictions for afriberta_large_zulu...
Finished generating predictions for afriberta_large_zulu. Stored 2047 samples.

Generating predictions for afriberta_large_xhosa...
Finished generating predictions for afriberta_large_xhosa. Stored 1594 samples.

Generating predictions for afriberta_large_swahili...
Finished generating predictions for afriberta_large_swahili. Stored 3312 samples.

Generating predictions for xlm-roberta-base_zulu...
Finished generating predictions for xlm-roberta-base_zulu. Stored 2047 samples.

Generating predictions for xlm-roberta-base_xhosa...
Finished generating pr

In [None]:
 wandb.init()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mu21668452[0m ([33mu21668452-university-of-pretoria[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import multilabel_confusion_matrix, classification_report, hamming_loss
import time
import wandb

class EmotionDataset(Dataset):
    """Map-style dataset that serves one example per index from an encoding dict."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        n_examples = len(self.encodings['input_ids'])
        return n_examples

    def __getitem__(self, idx):
        row = dict(
            (name, torch.tensor(values[idx])) for name, values in self.encodings.items()
        )
        # ``labels`` is optional; when present it is returned as a float tensor.
        labels = row.get('labels')
        if labels is not None:
            row['labels'] = labels.float()
        return row


EMOTION_LABELS = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]


def preprocess_for_evaluation(df):
    """Extract texts and float label rows (``EMOTION_LABELS`` column order) as lists."""
    evaluation_data = {}
    evaluation_data['text'] = df['text'].tolist()
    evaluation_data['labels'] = df[EMOTION_LABELS].astype(float).values.tolist()
    return evaluation_data

def get_predictions(model, tokenizer, dataset, device, threshold=0.2):
    """Run batched inference and binarise the sigmoid probabilities.

    Args:
        model: Callable exposing ``eval()`` that, when called with the
            ``input_ids`` / ``attention_mask`` keyword tensors present in a
            batch, returns an object with a ``logits`` attribute.
        tokenizer: Unused; kept so existing call sites keep working.
        dataset: Map-style dataset whose items are dicts of tensors. A
            ``labels`` entry is optional.
        device: Device the model inputs are moved to.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        ``(predictions, true_labels)`` as NumPy arrays. ``true_labels`` is
        empty when the dataset carries no ``labels``; both are empty when the
        dataset itself is empty.
    """
    model.eval()
    predictions = []
    true_labels = []
    inference_times = []

    dataloader = DataLoader(dataset, batch_size=16)
    model_input_keys = ('input_ids', 'attention_mask')

    with torch.no_grad():
        for batch in dataloader:
            start_time = time.time()

            # Only the tensors the model consumes have to live on `device`;
            # labels are read back on the host and never need the transfer.
            inputs = {k: batch[k].to(device) for k in model_input_keys if k in batch}
            probs = torch.sigmoid(model(**inputs).logits)

            predictions.extend((probs >= threshold).int().cpu().numpy())
            inference_times.append(time.time() - start_time)

            # Unlabelled datasets are valid input: skip instead of raising KeyError.
            if 'labels' in batch:
                true_labels.extend(batch['labels'].cpu().numpy())

    if inference_times:
        avg_inference_time = np.mean(inference_times)
        print(f"\nAverage inference time per batch: {avg_inference_time:.4f}s")
        if wandb.run is not None:
            wandb.log({"avg_inference_time": avg_inference_time})
    else:
        # Avoid np.mean([]) (nan + RuntimeWarning) and logging a meaningless value.
        print("\nNo batches were processed; the dataset is empty.")

    return np.array(predictions), np.array(true_labels)


# Per-model predictions; populated by the prediction loop further down this cell.
results = dict()

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Test-split parquet file for each language, keyed by lower-case language name.
DATA_PATHS = dict(
    zulu="/content/drive/Shared drives/COS 760 Group 13 Project/Data/Zulu/test-00000-of-00001.parquet",
    xhosa="/content/drive/Shared drives/COS 760 Group 13 Project/Data/Xhosa/test-00000-of-00001.parquet",
    swahili="/content/drive/Shared drives/COS 760 Group 13 Project/Data/Swahili/test-00000-of-00001.parquet",
)


def clean_text(text):
    """Lower-case ``text`` and keep only ASCII letters, digits and whitespace.

    Leading and trailing whitespace is stripped from the result. Missing or
    non-string values (e.g. ``None`` / NaN cells coming out of a DataFrame)
    are treated as empty text instead of raising ``AttributeError``.

    Args:
        text: Raw text value of one sample.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # NOTE(review): this pattern also removes apostrophes and every non-ASCII
    # letter; confirm it matches the cleaning applied to the training data.
    return re.sub(r"[^a-zA-Z0-9\s]", "", text.lower()).strip()

# Load every language's test split, clean its text and keep both the DataFrame
# and the list-based form produced by preprocess_for_evaluation.
test_dfs, test_processed_data = {}, {}
for lang, path in DATA_PATHS.items():
    print(f"Loading test data for {lang}...")
    df = pd.read_parquet(path)
    df['text'] = df['text'].apply(clean_text)

    # Emotion columns absent from this split are added as all-zero columns so
    # every language ends up with the full EMOTION_LABELS set.
    for label in EMOTION_LABELS:
        if label in df.columns:
            continue
        df[label] = 0

    processed = preprocess_for_evaluation(df)
    test_dfs[lang] = df
    test_processed_data[lang] = processed
    print(f"Loaded and preprocessed test data for {lang}. Example labels: {processed['labels'][0]}")



# Threshold applied to the sigmoid probabilities. The same constant is passed to
# get_predictions and stored with the predictions, so later cells report the value
# that actually produced them (previously the call used get_predictions' default
# of 0.2 while 0.5 was recorded). Change this one constant to use another threshold.
PREDICTION_THRESHOLD = 0.2

for model_key in models:
    print(f"\nGenerating predictions for {model_key}...")

    wandb.init(project="bantu-emotion-multilabel", name=f"{model_key}_predictions")
    try:
        model = models[model_key].to(device)
        tokenizer = tokenizers[model_key]
        # Model keys end in "_<language>"; the suffix selects the test split.
        lang = model_key.split("_")[-1]

        test_encodings = tokenizer(
            test_processed_data[lang]['text'],
            truncation=True,
            padding=True,
            max_length=512
        )
        test_dataset = EmotionDataset(test_encodings)
        # Attach the gold labels so each dataset item also carries 'labels'.
        test_dataset.encodings['labels'] = test_processed_data[lang]['labels']

        preds, truths = get_predictions(
            model, tokenizer, test_dataset, device, threshold=PREDICTION_THRESHOLD
        )

        # Store results
        results[model_key] = {
            'true': truths,
            'pred': preds,
            'threshold': PREDICTION_THRESHOLD
        }
        print(f"Finished generating predictions for {model_key}. Stored {len(truths)} samples.")
    finally:
        # Close the run even when this model fails, so the next iteration
        # does not inherit a half-finished run.
        wandb.finish()

Using device: cuda
Loading test data for zulu...
Loaded and preprocessed test data for zulu. Example labels: [0.0, 1.0, 0.0, 0.0, 1.0, 0.0]
Loading test data for xhosa...
Loaded and preprocessed test data for xhosa. Example labels: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
Loading test data for swahili...
Loaded and preprocessed test data for swahili. Example labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Generating predictions for afriberta_large_zulu...



Average inference time per batch: 0.0695s
Finished generating predictions for afriberta_large_zulu. Stored 2047 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.06947



Generating predictions for afriberta_large_xhosa...



Average inference time per batch: 0.0492s
Finished generating predictions for afriberta_large_xhosa. Stored 1594 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.0492



Generating predictions for afriberta_large_swahili...



Average inference time per batch: 0.0553s
Finished generating predictions for afriberta_large_swahili. Stored 3312 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.05527



Generating predictions for xlm-roberta-base_zulu...



Average inference time per batch: 0.0662s
Finished generating predictions for xlm-roberta-base_zulu. Stored 2047 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.06617



Generating predictions for xlm-roberta-base_xhosa...



Average inference time per batch: 0.0452s
Finished generating predictions for xlm-roberta-base_xhosa. Stored 1594 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.04523



Generating predictions for xlm-roberta-base_swahili...



Average inference time per batch: 0.0598s
Finished generating predictions for xlm-roberta-base_swahili. Stored 3312 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.05979



Generating predictions for serengeti-E250_zulu...



Average inference time per batch: 0.0540s
Finished generating predictions for serengeti-E250_zulu. Stored 2047 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.05396



Generating predictions for serengeti-E250_xhosa...



Average inference time per batch: 0.0359s
Finished generating predictions for serengeti-E250_xhosa. Stored 1594 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.03593



Generating predictions for serengeti-E250_swahili...



Average inference time per batch: 0.0549s
Finished generating predictions for serengeti-E250_swahili. Stored 3312 samples.


0,1
avg_inference_time,▁

0,1
avg_inference_time,0.05491


# Evaluation Metrics and Confusion Matrices

In [None]:

from transformers import Trainer, TrainingArguments
import pandas as pd
import numpy as np
import torch
import os
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss, accuracy_score
from IPython.display import display


def multi_label_metrics(predictions, labels, threshold=0.3):
    """Compute multi-label classification metrics from raw logits.

    Args:
        predictions: Logits as an array-like, or a tuple whose first element
            holds the logits (some models make the Trainer hand over a tuple).
        labels: Ground-truth indicator matrix aligned with the logits.
        threshold: Sigmoid probability at or above which a label is predicted.

    Returns:
        Dict with ``f1_micro``, ``f1_macro``, ``roc_auc`` (micro-averaged, on
        the probabilities), ``hamming_loss`` and ``subset_accuracy``.
    """
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    prob_tensor = torch.sigmoid(torch.tensor(logits))

    # Hand sklearn plain NumPy arrays instead of relying on implicit tensor conversion.
    probs = prob_tensor.numpy()
    preds = (prob_tensor >= threshold).numpy().astype(float)
    true = labels

    # zero_division=0 keeps the score at 0.0 for labels that are never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    f1_micro = f1_score(true, preds, average='micro', zero_division=0)
    f1_macro = f1_score(true, preds, average='macro', zero_division=0)
    roc_auc = roc_auc_score(true, probs, average='micro')
    hamming = hamming_loss(true, preds)
    subset_accuracy = accuracy_score(true, preds)

    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'roc_auc': roc_auc,
        'hamming_loss': hamming,
        'subset_accuracy': subset_accuracy
    }


class EvaluationDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with their label vectors for evaluation.

    Each item is a dict holding one tensor per encoding key plus a float
    ``'labels'`` tensor. The dataset length is the number of label rows.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


def evaluate_models(model_configs):
    """Evaluate every configured model on the test set of its language.

    For each key of ``model_configs`` (expected form ``<model>_<language>``)
    the language's test data is loaded, tokenised with ``tokenizers[key]`` and
    scored with a ``Trainer`` wrapping ``models[key]``. The configured path
    value itself is not used here.

    Args:
        model_configs: Mapping whose keys identify the models to evaluate.

    Returns:
        DataFrame with one row per model. The same table is written to
        ``evaluation_results.csv`` in the shared Results folder.
    """
    # (column in the results table, key in the dict returned by Trainer.evaluate)
    metric_columns = (
        ('Loss', 'eval_loss'),
        ('F1 Micro', 'eval_f1_micro'),
        ('F1 Macro', 'eval_f1_macro'),
        ('ROC AUC', 'eval_roc_auc'),
        ('Hamming Loss', 'eval_hamming_loss'),
        ('Subset Accuracy', 'eval_subset_accuracy'),
        ('Runtime', 'eval_runtime'),
        ('Samples Per Second', 'eval_samples_per_second'),
        ('Steps Per Second', 'eval_steps_per_second'),
    )
    separator = '=' * 50
    rows = []

    for key, model_path in model_configs.items():
        language = key.split('_')[-1]

        print(f"\n{separator}")
        print(f"Evaluating {key} on {language} test set")
        print(f"{separator}")

        _, test_df = load_and_preprocess_data(language)
        test_data = preprocess_for_model(test_df, language)

        encode = tokenizers[key]
        test_encodings = encode(
            test_data['text'],
            truncation=True,
            padding=True,
            max_length=512
        )
        test_dataset = EvaluationDataset(test_encodings, test_data['labels'])

        model = models[key]
        eval_args = TrainingArguments(
            output_dir="./tmp_eval",
            per_device_eval_batch_size=16,
            disable_tqdm=False
        )
        trainer = Trainer(
            model=model,
            args=eval_args,
            compute_metrics=lambda p: multi_label_metrics(p.predictions, p.label_ids)
        )
        eval_results = trainer.evaluate(test_dataset)

        result_entry = {'Model': key, 'Language': language}
        for column, metric_key in metric_columns:
            result_entry[column] = eval_results[metric_key]
        rows.append(result_entry)

        print(f"\nEvaluation results for {key}:")
        for column, value in result_entry.items():
            shown = f"{value:.6f}" if isinstance(value, float) else f"{value}"
            print(f"{column}: {shown}")

    results_df = pd.DataFrame(rows)

    results_path = '/content/drive/Shareddrives/COS 760 Group 13 Project/Results/evaluation_results.csv'
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    results_df.to_csv(results_path, index=False)
    print(f"\nSaved evaluation results to {results_path}")

    return results_df


evaluation_results = evaluate_models(model_configs)


display(evaluation_results)



Evaluating afriberta_large_zulu on zulu test set



Evaluation results for afriberta_large_zulu:
Model: afriberta_large_zulu
Language: zulu
Loss: 0.264422
F1 Micro: 0.001953
F1 Macro: 0.000782
ROC AUC: 0.707974
Hamming Loss: 0.083211
Subset Accuracy: 0.576942
Runtime: 10.339000
Samples Per Second: 197.988000
Steps Per Second: 12.380000

Evaluating afriberta_large_xhosa on xhosa test set



Evaluation results for afriberta_large_xhosa:
Model: afriberta_large_xhosa
Language: xhosa
Loss: 0.341455
F1 Micro: 0.504813
F1 Macro: 0.181740
ROC AUC: 0.824064
Hamming Loss: 0.220514
Subset Accuracy: 0.119197
Runtime: 5.724200
Samples Per Second: 278.467000
Steps Per Second: 17.470000

Evaluating afriberta_large_swahili on swahili test set



Evaluation results for afriberta_large_swahili:
Model: afriberta_large_swahili
Language: swahili
Loss: 0.310652
F1 Micro: 0.005988
F1 Macro: 0.004132
ROC AUC: 0.648552
Hamming Loss: 0.100242
Subset Accuracy: 0.443237
Runtime: 13.004800
Samples Per Second: 254.675000
Steps Per Second: 15.917000

Evaluating xlm-roberta-base_zulu on zulu test set



Evaluation results for xlm-roberta-base_zulu:
Model: xlm-roberta-base_zulu
Language: zulu
Loss: 0.563618
F1 Micro: 0.153360
F1 Macro: 0.148114
ROC AUC: 0.630767
Hamming Loss: 0.916952
Subset Accuracy: 0.000000
Runtime: 9.159700
Samples Per Second: 223.480000
Steps Per Second: 13.974000

Evaluating xlm-roberta-base_xhosa on xhosa test set



Evaluation results for xlm-roberta-base_xhosa:
Model: xlm-roberta-base_xhosa
Language: xhosa
Loss: 0.492806
F1 Micro: 0.335648
F1 Macro: 0.240978
ROC AUC: 0.813303
Hamming Loss: 0.610100
Subset Accuracy: 0.000000
Runtime: 4.899600
Samples Per Second: 325.336000
Steps Per Second: 20.410000

Evaluating xlm-roberta-base_swahili on swahili test set



Evaluation results for xlm-roberta-base_swahili:
Model: xlm-roberta-base_swahili
Language: swahili
Loss: 0.493953
F1 Micro: 0.187383
F1 Macro: 0.155235
ROC AUC: 0.563612
Hamming Loss: 0.742854
Subset Accuracy: 0.000000
Runtime: 13.375100
Samples Per Second: 247.625000
Steps Per Second: 15.477000

Evaluating serengeti-E250_zulu on zulu test set



Evaluation results for serengeti-E250_zulu:
Model: serengeti-E250_zulu
Language: zulu
Loss: 0.266396
F1 Micro: 0.000000
F1 Macro: 0.000000
ROC AUC: 0.708950
Hamming Loss: 0.083048
Subset Accuracy: 0.577430
Runtime: 7.562400
Samples Per Second: 270.680000
Steps Per Second: 16.926000

Evaluating serengeti-E250_xhosa on xhosa test set



Evaluation results for serengeti-E250_xhosa:
Model: serengeti-E250_xhosa
Language: xhosa
Loss: 0.354824
F1 Micro: 0.492480
F1 Macro: 0.177016
ROC AUC: 0.827612
Hamming Loss: 0.246968
Subset Accuracy: 0.014429
Runtime: 3.884300
Samples Per Second: 410.366000
Steps Per Second: 25.744000

Evaluating serengeti-E250_swahili on swahili test set



Evaluation results for serengeti-E250_swahili:
Model: serengeti-E250_swahili
Language: swahili
Loss: 0.313354
F1 Micro: 0.000000
F1 Macro: 0.000000
ROC AUC: 0.639736
Hamming Loss: 0.099738
Subset Accuracy: 0.444444
Runtime: 12.459800
Samples Per Second: 265.815000
Steps Per Second: 16.613000

Saved evaluation results to /content/drive/Shareddrives/COS 760 Group 13 Project/Results/evaluation_results.csv


Unnamed: 0,Model,Language,Loss,F1 Micro,F1 Macro,ROC AUC,Hamming Loss,Subset Accuracy,Runtime,Samples Per Second,Steps Per Second
0,afriberta_large_zulu,zulu,0.264422,0.001953,0.000782,0.707974,0.083211,0.576942,10.339,197.988,12.38
1,afriberta_large_xhosa,xhosa,0.341455,0.504813,0.18174,0.824064,0.220514,0.119197,5.7242,278.467,17.47
2,afriberta_large_swahili,swahili,0.310652,0.005988,0.004132,0.648552,0.100242,0.443237,13.0048,254.675,15.917
3,xlm-roberta-base_zulu,zulu,0.563618,0.15336,0.148114,0.630767,0.916952,0.0,9.1597,223.48,13.974
4,xlm-roberta-base_xhosa,xhosa,0.492806,0.335648,0.240978,0.813303,0.6101,0.0,4.8996,325.336,20.41
5,xlm-roberta-base_swahili,swahili,0.493953,0.187383,0.155235,0.563612,0.742854,0.0,13.3751,247.625,15.477
6,serengeti-E250_zulu,zulu,0.266396,0.0,0.0,0.70895,0.083048,0.57743,7.5624,270.68,16.926
7,serengeti-E250_xhosa,xhosa,0.354824,0.49248,0.177016,0.827612,0.246968,0.014429,3.8843,410.366,25.744
8,serengeti-E250_swahili,swahili,0.313354,0.0,0.0,0.639736,0.099738,0.444444,12.4598,265.815,16.613



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import multilabel_confusion_matrix, classification_report, hamming_loss, accuracy_score
import wandb


# Folder on the mounted drive that receives the confusion-matrix PNG files;
# created up front so the savefig calls below cannot fail on a missing directory.
output_dir = "/content/drive/Shared drives/COS 760 Group 13 Project/Results/"
os.makedirs(output_dir, exist_ok=True)

# Label order used for report lookups and for indexing the per-label confusion matrices.
label_names = EMOTION_LABELS
# Merged into the W&B run config of every evaluation run started by process_results.
ablation_flags = {'use_class_weights': False, 'data_augmentation': False}

def parse_model_and_lang(key):
    """Split a ``<model>_<language>`` results key into its two parts.

    The language is the text after the last underscore; everything before it
    is the model name, which is mapped to a short alias when one is known.

    Args:
        key: Results key such as ``"afriberta_large_zulu"``.

    Returns:
        ``(model_name, lang)``, or ``(None, None)`` (after printing a warning)
        when the key contains no underscore.
    """
    full_name, separator, lang = key.rpartition("_")
    if not separator:
        print(f"Warning: Unable to parse model/lang from key '{key}'")
        return None, None

    short_names = {
        'afriberta_large': 'afriberta',
        'xlm-roberta-base': 'xlmr',
        'serengeti-E250': 'serengeti'
    }
    return short_names.get(full_name, full_name), lang

def log_metrics_to_wandb(report, h_loss, subset_acc):
    """Log per-label and macro-averaged metrics of one evaluation to W&B.

    All values are sent in a single ``wandb.log`` call so that one evaluation
    produces one history step rather than a separate step per label.

    Args:
        report: ``classification_report(..., output_dict=True)`` style dict,
            keyed by label name plus ``"macro avg"``. Missing entries are
            logged as 0.0.
        h_loss: Hamming loss of the evaluation.
        subset_acc: Subset (exact-match) accuracy of the evaluation.
    """
    payload = {}
    for label in label_names:
        metrics = report.get(label, {})
        payload[f"{label}_precision"] = metrics.get("precision", 0.0)
        payload[f"{label}_recall"] = metrics.get("recall", 0.0)
        payload[f"{label}_f1"] = metrics.get("f1-score", 0.0)

    macro = report.get("macro avg", {})
    payload.update({
        "macro_precision": macro.get("precision", 0.0),
        "macro_recall": macro.get("recall", 0.0),
        "macro_f1": macro.get("f1-score", 0.0),
        "hamming_loss": h_loss,
        "subset_accuracy": subset_acc
    })

    wandb.log(payload)

def log_confusion_matrices(y_true, y_pred_binary, model_name, lang):
    """Plot, save and log one 2x2 confusion matrix per emotion label.

    Each heatmap is written to ``output_dir`` as
    ``<model_name>_<lang>_cm_<label>.png`` and logged to the active W&B run
    under ``confusion_matrix_<label>``.

    Args:
        y_true: Ground-truth indicator matrix.
        y_pred_binary: Binary prediction matrix aligned with ``y_true``.
        model_name: Short model name used in the file name.
        lang: Language name used in the file name.
    """
    matrices = multilabel_confusion_matrix(y_true, y_pred_binary)
    n_matrices = matrices.shape[0]
    tick_positions = [0.5, 1.5]
    tick_names = ['Negative', 'Positive']

    for position, label in enumerate(label_names):
        # Guard against fewer label columns in the data than names in label_names.
        if position >= n_matrices:
            print(f"Skipping confusion matrix for '{label}' (index out of range)")
            continue

        fig, ax = plt.subplots(figsize=(4, 3))
        sns.heatmap(matrices[position], annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
        ax.set(title=f'Confusion Matrix: {label}', xlabel='Predicted', ylabel='True')
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(tick_names)
        ax.set_yticklabels(tick_names)

        file_path = os.path.join(output_dir, f"{model_name}_{lang}_cm_{label}.png")
        plt.savefig(file_path)
        wandb.log({f"confusion_matrix_{label}": wandb.Image(fig)})
        # Release the figure; many are created when looping over models and labels.
        plt.close(fig)

def process_results(results):
    """Compute and log evaluation metrics for every stored prediction set.

    For each ``results`` entry a W&B run is opened, a classification report,
    Hamming loss and subset accuracy are computed and logged, and per-label
    confusion matrices are saved and logged. A failure while processing one
    entry is printed and does not stop the remaining entries.

    Args:
        results: Mapping from ``<model>_<language>`` key to a dict with
            ``'true'``, ``'pred'`` and optionally ``'threshold'`` (default 0.5).
    """
    if not results:
        print("Warning: 'results' dictionary is empty.")
        return

    for key, value in results.items():
        model_name, lang = parse_model_and_lang(key)
        if not (model_name and lang):
            continue

        y_true = np.asarray(value['true'])
        y_pred = np.asarray(value['pred'])
        threshold = value.get("threshold", 0.5)
        y_pred_binary = (y_pred >= threshold).astype(int)

        print(f"\nProcessing: {key} (Model: {model_name}, Lang: {lang}, Threshold: {threshold})")

        run_config = {"model": model_name, "language": lang, "threshold": threshold, **ablation_flags}
        try:
            wandb.init(
                project="bantu-emotion-multilabel",
                name=f"{model_name}_{lang}_eval",
                config=run_config,
                reinit=True
            )

            report = classification_report(
                y_true,
                y_pred_binary,
                target_names=label_names,
                output_dict=True,
                zero_division=0
            )
            h_loss = hamming_loss(y_true, y_pred_binary)
            subset_acc = accuracy_score(y_true, y_pred_binary)

            log_metrics_to_wandb(report, h_loss, subset_acc)
            log_confusion_matrices(y_true, y_pred_binary, model_name, lang)

        except Exception as e:
            # Best effort: report the failure and carry on with the next entry.
            print(f"[ERROR] Processing failed for {key}: {e}")
        finally:
            if wandb.run:
                wandb.finish()


process_results(results)


Processing: afriberta_large_zulu (Model: afriberta, Lang: zulu, Threshold: 0.5)


0,1
eval/f1_macro,▁▆▁▅█▆▁▆▁▆▁▅█▆▁▆▁▆▁▅█▆▁▆▁▆▁▅█▆▁▆▁▆▁▅█▆▁▁
eval/f1_micro,▁█▁▃▆▄▁█▁█▁▃▆▄▁█▁█▁▃▆▄▁█▁█▁▃▆▄▁█▁█▁▃▆▄▁▁
eval/hamming_loss,▁▂▁█▅▇▁▂▁▂▁█▅▇▁▂▁▂▁█▅▇▁▂▁▂▁█▅▇▁▂▁▂▁█▅▇▁▁
eval/loss,▁▃▂█▆▆▁▃▁▃▂█▆▆▁▃▁▃▂█▆▆▁▃▁▃▂█▆▆▁▃▁▃▂█▆▆▁▂
eval/model_preparation_time,▁▄▁▁▃▁▁▁▃▃▂▄▅▁▁▁▂▁▂▂▁▃▂▁▁▁▂▂▁▂▂▁█▂▂█▄▂▁▁
eval/roc_auc,▅█▃▃█▁▅█▅█▃▃█▁▅█▅█▃▃█▁▅█▅█▃▃█▁▅█▅█▃▃█▁▅▃
eval/runtime,▅▂▇▅▂█▃▁▅▂▇▅▂▇▃▁▅▂▇▅▂█▄▁▆▂▇▅▂█▄▁▆▂█▅▂█▄▇
eval/samples_per_second,▂▄▃▂▅▃▄█▂▄▃▂▅▃▄█▁▄▃▂▅▂▃▇▁▄▃▂▅▂▃▇▁▃▃▂▅▂▃▃
eval/steps_per_second,▂▄▃▂▅▃▄█▂▄▃▂▅▃▄█▁▄▃▂▅▂▃▇▁▄▃▂▅▂▃▇▁▃▃▂▅▂▃▃
eval/subset_accuracy,█▂▆▁▁▁█▁▆█▂▆▁▁▁█▁▆█▂▆▁▁▁█▁▆█▂▆▁▁▁█▁▆

0,1
eval/f1_macro,0.0
eval/f1_micro,0.0
eval/hamming_loss,0.09974
eval/loss,0.31335
eval/model_preparation_time,0.0026
eval/roc_auc,0.63974
eval/runtime,12.4598
eval/samples_per_second,265.815
eval/steps_per_second,16.613
eval/subset_accuracy,0.44444


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.0
anger_precision,0.0
anger_recall,0.0
disgust_f1,0.0
disgust_precision,0.0
disgust_recall,0.0
fear_f1,0.0
fear_precision,0.0
fear_recall,0.0
hamming_loss,0.10886



Processing: afriberta_large_xhosa (Model: afriberta, Lang: xhosa, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.0
anger_precision,0.0
anger_recall,0.0
disgust_f1,0.0
disgust_precision,0.0
disgust_recall,0.0
fear_f1,0.0
fear_precision,0.0
fear_recall,0.0
hamming_loss,0.2523



Processing: afriberta_large_swahili (Model: afriberta, Lang: swahili, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.0
anger_precision,0.0
anger_recall,0.0
disgust_f1,0.0
disgust_precision,0.0
disgust_recall,0.0
fear_f1,0.0
fear_precision,0.0
fear_recall,0.0
hamming_loss,0.11242



Processing: xlm-roberta-base_zulu (Model: xlmr, Lang: zulu, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.15086
anger_precision,0.08158
anger_recall,1.0
disgust_f1,0.05603
disgust_precision,0.02882
disgust_recall,1.0
fear_f1,0.06339
fear_precision,0.03273
fear_recall,1.0
hamming_loss,0.91695



Processing: xlm-roberta-base_xhosa (Model: xlmr, Lang: xhosa, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.07952
anger_precision,0.04141
anger_recall,1.0
disgust_f1,0.00999
disgust_precision,0.00502
disgust_recall,1.0
fear_f1,0.03815
fear_precision,0.01945
fear_recall,1.0
hamming_loss,0.84191



Processing: xlm-roberta-base_swahili (Model: xlmr, Lang: swahili, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.17219
anger_precision,0.09426
anger_recall,0.99363
disgust_f1,0.13626
disgust_precision,0.07311
disgust_recall,1.0
fear_f1,0.0552
fear_precision,0.02838
fear_recall,1.0
hamming_loss,0.90026



Processing: serengeti-E250_zulu (Model: serengeti, Lang: zulu, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.0
anger_precision,0.0
anger_recall,0.0
disgust_f1,0.0
disgust_precision,0.0
disgust_recall,0.0
fear_f1,0.0
fear_precision,0.0
fear_recall,0.0
hamming_loss,0.11358



Processing: serengeti-E250_xhosa (Model: serengeti, Lang: xhosa, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.0
anger_precision,0.0
anger_recall,0.0
disgust_f1,0.0
disgust_precision,0.0
disgust_recall,0.0
fear_f1,0.0
fear_precision,0.0
fear_recall,0.0
hamming_loss,0.25157



Processing: serengeti-E250_swahili (Model: serengeti, Lang: swahili, Threshold: 0.5)


0,1
anger_f1,▁
anger_precision,▁
anger_recall,▁
disgust_f1,▁
disgust_precision,▁
disgust_recall,▁
fear_f1,▁
fear_precision,▁
fear_recall,▁
hamming_loss,▁

0,1
anger_f1,0.0
anger_precision,0.0
anger_recall,0.0
disgust_f1,0.0
disgust_precision,0.0
disgust_recall,0.0
fear_f1,0.0
fear_precision,0.0
fear_recall,0.0
hamming_loss,0.09974


# Evaluate and Compare Models

In [None]:

from transformers import Trainer, TrainingArguments
import pandas as pd
import numpy as np
import torch
import os
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss


def multi_label_metrics(predictions, labels, threshold=0.3):
    """Compute multi-label classification metrics from raw logits.

    Reports the same metric set as the identically named function defined
    earlier in this notebook, so whichever definition is live yields the same
    keys (this redefinition previously dropped ``subset_accuracy``).

    Args:
        predictions: Logits as an array-like, or a tuple whose first element
            holds the logits (some models make the Trainer hand over a tuple).
        labels: Ground-truth indicator matrix aligned with the logits.
        threshold: Sigmoid probability at or above which a label is predicted.

    Returns:
        Dict with ``f1_micro``, ``f1_macro``, ``roc_auc`` (micro-averaged, on
        the probabilities), ``hamming_loss`` and ``subset_accuracy``.
    """
    # Imported here because this cell's import line does not bring it in.
    from sklearn.metrics import accuracy_score

    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    prob_tensor = torch.sigmoid(torch.tensor(logits))

    # Hand sklearn plain NumPy arrays instead of relying on implicit tensor conversion.
    probs = prob_tensor.numpy()
    preds = (prob_tensor >= threshold).numpy().astype(float)
    true = labels

    # zero_division=0 keeps the score at 0.0 for labels that are never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    f1_micro = f1_score(true, preds, average='micro', zero_division=0)
    f1_macro = f1_score(true, preds, average='macro', zero_division=0)
    roc_auc = roc_auc_score(true, probs, average='micro')
    hamming = hamming_loss(true, preds)
    subset_accuracy = accuracy_score(true, preds)

    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'roc_auc': roc_auc,
        'hamming_loss': hamming,
        'subset_accuracy': subset_accuracy
    }


class EvaluationDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenizer encodings together with float label tensors.

    ``encodings`` maps field names to per-sample sequences and ``labels`` holds
    one label vector per sample; the number of label rows is the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        fields = [(name, torch.tensor(column[idx])) for name, column in self.encodings.items()]
        item = dict(fields)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


def evaluate_models(model_configs):
    """Evaluate every configured model on the test set of its language.

    For each key of ``model_configs`` (expected form ``<model>_<language>``)
    the language's test data is loaded, tokenised with ``tokenizers[key]`` and
    scored with a ``Trainer`` wrapping ``models[key]``. The configured path
    value itself is not used here.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name that writes to the same ``evaluation_results.csv`` and also
    records 'Subset Accuracy'; running this cell overwrites that file without
    the column — confirm which version should be kept.

    Args:
        model_configs: Mapping whose keys identify the models to evaluate.

    Returns:
        DataFrame with one row per model; the same table is written to CSV.
    """
    # Results-table column -> key in the dict returned by Trainer.evaluate.
    column_to_metric = {
        'Loss': 'eval_loss',
        'F1 Micro': 'eval_f1_micro',
        'F1 Macro': 'eval_f1_macro',
        'ROC AUC': 'eval_roc_auc',
        'Hamming Loss': 'eval_hamming_loss',
        'Runtime': 'eval_runtime',
        'Samples Per Second': 'eval_samples_per_second',
        'Steps Per Second': 'eval_steps_per_second',
    }
    rule = '=' * 50
    collected = []

    for key, model_path in model_configs.items():
        language = key.split('_')[-1]

        print(f"\n{rule}")
        print(f"Evaluating {key} on {language} test set")
        print(f"{rule}")

        _, test_df = load_and_preprocess_data(language)
        test_data = preprocess_for_model(test_df, language)

        tokenizer = tokenizers[key]
        test_encodings = tokenizer(
            test_data['text'],
            truncation=True,
            padding=True,
            max_length=512
        )
        test_dataset = EvaluationDataset(test_encodings, test_data['labels'])

        model = models[key]
        eval_args = TrainingArguments(
            output_dir="./tmp_eval",
            per_device_eval_batch_size=16,
            disable_tqdm=False
        )
        trainer = Trainer(
            model=model,
            args=eval_args,
            compute_metrics=lambda p: multi_label_metrics(p.predictions, p.label_ids)
        )
        metrics = trainer.evaluate(test_dataset)

        result_entry = {
            'Model': key,
            'Language': language,
            **{column: metrics[name] for column, name in column_to_metric.items()},
        }
        collected.append(result_entry)

        print(f"\nEvaluation results for {key}:")
        for field, value in result_entry.items():
            if isinstance(value, float):
                value = f"{value:.6f}"
            print(f"{field}: {value}")

    results_df = pd.DataFrame(collected)

    results_path = '/content/drive/Shareddrives/COS 760 Group 13 Project/Results/evaluation_results.csv'
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    results_df.to_csv(results_path, index=False)
    print(f"\nSaved evaluation results to {results_path}")

    return results_df


evaluation_results = evaluate_models(model_configs)


from IPython.display import display
display(evaluation_results)


Evaluating afriberta_large_zulu on zulu test set



Evaluation results for afriberta_large_zulu:
Model: afriberta_large_zulu
Language: zulu
Loss: 0.264422
F1 Micro: 0.001953
F1 Macro: 0.000782
ROC AUC: 0.707974
Hamming Loss: 0.083211
Runtime: 9.168700
Samples Per Second: 223.260000
Steps Per Second: 13.961000

Evaluating afriberta_large_xhosa on xhosa test set



Evaluation results for afriberta_large_xhosa:
Model: afriberta_large_xhosa
Language: xhosa
Loss: 0.341455
F1 Micro: 0.504813
F1 Macro: 0.181740
ROC AUC: 0.824064
Hamming Loss: 0.220514
Runtime: 5.064300
Samples Per Second: 314.750000
Steps Per Second: 19.746000

Evaluating afriberta_large_swahili on swahili test set



Evaluation results for afriberta_large_swahili:
Model: afriberta_large_swahili
Language: swahili
Loss: 0.310652
F1 Micro: 0.005988
F1 Macro: 0.004132
ROC AUC: 0.648552
Hamming Loss: 0.100242
Runtime: 11.774100
Samples Per Second: 281.296000
Steps Per Second: 17.581000

Evaluating xlm-roberta-base_zulu on zulu test set



Evaluation results for xlm-roberta-base_zulu:
Model: xlm-roberta-base_zulu
Language: zulu
Loss: 0.563618
F1 Micro: 0.153360
F1 Macro: 0.148114
ROC AUC: 0.630767
Hamming Loss: 0.916952
Runtime: 8.668200
Samples Per Second: 236.152000
Steps Per Second: 14.767000

Evaluating xlm-roberta-base_xhosa on xhosa test set



Evaluation results for xlm-roberta-base_xhosa:
Model: xlm-roberta-base_xhosa
Language: xhosa
Loss: 0.492806
F1 Micro: 0.335648
F1 Macro: 0.240978
ROC AUC: 0.813303
Hamming Loss: 0.610100
Runtime: 4.629500
Samples Per Second: 344.310000
Steps Per Second: 21.600000

Evaluating xlm-roberta-base_swahili on swahili test set



Evaluation results for xlm-roberta-base_swahili:
Model: xlm-roberta-base_swahili
Language: swahili
Loss: 0.493953
F1 Micro: 0.187383
F1 Macro: 0.155235
ROC AUC: 0.563612
Hamming Loss: 0.742854
Runtime: 12.708400
Samples Per Second: 260.615000
Steps Per Second: 16.288000

Evaluating serengeti-E250_zulu on zulu test set



Evaluation results for serengeti-E250_zulu:
Model: serengeti-E250_zulu
Language: zulu
Loss: 0.266396
F1 Micro: 0.000000
F1 Macro: 0.000000
ROC AUC: 0.708950
Hamming Loss: 0.083048
Runtime: 7.088700
Samples Per Second: 288.768000
Steps Per Second: 18.057000

Evaluating serengeti-E250_xhosa on xhosa test set



Evaluation results for serengeti-E250_xhosa:
Model: serengeti-E250_xhosa
Language: xhosa
Loss: 0.354824
F1 Micro: 0.492480
F1 Macro: 0.177016
ROC AUC: 0.827612
Hamming Loss: 0.246968
Runtime: 3.675500
Samples Per Second: 433.681000
Steps Per Second: 27.207000

Evaluating serengeti-E250_swahili on swahili test set



Evaluation results for serengeti-E250_swahili:
Model: serengeti-E250_swahili
Language: swahili
Loss: 0.313354
F1 Micro: 0.000000
F1 Macro: 0.000000
ROC AUC: 0.639736
Hamming Loss: 0.099738
Runtime: 11.759400
Samples Per Second: 281.647000
Steps Per Second: 17.603000

Saved evaluation results to /content/drive/Shareddrives/COS 760 Group 13 Project/Results/evaluation_results.csv


Unnamed: 0,Model,Language,Loss,F1 Micro,F1 Macro,ROC AUC,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
0,afriberta_large_zulu,zulu,0.264422,0.001953,0.000782,0.707974,0.083211,9.1687,223.26,13.961
1,afriberta_large_xhosa,xhosa,0.341455,0.504813,0.18174,0.824064,0.220514,5.0643,314.75,19.746
2,afriberta_large_swahili,swahili,0.310652,0.005988,0.004132,0.648552,0.100242,11.7741,281.296,17.581
3,xlm-roberta-base_zulu,zulu,0.563618,0.15336,0.148114,0.630767,0.916952,8.6682,236.152,14.767
4,xlm-roberta-base_xhosa,xhosa,0.492806,0.335648,0.240978,0.813303,0.6101,4.6295,344.31,21.6
5,xlm-roberta-base_swahili,swahili,0.493953,0.187383,0.155235,0.563612,0.742854,12.7084,260.615,16.288
6,serengeti-E250_zulu,zulu,0.266396,0.0,0.0,0.70895,0.083048,7.0887,288.768,18.057
7,serengeti-E250_xhosa,xhosa,0.354824,0.49248,0.177016,0.827612,0.246968,3.6755,433.681,27.207
8,serengeti-E250_swahili,swahili,0.313354,0.0,0.0,0.639736,0.099738,11.7594,281.647,17.603
