In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from scipy.optimize import minimize
from datasets import Dataset
import os
import gc
import matplotlib.pyplot as plt
import random

In [None]:
# --- Backbone and tokenisation -------------------------------------------
MODEL_NAME = "xlm-roberta-large"
MAXIMUM_SEQUENCE_LENGTH = 160  # used as the tokenizer's max_length (pad + truncate)

# --- Optimisation schedule -----------------------------------------------
LEARNING_RATE = 1e-5
NUMBER_OF_EPOCHS = 5
BATCH_SIZE = 32  # per-device train batch size; evaluation uses twice this value

# --- Class-imbalance handling --------------------------------------------
FOCAL_LOSS_GAMMA = 2.0  # exponent of the (1 - p_t) modulating factor in the focal loss
MINIMUM_SAMPLES_FOR_OVERSAMPLING = 250  # classes below this count are upsampled per fold

# --- Cross-validation and reproducibility --------------------------------
NUMBER_OF_FOLDS = 5
GLOBAL_RANDOM_SEED = 42

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_computation_target = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
print(f"Using device: {device_computation_target}")

Using device: cuda


In [None]:
def set_deterministic_seed(seed_value):
    """Seed every random number generator used in this notebook.

    Args:
        seed_value: Integer seed handed to Python's ``random``, NumPy, PyTorch
            (CPU and all CUDA devices) and the ``set_seed`` helper imported
            from ``transformers``.

    Also switches cuDNN to deterministic mode and disables its benchmark
    auto-tuner.
    """
    # cuDNN flags are independent of the seeding calls below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

In [None]:
class FocalLossTrainer(Trainer):
    """Trainer variant that optimises a (optionally class-weighted) focal loss.

    Per-class weights are read from ``self.class_weights`` when that attribute
    has been set on the instance; it must be a 1-D tensor indexable by class
    id. The focusing exponent is the module-level ``FOCAL_LOSS_GAMMA``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the mean focal loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``"logits"``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar mean focal loss, or ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        flat_logits = logits.view(-1, self.model.config.num_labels)
        flat_labels = labels.view(-1)

        # The cross-entropy must be UNWEIGHTED here: only then is
        # exp(-CE) equal to the probability of the true class. Passing the
        # class weights into cross_entropy would yield p_t ** w_t instead and
        # distort the (1 - p_t) ** gamma modulating factor.
        cross_entropy_loss = nn.functional.cross_entropy(
            flat_logits,
            flat_labels,
            reduction='none'
        )
        probability_of_true_class = torch.exp(-cross_entropy_loss)
        focal_loss = ((1 - probability_of_true_class) ** FOCAL_LOSS_GAMMA) * cross_entropy_loss

        class_weights = getattr(self, 'class_weights', None)
        if class_weights is not None:
            alpha_weights = class_weights.to(device=logits.device, dtype=focal_loss.dtype)
            # Apply the class weight as a separate per-sample alpha factor.
            # Labels equal to cross_entropy's ignore_index (-100) already have
            # zero loss; clamping only keeps the lookup index in range.
            focal_loss = focal_loss * alpha_weights[flat_labels.clamp(min=0)]

        mean_loss = focal_loss.mean()

        return (mean_loss, outputs) if return_outputs else mean_loss

In [None]:
def compute_evaluation_metrics(prediction_output):
    """Return the macro-averaged F1 of argmax predictions.

    Args:
        prediction_output: Object exposing ``predictions`` (per-class scores,
            arg-maxed over the last axis) and ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'macro_f1'``.
    """
    predicted_classes = prediction_output.predictions.argmax(-1)
    reference_classes = prediction_output.label_ids
    return {
        'macro_f1': f1_score(reference_classes, predicted_classes, average='macro')
    }

In [None]:
def oversample_minority_classes(dataframe, label_column_name, minimum_samples):
    """Upsample rare classes with replacement, then shuffle the result.

    Args:
        dataframe: Input frame; it is not modified.
        label_column_name: Name of the column holding the class label.
        minimum_samples: Every class with fewer rows than this receives
            ``minimum_samples - count`` extra rows drawn with replacement.

    Returns:
        A new, shuffled frame with a fresh 0..n-1 index containing the
        original rows plus the resampled ones. Sampling and shuffling both
        use ``GLOBAL_RANDOM_SEED``.
    """
    label_series = dataframe[label_column_name]
    pieces = [dataframe]

    for class_label, rows_present in label_series.value_counts().items():
        if rows_present >= minimum_samples:
            continue  # class is already large enough

        class_rows = dataframe[dataframe[label_column_name] == class_label]
        if len(class_rows) == 0:
            continue  # nothing to draw from

        pieces.append(
            class_rows.sample(
                n=minimum_samples - rows_present,
                replace=True,
                random_state=GLOBAL_RANDOM_SEED
            )
        )

    combined = pd.concat(pieces)
    shuffled = combined.sample(frac=1, random_state=GLOBAL_RANDOM_SEED)
    return shuffled.reset_index(drop=True)

In [None]:
def optimization_objective(weights, probabilities, true_labels):
    """Negative macro-F1 obtained after rescaling class probabilities.

    Args:
        weights: Per-class multipliers, broadcast against ``probabilities``.
        probabilities: Array of per-sample class scores (arg-maxed on axis 1).
        true_labels: Reference class ids.

    Returns:
        ``-macro_f1`` so that a minimiser maximises the F1 score.
    """
    chosen_classes = np.argmax(probabilities * weights, axis=1)
    return -f1_score(true_labels, chosen_classes, average='macro')

In [None]:
def find_best_class_weights(oof_probabilities, true_labels, num_classes):
    """Search per-class probability multipliers that maximise macro-F1.

    Runs SciPy's Powell method on ``optimization_objective``, starting from
    all-ones weights and constraining each weight to [0.01, 10.0].

    Args:
        oof_probabilities: Class probabilities to be rescaled.
        true_labels: Reference class ids matching ``oof_probabilities``.
        num_classes: Number of classes (length of the weight vector).

    Returns:
        The optimised weight vector (``result.x`` of the minimiser).
    """
    print("\nStarting Threshold/Weight Optimization...")

    search_result = minimize(
        optimization_objective,
        x0=np.ones(num_classes),
        args=(oof_probabilities, true_labels),
        method='Powell',
        bounds=[(0.01, 10.0) for _ in range(num_classes)],
        tol=1e-4,
    )

    # The objective is the negated F1, so flip the sign back for reporting.
    best_score = -search_result.fun
    best_weights = search_result.x
    print(f"Optimization Complete. Best Validation F1: {best_score:.4f}")
    print(f"Optimized Weights: {best_weights}")

    return best_weights

In [None]:
def execute_training_and_prediction_pipeline(target_column_name, training_dataframe, testing_dataframe,
                                             minimum_fold_score=0.45):
    """Train a stratified K-fold ensemble for one target and predict the test set.

    For every fold the training split is oversampled, a fresh ``MODEL_NAME``
    classifier is fine-tuned with ``FocalLossTrainer``, out-of-fold (OOF)
    probabilities are stored, and test probabilities are accumulated when the
    fold's validation macro-F1 exceeds ``minimum_fold_score``. Per-class
    probability multipliers are then fitted on the OOF probabilities and
    applied to the averaged test probabilities.

    Args:
        target_column_name: Column of ``training_dataframe`` to predict.
        training_dataframe: Frame with a ``'text'`` column and the target
            column. It is NOT modified; the encoded ``'label'`` column is
            added to an internal copy.
        testing_dataframe: Frame with a ``'text'`` column.
        minimum_fold_score: A fold contributes to the test ensemble only if
            its validation macro-F1 is strictly greater than this value.

    Returns:
        Predicted labels for ``testing_dataframe`` in the original label
        space (output of ``LabelEncoder.inverse_transform``).
    """
    print(f"\n{'='*30}\n Processing Target: {target_column_name}\n{'='*30}")

    label_encoder = LabelEncoder()
    # assign() builds a new frame, so the caller's dataframe is left untouched
    # (the function is called once per target on the same frame).
    training_dataframe = training_dataframe.assign(
        label=label_encoder.fit_transform(training_dataframe[target_column_name])
    )
    number_of_labels = len(label_encoder.classes_)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAXIMUM_SEQUENCE_LENGTH
        )

    testing_dataset_raw = Dataset.from_pandas(testing_dataframe[['text']])
    testing_dataset_encoded = testing_dataset_raw.map(tokenize_function, batched=True)

    stratified_k_fold = StratifiedKFold(
        n_splits=NUMBER_OF_FOLDS,
        shuffle=True,
        random_state=GLOBAL_RANDOM_SEED
    )

    accumulated_test_probabilities = np.zeros((len(testing_dataframe), number_of_labels))
    oof_probabilities = np.zeros((len(training_dataframe), number_of_labels))

    valid_folds_count = 0

    # Mixed precision is only usable on a GPU; on the CPU fallback fp16 must be off.
    use_fp16 = device_computation_target.type == "cuda"

    for fold_index, (train_indices, validation_indices) in enumerate(stratified_k_fold.split(training_dataframe, training_dataframe['label'])):
        print(f"\n--- Fold {fold_index + 1}/{NUMBER_OF_FOLDS} ---")

        training_fold_dataframe = training_dataframe.iloc[train_indices]
        validation_fold_dataframe = training_dataframe.iloc[validation_indices]

        # Oversampling is applied to the training split only, so the
        # validation split keeps its original class distribution.
        training_fold_dataframe = oversample_minority_classes(
            training_fold_dataframe,
            'label',
            MINIMUM_SAMPLES_FOR_OVERSAMPLING
        )

        # Build a weight vector with exactly one slot per encoded class so it
        # always lines up with the model's logits, even if a class happens to
        # be absent from this fold (absent classes keep a neutral weight of 1).
        present_classes = np.unique(training_fold_dataframe['label'])
        present_class_weights = compute_class_weight(
            class_weight='balanced',
            classes=present_classes,
            y=training_fold_dataframe['label']
        )
        class_weights_array = np.ones(number_of_labels, dtype=np.float64)
        class_weights_array[present_classes] = present_class_weights
        class_weights_tensor = torch.tensor(class_weights_array, dtype=torch.float)

        training_dataset_raw = Dataset.from_pandas(training_fold_dataframe[['text', 'label']])
        validation_dataset_raw = Dataset.from_pandas(validation_fold_dataframe[['text', 'label']])

        training_dataset_encoded = training_dataset_raw.map(tokenize_function, batched=True)
        validation_dataset_encoded = validation_dataset_raw.map(tokenize_function, batched=True)

        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=number_of_labels
        ).to(device_computation_target)

        training_arguments = TrainingArguments(
            output_dir=f"./results_{target_column_name}_fold{fold_index}",
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE * 2,
            num_train_epochs=NUMBER_OF_EPOCHS,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1",
            save_total_limit=1,
            fp16=use_fp16,
            report_to="none",
            label_smoothing_factor=0.0,
            seed=GLOBAL_RANDOM_SEED
        )

        trainer = FocalLossTrainer(
            model=model,
            args=training_arguments,
            train_dataset=training_dataset_encoded,
            eval_dataset=validation_dataset_encoded,
            compute_metrics=compute_evaluation_metrics,
        )
        trainer.class_weights = class_weights_tensor

        trainer.train()

        # One forward pass over the validation split yields both the fold
        # score and the OOF probabilities (instead of evaluate() + predict()).
        validation_output = trainer.predict(validation_dataset_encoded)
        fold_score = compute_evaluation_metrics(validation_output)['macro_f1']

        val_probs = torch.nn.functional.softmax(torch.tensor(validation_output.predictions), dim=-1).numpy()
        # OOF probabilities are stored for every fold, including discarded ones.
        oof_probabilities[validation_indices] = val_probs

        if fold_score > minimum_fold_score:
            print(f"Keeping Fold {fold_index + 1} (Score: {fold_score:.4f})")

            test_preds_output = trainer.predict(testing_dataset_encoded).predictions
            test_probs = torch.nn.functional.softmax(torch.tensor(test_preds_output), dim=-1).numpy()
            accumulated_test_probabilities += test_probs
            valid_folds_count += 1
        else:
            print(f"DISCARDING Fold {fold_index + 1} (Score: {fold_score:.4f}) - Model Collapsed")

        # Drop the Python references first, collect them, and only then ask
        # CUDA to release its cached blocks.
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()

    if valid_folds_count > 0:
        average_test_probabilities = accumulated_test_probabilities / valid_folds_count
    else:
        print("CRITICAL WARNING: All folds failed! Returning raw probabilities (likely garbage).")
        average_test_probabilities = accumulated_test_probabilities

    best_weights = find_best_class_weights(
        oof_probabilities,
        training_dataframe['label'].values,
        number_of_labels
    )

    weighted_test_probs = average_test_probabilities * best_weights

    final_prediction_indices = np.argmax(weighted_test_probs, axis=1)
    final_prediction_labels = label_encoder.inverse_transform(final_prediction_indices)

    return final_prediction_labels

In [None]:
# Fix all random seeds before any data handling or training happens.
set_deterministic_seed(GLOBAL_RANDOM_SEED)

# Load both splits and replace missing text with the empty string.
training_dataframe_main = pd.read_csv('train.csv')
training_dataframe_main['text'] = training_dataframe_main['text'].fillna("")

testing_dataframe_main = pd.read_csv('test.csv')
testing_dataframe_main['text'] = testing_dataframe_main['text'].fillna("")

# Run one full training/prediction pipeline per target column.
emotion_predictions = execute_training_and_prediction_pipeline(
    'emotion', training_dataframe_main, testing_dataframe_main
)
category_predictions = execute_training_and_prediction_pipeline(
    'category', training_dataframe_main, testing_dataframe_main
)

# The first column of the test file is written out as the 'index' column.
submission_dataframe = pd.DataFrame(
    {
        'index': testing_dataframe_main.iloc[:, 0],
        'emotion': emotion_predictions,
        'category': category_predictions,
    }
)

output_filename = 'submission_optimized_thresholds.csv'
submission_dataframe.to_csv(output_filename, index=False)
print(f"Done! Submission saved to {output_filename}")


 Processing Target: emotion


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/3474 [00:00<?, ? examples/s]


--- Fold 1/5 ---


Map:   0%|          | 0/7126 [00:00<?, ? examples/s]

Map:   0%|          | 0/1622 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.573034,0.24434
2,No log,0.292753,0.514022
3,0.740300,0.323399,0.556839
4,0.740300,0.349644,0.581423
5,0.082200,0.36182,0.593863


Keeping Fold 1 (Score: 0.5939)



--- Fold 2/5 ---


Map:   0%|          | 0/7126 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.665007,0.327189
2,No log,0.33285,0.46368
3,0.903600,0.356931,0.540717
4,0.903600,0.394805,0.538424
5,0.111000,0.419568,0.559048


Keeping Fold 2 (Score: 0.5590)



--- Fold 3/5 ---


Map:   0%|          | 0/7125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.536517,0.318802
2,No log,0.412633,0.476991
3,0.814800,0.40388,0.516641
4,0.814800,0.478059,0.578186
5,0.092500,0.486647,0.532656


Keeping Fold 3 (Score: 0.5782)



--- Fold 4/5 ---


Map:   0%|          | 0/7125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.421829,0.424875
2,No log,0.321204,0.569395
3,0.716100,0.346215,0.581121
4,0.716100,0.370794,0.609674
5,0.099200,0.417561,0.627521


Keeping Fold 4 (Score: 0.6275)



--- Fold 5/5 ---


Map:   0%|          | 0/7126 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.50785,0.379378
2,No log,0.38805,0.483142
3,0.775100,0.462507,0.450615
4,0.775100,0.447015,0.507351
5,0.095800,0.502628,0.495176


Keeping Fold 5 (Score: 0.5074)



Starting Threshold/Weight Optimization...
Optimization Complete. Best Validation F1: 0.6073
Optimized Weights: [3.26710038 2.34583049 6.04221147 6.40367413 2.02035235 3.49757959
 6.11166831]

 Processing Target: category


Map:   0%|          | 0/3474 [00:00<?, ? examples/s]


--- Fold 1/5 ---


Map:   0%|          | 0/6548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1622 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.400592,0.76068
2,No log,0.341687,0.69012
3,0.461200,0.382745,0.837269
4,0.461200,0.33321,0.819278
5,0.074900,0.389576,0.831121


Keeping Fold 1 (Score: 0.8373)



--- Fold 2/5 ---


Map:   0%|          | 0/6548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.48314,0.638966
2,No log,0.219416,0.755092
3,0.574700,0.299341,0.841068
4,0.574700,0.321795,0.835397
5,0.097600,0.33897,0.847207


Keeping Fold 2 (Score: 0.8472)



--- Fold 3/5 ---


Map:   0%|          | 0/6548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.40714,0.597293
2,No log,0.274255,0.793616
3,0.511400,0.296848,0.795416
4,0.511400,0.366268,0.822556
5,0.079300,0.388904,0.819399


Keeping Fold 3 (Score: 0.8226)



--- Fold 4/5 ---


Map:   0%|          | 0/6548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.282498,0.683272
2,No log,0.24957,0.764202
3,0.505100,0.261128,0.807539
4,0.505100,0.263803,0.808297
5,0.098600,0.336165,0.836149


Keeping Fold 4 (Score: 0.8361)



--- Fold 5/5 ---


Map:   0%|          | 0/6548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.284784,0.754644
2,No log,0.219607,0.741172
3,0.480700,0.228759,0.82425
4,0.480700,0.264579,0.837616
5,0.092100,0.289318,0.831136


Keeping Fold 5 (Score: 0.8376)



Starting Threshold/Weight Optimization...
Optimization Complete. Best Validation F1: 0.8558
Optimized Weights: [5.56787027 8.54260477 2.5496499  2.74324404 1.01610507]
Done! Submission saved to submission_optimized_thresholds.csv
