In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# =============================
# NLP Project: Emotion Classification (XLM-RoBERTa only)
# Kaggle-ready + GPU-safe
# =============================

!pip install -q transformers datasets scikit-learn torch

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pickle

# -----------------------------
# 1. Load Dataset (Updated paths)
# -----------------------------
# Kaggle dataset directory that holds the train / validation / test CSV files.
DATA_DIR = "/kaggle/input/nlpwadernew"
TEXT_COL, LABEL_COL = "Sentence", "Emotion"


def _load_split(filename):
    """Read one CSV split and return a clean two-column (Sentence, Emotion) frame.

    Only the two needed columns are parsed. Rows with a missing sentence or
    label are dropped (and reported), because they cannot be tokenized or
    label-encoded. The returned frame is an independent copy with a fresh
    0..n-1 index, so assigning to its columns later is safe.
    """
    frame = pd.read_csv(os.path.join(DATA_DIR, filename), usecols=[TEXT_COL, LABEL_COL])
    complete = frame.dropna(subset=[TEXT_COL, LABEL_COL])
    n_dropped = len(frame) - len(complete)
    if n_dropped:
        print(f"{filename}: dropped {n_dropped} row(s) with a missing sentence or label")
    # reset_index returns a new frame, which also detaches it from `frame`.
    complete = complete[[TEXT_COL, LABEL_COL]].reset_index(drop=True)
    complete[TEXT_COL] = complete[TEXT_COL].astype(str)
    return complete


train_df = _load_split("final_vader_filtered (1).csv")
val_df = _load_split("val.csv")
test_df = _load_split("test.csv")

# Encode Emotion labels as integers. The encoder is fitted on the training
# split only; transform() raises ValueError if val/test contain an unseen label.
le = LabelEncoder()
train_df[LABEL_COL] = le.fit_transform(train_df[LABEL_COL])
val_df[LABEL_COL] = le.transform(val_df[LABEL_COL])
test_df[LABEL_COL] = le.transform(test_df[LABEL_COL])
num_labels = len(le.classes_)

# Convert to HuggingFace Dataset (the pandas index carries no information here).
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# -----------------------------
# 2. Model Selection
# -----------------------------
model_name = "xlm-roberta-base"

# -----------------------------
# 3. Tokenization Function
# -----------------------------
def tokenize_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"Sentence"`` field of a batch with the given tokenizer.

    Args:
        examples: Mapping with a ``"Sentence"`` entry (as passed by
            ``Dataset.map(..., batched=True)``).
        tokenizer: Callable tokenizer; invoked with the sentences plus
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["Sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# -----------------------------
# 4. Train Function
# -----------------------------
def train_model(model_name, batch_size=4, fp16=True, seed=42):
    """Fine-tune a sequence-classification checkpoint and score it on the test split.

    Uses the module-level ``train_dataset``, ``val_dataset``, ``test_dataset``,
    ``num_labels`` and ``le`` (the fitted label encoder).

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``.
        batch_size: Per-device batch size for both training and evaluation.
        fp16: Request mixed-precision training. Only honoured when CUDA is
            available, since ``TrainingArguments`` rejects ``fp16=True`` on a
            CPU-only session.
        seed: Random seed applied before the model is created and forwarded
            to ``TrainingArguments``.

    Returns:
        dict with keys ``model_name``, ``tokenizer``, ``preds`` (predicted
        class ids for the test split), ``cm`` (confusion matrix), ``report``
        (``classification_report`` as a dict) and ``accuracy``.
    """
    # Function-scope import so this block does not depend on the import cell.
    from transformers import set_seed

    print(f"\n===== Training {model_name} =====")

    # Seed *before* building the model: the classification head is newly
    # initialised, so without this every run starts from different weights.
    set_seed(seed)

    torch.cuda.empty_cache()
    use_fp16 = bool(fp16) and torch.cuda.is_available()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def _prepare(split):
        """Tokenize one split, rename the label column and expose torch tensors."""
        encoded = split.map(lambda batch: tokenize_function(batch, tokenizer), batched=True)
        # Trainer looks for the target under the name "labels".
        encoded = encoded.rename_column("Emotion", "labels")
        encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        return encoded

    tokenized_train = _prepare(train_dataset)
    tokenized_val = _prepare(val_dataset)
    tokenized_test = _prepare(test_dataset)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.replace('/', '_')}",
        num_train_epochs=3,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="no",
        logging_strategy="steps",
        logging_steps=50,
        report_to="none",
        fp16=use_fp16,
        seed=seed,
    )

    # Metrics function
    def compute_metrics(eval_pred):
        """Return validation accuracy from the (logits, labels) pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {"accuracy": accuracy_score(labels, predictions)}

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
    )

    # Train
    trainer.train()

    # Predict on test set
    preds_output = trainer.predict(tokenized_test)
    preds = np.argmax(preds_output.predictions, axis=-1)
    # Score against the labels that went through the model, so predictions and
    # references are aligned regardless of the state of the global test_df.
    y_true = np.asarray(preds_output.label_ids)

    # Pin the label set explicitly: without `labels=`, a class that is absent
    # from the test data makes `target_names` mismatch and the report raise.
    label_ids = np.arange(num_labels)
    cm = confusion_matrix(y_true, preds, labels=label_ids)
    report = classification_report(
        y_true,
        preds,
        labels=label_ids,
        target_names=le.classes_,
        output_dict=True,
        zero_division=0,
    )
    acc = accuracy_score(y_true, preds)

    print(f"✅ {model_name} Accuracy: {acc:.4f}")
    print(f"Confusion Matrix:\n{cm}\n")

    return {
        "model_name": model_name,
        "tokenizer": tokenizer,
        "preds": preds,
        "cm": cm,
        "report": report,
        "accuracy": acc
    }

# -----------------------------
# 5. Train Only XLM-RoBERTa
# -----------------------------
result = train_model(model_name, batch_size=4, fp16=True)

# -----------------------------
# 6. Save Results
# -----------------------------
# File written to the working directory so it is kept as notebook output.
RESULTS_FILE = "emotion_xlm_roberta_results.pkl"

# The label encoder is stored next to the metrics so class ids can be mapped
# back to emotion names later. Note that `result` also contains the tokenizer
# object, so unpickling this file requires `transformers` to be importable.
all_results = {
    "label_encoder": le,
    "model_result": result
}

with open(RESULTS_FILE, "wb") as f:
    pickle.dump(all_results, f)

print(f"✅ Training complete! Results saved to {RESULTS_FILE}")



===== Training xlm-roberta-base =====


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/21202 [00:00<?, ? examples/s]

Map:   0%|          | 0/2433 [00:00<?, ? examples/s]

Map:   0%|          | 0/2434 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6325,0.717185,0.719688
2,0.6081,0.723329,0.743527
3,0.4485,0.781406,0.737361




✅ xlm-roberta-base Accuracy: 0.7391
Confusion Matrix:
[[  17    0    1    9   11]
 [   2    4    0    4    2]
 [   4    0  287   93   16]
 [   5    0  197 1090  180]
 [   9    2   15   85  401]]

✅ Training complete! Results saved to emotion_xlm_roberta_results.pkl


In [6]:
import pickle
import pandas as pd

# Load the results produced by the training cell (re-uploaded as a Kaggle dataset).
# NOTE: pickle.load executes code embedded in the file -- only load pickles you
# created yourself. Unpickling also needs scikit-learn and transformers to be
# importable, because the file contains the label encoder and the tokenizer.
SAVED_RESULTS_PATH = "/kaggle/input/pklxlm/emotion_xlm_roberta_results.pkl"
with open(SAVED_RESULTS_PATH, "rb") as f:
    results = pickle.load(f)

# Extract model result
model_result = results["model_result"]

# Get confusion matrix and classification report
cm = model_result["cm"]
report_dict = model_result["report"]
accuracy = model_result["accuracy"]
model_name = model_result["model_name"]

# Emotion names in class-id order, used to label the confusion matrix axes.
class_names = [str(name) for name in results["label_encoder"].classes_]

# Display Confusion Matrix (rows = true class, columns = predicted class)
print("✅ Model:", model_name)
print(f"Overall Accuracy: {accuracy:.4f}\n")
print("🔹 Confusion Matrix:")
if len(class_names) == len(cm):
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    cm_df.index.name = "true"
    cm_df.columns.name = "predicted"
else:
    # Matrix was built from a different label set; fall back to plain ids.
    cm_df = pd.DataFrame(cm)
print(cm_df)

# Display Classification Report (nicely formatted)
print("\n🔹 Classification Report:")
report_df = pd.DataFrame(report_dict).transpose()
print(report_df)


✅ Model: xlm-roberta-base
Overall Accuracy: 0.7391

🔹 Confusion Matrix:
    0  1    2     3    4
0  17  0    1     9   11
1   2  4    0     4    2
2   4  0  287    93   16
3   5  0  197  1090  180
4   9  2   15    85  401

🔹 Classification Report:
              precision    recall  f1-score      support
angry          0.459459  0.447368  0.453333    38.000000
fear           0.666667  0.333333  0.444444    12.000000
happy          0.574000  0.717500  0.637778   400.000000
no             0.850898  0.740489  0.791863  1472.000000
sad            0.657377  0.783203  0.714795   512.000000
accuracy       0.739113  0.739113  0.739113     0.739113
macro avg      0.641680  0.604379  0.608443  2434.000000
weighted avg   0.757666  0.739113  0.743332  2434.000000
