In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
"""
üéØ TELUGU SENTIMENT ANALYSIS ‚Äî STACKED ENSEMBLE
Models: xlm-roberta-base, microsoft/deberta-v3-base, google/muril-base-cased

‚úÖ Trains 3 models individually
‚úÖ Collects test predictions (probabilities)
‚úÖ Builds stacking meta-learner (Logistic Regression)
‚úÖ Generates classification report & confusion matrix for the stacked ensemble
"""

# =============================================================================
# IMPORTS
# =============================================================================
import os, time, gc, pickle, warnings
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

warnings.filterwarnings("ignore")

# =============================================================================
# CONFIG
# =============================================================================
MODELS = [
    "xlm-roberta-base",
    "microsoft/deberta-v3-base",
    "google/muril-base-cased"
]

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CONFIG = {
    "batch_size": 16,
    "max_length": 192,
    "learning_rate": 2e-5,
    "num_epochs": 2,
    "mixed_precision": "fp16" if torch.cuda.is_available() else None
}

print(f"‚úÖ Using device: {DEVICE}")

# =============================================================================
# DATA PROCESSOR (PRE-SPLIT)
# =============================================================================
class TeluguSentimentProcessor:
    def __init__(self, train_path, val_path, test_path):
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.label_encoder = LabelEncoder()

    def load_and_process_dataset(self):
        def load_file(path):
            for enc in ["utf-8", "utf-8-sig", "latin1"]:
                try:
                    return pd.read_csv(path, encoding=enc)
                except:
                    continue
            raise ValueError(f"‚ùå Could not load {path}")

        print("\nüìä Loading Telugu Sentiment Dataset...")
        df_train = load_file(self.train_path)
        df_val = load_file(self.val_path)
        df_test = load_file(self.test_path)

        text_col = next((c for c in ["Text", "Sentence", "text", "sentence", "content"] if c in df_train.columns), df_train.columns[0])
        label_col = next((c for c in ["Sentiment", "label", "Label", "sentiment"] if c in df_train.columns), df_train.columns[-1])
        print(f"‚úÖ Text: {text_col}, Label: {label_col}")

        all_data = pd.concat([df_train, df_val, df_test])
        self.label_encoder.fit(all_data[label_col].astype(str))

        for df in [df_train, df_val, df_test]:
            df["sentiment_encoded"] = self.label_encoder.transform(df[label_col].astype(str))

        self.data = {
            "train": (df_train[text_col].values, df_train["sentiment_encoded"].values),
            "val": (df_val[text_col].values, df_val["sentiment_encoded"].values),
            "test": (df_test[text_col].values, df_test["sentiment_encoded"].values)
        }
        print(f"üìä Train={len(df_train)}, Val={len(df_val)}, Test={len(df_test)}")
        return self.data, self.label_encoder

# =============================================================================
# DATASET CLASS
# =============================================================================
class TeluguDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }

# =============================================================================
# METRICS
# =============================================================================
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    return {"accuracy": acc, "f1": f1, "precision": prec, "recall": rec}

# =============================================================================
# TRAIN SINGLE MODEL
# =============================================================================
def train_single_model(model_name, train_data, val_data, test_data, label_encoder):
    print(f"\nüöÄ Training {model_name} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_)).to(DEVICE)

    train_ds = TeluguDataset(train_data[0], train_data[1], tokenizer, CONFIG["max_length"])
    val_ds = TeluguDataset(val_data[0], val_data[1], tokenizer, CONFIG["max_length"])
    test_ds = TeluguDataset(test_data[0], test_data[1], tokenizer, CONFIG["max_length"])

    args = TrainingArguments(
        output_dir=f"./{model_name.replace('/', '_')}",
        num_train_epochs=CONFIG["num_epochs"],
        per_device_train_batch_size=CONFIG["batch_size"],
        per_device_eval_batch_size=CONFIG["batch_size"],
        learning_rate=CONFIG["learning_rate"],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_steps=100,
        fp16=(CONFIG["mixed_precision"] == "fp16"),
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()

    preds = trainer.predict(test_ds).predictions
    probs = torch.softmax(torch.tensor(preds), dim=-1).numpy()
    labels = test_data[1]
    pred_classes = np.argmax(probs, axis=1)

    # Save individual model predictions
    results = {
        "model_name": model_name,
        "predictions_proba": probs,
        "true_labels": labels,
        "predicted_labels": pred_classes
    }
    return results

# =============================================================================
# ENSEMBLE STACKING
# =============================================================================
def stack_models(models_outputs, label_encoder):
    print("\nüéØ Building Stacking Meta-Learner...")
    all_probs = [m["predictions_proba"] for m in models_outputs]
    y_true = models_outputs[0]["true_labels"]

    # Meta features (concatenate model probabilities)
    X_meta = np.concatenate(all_probs, axis=1)
    y_meta = np.array(y_true)

    meta_learner = LogisticRegression(max_iter=1000, random_state=42)
    meta_learner.fit(X_meta, y_meta)
    y_pred_meta = meta_learner.predict(X_meta)

    report = classification_report(y_meta, y_pred_meta, target_names=label_encoder.classes_)
    print("\nüìã Classification Report for Stacked Ensemble:\n", report)

    cm = confusion_matrix(y_meta, y_pred_meta)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title("Confusion Matrix ‚Äî Stacked Ensemble")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig("stacked_confusion_matrix.png")
    plt.close()

    acc = accuracy_score(y_meta, y_pred_meta)
    prec, rec, f1, _ = precision_recall_fscore_support(y_meta, y_pred_meta, average="weighted", zero_division=0)

    ensemble_results = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "classification_report": report
    }

    with open("stacked_ensemble_results.pkl", "wb") as f:
        pickle.dump(ensemble_results, f)

    print(f"\nüíæ Saved stacked ensemble ‚Üí stacked_ensemble_results.pkl")
    print(f"‚úÖ F1={f1:.4f}, Acc={acc:.4f}, Prec={prec:.4f}, Recall={rec:.4f}")

# =============================================================================
# MAIN
# =============================================================================
def main():
    processor = TeluguSentimentProcessor(
        "/kaggle/input/nlpdataset/train.csv",
        "/kaggle/input/nlpdataset/val.csv",
        "/kaggle/input/nlpdataset/test.csv"
    )
    data, label_encoder = processor.load_and_process_dataset()

    model_outputs = []
    for model_name in MODELS:
        result = train_single_model(
            model_name,
            data["train"],
            data["val"],
            data["test"],
            label_encoder
        )
        model_outputs.append(result)
        gc.collect()
        torch.cuda.empty_cache()

    stack_models(model_outputs, label_encoder)

if __name__ == "__main__":
    main()


‚úÖ Using device: cuda

üìä Loading Telugu Sentiment Dataset...
‚úÖ Text: Sentence, Label: Sentiment
üìä Train=19464, Val=2433, Test=2434

üöÄ Training xlm-roberta-base ...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7225,0.67716,0.7127,0.706316,0.716124,0.7127
2,0.6116,0.653787,0.727908,0.72657,0.726608,0.727908



üöÄ Training microsoft/deberta-v3-base ...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.035,1.002315,0.516646,0.411906,0.386575,0.516646
2,0.9583,0.961414,0.541718,0.453176,0.396346,0.541718



üöÄ Training google/muril-base-cased ...


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8341,0.804212,0.72462,0.722749,0.723944,0.72462
2,0.728,0.731517,0.735306,0.732593,0.734862,0.735306



üéØ Building Stacking Meta-Learner...

üìã Classification Report for Stacked Ensemble:
               precision    recall  f1-score   support

         neg       0.78      0.78      0.78       612
     neutral       0.74      0.79      0.77      1175
         pos       0.73      0.63      0.67       647

    accuracy                           0.75      2434
   macro avg       0.75      0.74      0.74      2434
weighted avg       0.75      0.75      0.75      2434


üíæ Saved stacked ensemble ‚Üí stacked_ensemble_results.pkl
‚úÖ F1=0.7456, Acc=0.7473, Prec=0.7467, Recall=0.7473
