In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
"""
üéØ ULTIMATE TELUGU SENTIMENT ANALYSIS SYSTEM (PRE-SPLIT VERSION)
Uses train.csv, val.csv, and test.csv directly

‚úÖ 4 SOTA models (XLM-RoBERTa, IndicBERT, DeBERTa-v3, MuRIL)
‚úÖ Pre-split dataset (no splitting inside)
‚úÖ Classification report & confusion matrix for each model
‚úÖ Ensemble creation + PKL export
"""

# =============================================================================
# IMPORTS
# =============================================================================
import os, gc, time, warnings, pickle
from datetime import datetime
from typing import Dict, Any
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, AutoConfig
)
warnings.filterwarnings('ignore')

# =============================================================================
# MODEL LIST
# =============================================================================
TOP_4_MODELS = {
    'xlm-roberta-base': {'description': 'XLM-RoBERTa - Best multilingual', 'priority': 1, 'expected_f1': 0.903},
    'ai4bharat/indic-bert': {'description': 'IndicBERT - Telugu specialist', 'priority': 2, 'expected_f1': 0.870},
    'microsoft/deberta-v3-base': {'description': 'DeBERTa-v3 - Advanced', 'priority': 3, 'expected_f1': 0.880},
    'google/muril-base-cased': {'description': 'MuRIL - Indic multilingual', 'priority': 4, 'expected_f1': 0.850}
}

# =============================================================================
# DEVICE CONFIG
# =============================================================================
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CONFIG = {
    'device_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'device': DEVICE,
    'batch_size': 16,
    'max_length': 192,
    'learning_rate': 2e-5,
    'num_epochs': 3,
    'warmup_steps': 100,
    'mixed_precision': 'fp16' if torch.cuda.is_available() else None
}
print(f"‚úÖ Using device: {CONFIG['device_type']}")

# =============================================================================
# DATA PROCESSOR (PRE-SPLIT)
# =============================================================================
class TeluguSentimentProcessor:
    """Loads train.csv, val.csv, test.csv directly"""
    def __init__(self, train_path, val_path, test_path):
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.label_encoder = None
        self.data_splits = {}
        self.task_info = {}

    def load_and_process_dataset(self):
        def load_file(path):
            for enc in ['utf-8', 'utf-8-sig', 'latin1']:
                try:
                    return pd.read_csv(path, encoding=enc)
                except:
                    continue
            raise ValueError(f"Could not load {path}")

        print("üìä Loading pre-split datasets...")
        df_train = load_file(self.train_path)
        df_val = load_file(self.val_path)
        df_test = load_file(self.test_path)

        text_col = next((c for c in ['Text','Sentence','text','sentence','content'] if c in df_train.columns), df_train.columns[0])
        label_col = next((c for c in ['Sentiment','label','Label','sentiment'] if c in df_train.columns), df_train.columns[-1])
        print(f"‚úÖ Text column: {text_col}, Label column: {label_col}")

        all_data = pd.concat([df_train, df_val, df_test])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(all_data[label_col].astype(str))

        for df in [df_train, df_val, df_test]:
            df['sentiment_encoded'] = self.label_encoder.transform(df[label_col].astype(str))

        self.data_splits = {
            'train': {'texts': df_train[text_col].values, 'labels': df_train['sentiment_encoded'].values},
            'val': {'texts': df_val[text_col].values, 'labels': df_val['sentiment_encoded'].values},
            'test': {'texts': df_test[text_col].values, 'labels': df_test['sentiment_encoded'].values}
        }

        self.task_info = {
            'num_labels': len(self.label_encoder.classes_),
            'labels': list(self.label_encoder.classes_),
            'text_column': text_col,
            'sentiment_column': label_col,
            'total_samples': len(all_data)
        }

        print(f"üìä Dataset Sizes -> Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")
        print(f"üè∑Ô∏è Classes: {list(self.label_encoder.classes_)}")
        return self.task_info

# =============================================================================
# DATASET CLASS
# =============================================================================
class TeluguSentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels, self.tokenizer, self.max_length = texts, labels, tokenizer, max_length
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True, padding='max_length',
            max_length=self.max_length, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }

# =============================================================================
# METRICS
# =============================================================================
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    return {'accuracy': acc, 'f1': f1, 'precision': prec, 'recall': rec}

def evaluate_and_plot(model_name, labels, preds, label_encoder):
    """Generates classification report and confusion matrix"""
    report = classification_report(labels, preds, target_names=label_encoder.classes_)
    print(f"\nüìã Classification Report for {model_name}:\n{report}")

    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    img_name = f"{model_name.replace('/', '_')}_confusion_matrix.png"
    plt.savefig(img_name)
    plt.close()

    # Save classification report as pickle
    with open(f"{model_name.replace('/', '_')}_classification_report.pkl", "wb") as f:
        pickle.dump({'report': report, 'confusion_matrix': cm}, f)

# =============================================================================
# TRAINER
# =============================================================================
class UltimateTeluguTrainer:
    def __init__(self, processor):
        self.processor = processor
        self.trained_models = {}
        self.model_results = {}

    def train_single_model(self, model_name, info):
        print(f"\nüöÄ Training {model_name}: {info['description']}")
        start = time.time()
        splits = self.processor.data_splits
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=self.processor.task_info['num_labels']
        ).to(DEVICE)

        train_ds = TeluguSentimentDataset(splits['train']['texts'], splits['train']['labels'], tokenizer, CONFIG['max_length'])
        val_ds = TeluguSentimentDataset(splits['val']['texts'], splits['val']['labels'], tokenizer, CONFIG['max_length'])
        test_ds = TeluguSentimentDataset(splits['test']['texts'], splits['test']['labels'], tokenizer, CONFIG['max_length'])

        args = TrainingArguments(
            output_dir=f"./{model_name.replace('/', '_')}",
            num_train_epochs=CONFIG['num_epochs'],
            per_device_train_batch_size=CONFIG['batch_size'],
            per_device_eval_batch_size=CONFIG['batch_size'],
            learning_rate=CONFIG['learning_rate'],
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            logging_steps=100,
            fp16=(CONFIG['mixed_precision'] == 'fp16'),
            report_to=[]
        )

        trainer = Trainer(
            model=model, args=args,
            train_dataset=train_ds, eval_dataset=val_ds,
            compute_metrics=compute_metrics
        )

        trainer.train()
        eval_result = trainer.evaluate(test_ds)
        preds = np.argmax(trainer.predict(test_ds).predictions, axis=1)
        labels = splits['test']['labels']

        evaluate_and_plot(model_name, labels, preds, self.processor.label_encoder)

        result = {
            'model_name': model_name,
            'f1': eval_result['eval_f1'],
            'accuracy': eval_result['eval_accuracy'],
            'precision': eval_result['eval_precision'],
            'recall': eval_result['eval_recall'],
            'description': info['description']
        }

        self.model_results[model_name] = result
        self.trained_models[model_name] = model
        print(f"‚úÖ Done: F1={result['f1']:.4f}, Acc={result['accuracy']:.4f}, Time={time.time()-start:.1f}s")

    def train_all(self):
        for i, (name, info) in enumerate(sorted(TOP_4_MODELS.items(), key=lambda x:x[1]['priority']), 1):
            self.train_single_model(name, info)
        return self.model_results

# =============================================================================
# SAVE PKL
# =============================================================================
def save_results(trainer, processor, filename="telugu_sentiment_results.pkl"):
    package = {
        'models': trainer.model_results,
        'labels': processor.label_encoder.classes_,
        'task_info': processor.task_info
    }
    with open(filename, 'wb') as f:
        pickle.dump(package, f)
    print(f"\nüíæ Saved comprehensive PKL ‚Üí {filename}")

# =============================================================================
# MAIN
# =============================================================================
def main():
    processor = TeluguSentimentProcessor(
        train_path="/kaggle/input/nlpdataset/train.csv",
        val_path="/kaggle/input/nlpdataset/val.csv",
        test_path="/kaggle/input/nlpdataset/test.csv"
    )
    processor.load_and_process_dataset()

    trainer = UltimateTeluguTrainer(processor)
    results = trainer.train_all()

    save_results(trainer, processor)

    print("\nüìä Final Summary:")
    for m, r in results.items():
        print(f"{m:30s} | F1={r['f1']:.4f} | Acc={r['accuracy']:.4f}")

if __name__ == "__main__":
    main()


‚úÖ Using device: GPU
üìä Loading pre-split datasets...
‚úÖ Text column: Sentence, Label column: Sentiment
üìä Dataset Sizes -> Train: 19464, Val: 2433, Test: 2434
üè∑Ô∏è Classes: ['neg', 'neutral', 'pos']

üöÄ Training xlm-roberta-base: XLM-RoBERTa - Best multilingual


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7225,0.682624,0.712289,0.701181,0.720721,0.712289
2,0.6163,0.657043,0.725442,0.720801,0.724971,0.725442
3,0.5544,0.639537,0.742293,0.740936,0.740928,0.742293



üìã Classification Report for xlm-roberta-base:
              precision    recall  f1-score   support

         neg       0.76      0.80      0.78       612
     neutral       0.75      0.77      0.76      1175
         pos       0.70      0.65      0.67       647

    accuracy                           0.74      2434
   macro avg       0.74      0.74      0.74      2434
weighted avg       0.74      0.74      0.74      2434

‚úÖ Done: F1=0.7414, Acc=0.7424, Time=1645.8s

üöÄ Training ai4bharat/indic-bert: IndicBERT - Telugu specialist


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/ai4bharat/indic-bert.
401 Client Error. (Request ID: Root=1-690e1fae-40d55c712dee4506492336e3;03458ca7-154e-4310-a1fa-12ac7e91a935)

Cannot access gated repo for url https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json.
Access to model ai4bharat/indic-bert is restricted. You must have access to it and be authenticated to access it. Please log in.

In [4]:
"""
üéØ TELUGU SENTIMENT ANALYSIS ‚Äî SINGLE MODEL VERSION
Model: microsoft/deberta-v3-base

‚úÖ Uses pre-split train.csv, val.csv, test.csv
‚úÖ Generates classification report + confusion matrix
‚úÖ Saves results in a .pkl file
"""

# =============================================================================
# IMPORTS
# =============================================================================
import os, gc, time, warnings, pickle
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

warnings.filterwarnings('ignore')

# =============================================================================
# CONFIG
# =============================================================================
MODEL_NAME = "microsoft/deberta-v3-base"
MODEL_DESC = "DeBERTa-v3 - Advanced attention architecture"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CONFIG = {
    'device_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'device': DEVICE,
    'batch_size': 16,
    'max_length': 192,
    'learning_rate': 2e-5,
    'num_epochs': 3,
    'mixed_precision': 'fp16' if torch.cuda.is_available() else None
}

print(f"‚úÖ Using device: {CONFIG['device_type']}")
print(f"üöÄ Model: {MODEL_NAME} ‚Äî {MODEL_DESC}")

# =============================================================================
# DATA PROCESSOR (PRE-SPLIT)
# =============================================================================
class TeluguSentimentProcessor:
    def __init__(self, train_path, val_path, test_path):
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.label_encoder = None
        self.data_splits = {}
        self.task_info = {}

    def load_and_process_dataset(self):
        def load_file(path):
            for enc in ['utf-8', 'utf-8-sig', 'latin1']:
                try:
                    return pd.read_csv(path, encoding=enc)
                except:
                    continue
            raise ValueError(f"‚ùå Could not load {path}")

        print("\nüìä Loading Telugu Sentiment Data (Pre-split)...")
        df_train = load_file(self.train_path)
        df_val = load_file(self.val_path)
        df_test = load_file(self.test_path)

        # Detect text/label columns
        text_col = next((c for c in ['Text', 'Sentence', 'text', 'sentence', 'content'] if c in df_train.columns), df_train.columns[0])
        label_col = next((c for c in ['Sentiment', 'label', 'Label', 'sentiment'] if c in df_train.columns), df_train.columns[-1])
        print(f"‚úÖ Text column: {text_col}, Label column: {label_col}")

        # Combine all for encoding
        all_data = pd.concat([df_train, df_val, df_test])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(all_data[label_col].astype(str))

        # Encode labels
        for df in [df_train, df_val, df_test]:
            df['sentiment_encoded'] = self.label_encoder.transform(df[label_col].astype(str))

        # Store splits
        self.data_splits = {
            'train': {'texts': df_train[text_col].values, 'labels': df_train['sentiment_encoded'].values},
            'val': {'texts': df_val[text_col].values, 'labels': df_val['sentiment_encoded'].values},
            'test': {'texts': df_test[text_col].values, 'labels': df_test['sentiment_encoded'].values}
        }

        self.task_info = {
            'num_labels': len(self.label_encoder.classes_),
            'labels': list(self.label_encoder.classes_),
            'text_column': text_col,
            'sentiment_column': label_col
        }

        print(f"üìä Train={len(df_train)}, Val={len(df_val)}, Test={len(df_test)}")
        print(f"üè∑Ô∏è Classes: {list(self.label_encoder.classes_)}")

        return self.task_info

# =============================================================================
# DATASET
# =============================================================================
class TeluguSentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True, padding='max_length',
            max_length=self.max_length, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }

# =============================================================================
# METRICS
# =============================================================================
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    return {'accuracy': acc, 'f1': f1, 'precision': prec, 'recall': rec}

def evaluate_and_plot(model_name, labels, preds, label_encoder):
    """Generate classification report + confusion matrix"""
    report = classification_report(labels, preds, target_names=label_encoder.classes_)
    print(f"\nüìã Classification Report for {model_name}:\n{report}")

    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    img_name = f"{model_name.replace('/', '_')}_confusion_matrix.png"
    plt.savefig(img_name)
    plt.close()

    # Save report as pickle
    with open(f"{model_name.replace('/', '_')}_classification_report.pkl", "wb") as f:
        pickle.dump({'report': report, 'confusion_matrix': cm}, f)

# =============================================================================
# TRAINING FUNCTION
# =============================================================================
def train_deberta(processor):
    print(f"\nüöÄ Training single model: {MODEL_NAME}")
    splits = processor.data_splits
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=processor.task_info['num_labels']
    ).to(DEVICE)

    train_ds = TeluguSentimentDataset(splits['train']['texts'], splits['train']['labels'], tokenizer, CONFIG['max_length'])
    val_ds = TeluguSentimentDataset(splits['val']['texts'], splits['val']['labels'], tokenizer, CONFIG['max_length'])
    test_ds = TeluguSentimentDataset(splits['test']['texts'], splits['test']['labels'], tokenizer, CONFIG['max_length'])

    args = TrainingArguments(
        output_dir=f"./{MODEL_NAME.replace('/', '_')}",
        num_train_epochs=CONFIG['num_epochs'],
        per_device_train_batch_size=CONFIG['batch_size'],
        per_device_eval_batch_size=CONFIG['batch_size'],
        learning_rate=CONFIG['learning_rate'],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_steps=100,
        fp16=(CONFIG['mixed_precision'] == 'fp16'),
        report_to=[]
    )

    trainer = Trainer(
        model=model, args=args,
        train_dataset=train_ds, eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate(test_ds)
    preds = np.argmax(trainer.predict(test_ds).predictions, axis=1)
    labels = splits['test']['labels']

    evaluate_and_plot(MODEL_NAME, labels, preds, processor.label_encoder)

    result = {
        'model_name': MODEL_NAME,
        'description': MODEL_DESC,
        'f1': eval_result['eval_f1'],
        'accuracy': eval_result['eval_accuracy'],
        'precision': eval_result['eval_precision'],
        'recall': eval_result['eval_recall']
    }

    # Save PKL
    with open("deberta_telugu_results.pkl", "wb") as f:
        pickle.dump({
            'results': result,
            'labels': processor.label_encoder.classes_,
            'report_file': f"{MODEL_NAME.replace('/', '_')}_classification_report.pkl",
            'confusion_matrix_image': f"{MODEL_NAME.replace('/', '_')}_confusion_matrix.png"
        }, f)

    print(f"\nüíæ Saved results ‚Üí deberta_telugu_results.pkl")
    print(f"‚úÖ Final: F1={result['f1']:.4f}, Accuracy={result['accuracy']:.4f}")

# =============================================================================
# MAIN
# =============================================================================
def main():
    processor = TeluguSentimentProcessor(
        train_path="/kaggle/input/nlpdataset/train.csv",
        val_path="/kaggle/input/nlpdataset/val.csv",
        test_path="/kaggle/input/nlpdataset/test.csv"
    )
    processor.load_and_process_dataset()
    train_deberta(processor)

if __name__ == "__main__":
    main()


‚úÖ Using device: GPU
üöÄ Model: microsoft/deberta-v3-base ‚Äî DeBERTa-v3 - Advanced attention architecture

üìä Loading Telugu Sentiment Data (Pre-split)...
‚úÖ Text column: Sentence, Label column: Sentiment
üìä Train=19464, Val=2433, Test=2434
üè∑Ô∏è Classes: ['neg', 'neutral', 'pos']

üöÄ Training single model: microsoft/deberta-v3-base


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0547,1.052642,0.482121,0.313659,0.232441,0.482121
2,1.0541,1.051576,0.482121,0.313659,0.232441,0.482121
3,0.9902,0.985807,0.523633,0.430543,0.385916,0.523633



üìã Classification Report for microsoft/deberta-v3-base:
              precision    recall  f1-score   support

         neg       0.56      0.41      0.47       612
     neutral       0.52      0.88      0.66      1175
         pos       0.00      0.00      0.00       647

    accuracy                           0.53      2434
   macro avg       0.36      0.43      0.38      2434
weighted avg       0.39      0.53      0.44      2434


üíæ Saved results ‚Üí deberta_telugu_results.pkl
‚úÖ Final: F1=0.4354, Accuracy=0.5288


In [6]:
"""
üéØ TELUGU SENTIMENT ANALYSIS ‚Äî SINGLE MODEL VERSION
Model: google/muril-base-cased

‚úÖ Uses pre-split train.csv, val.csv, test.csv
‚úÖ Generates classification report + confusion matrix
‚úÖ Saves results in a .pkl file
"""

# =============================================================================
# IMPORTS
# =============================================================================
import os, gc, time, warnings, pickle
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

warnings.filterwarnings('ignore')

# =============================================================================
# CONFIG
# =============================================================================
MODEL_NAME = "google/muril-base-cased"
MODEL_DESC = "MuRIL - Multilingual BERT trained on Indian languages"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CONFIG = {
    'device_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'device': DEVICE,
    'batch_size': 16,
    'max_length': 192,
    'learning_rate': 2e-5,
    'num_epochs': 3,
    'mixed_precision': 'fp16' if torch.cuda.is_available() else None
}

print(f"‚úÖ Using device: {CONFIG['device_type']}")
print(f"üöÄ Model: {MODEL_NAME} ‚Äî {MODEL_DESC}")

# =============================================================================
# DATA PROCESSOR (PRE-SPLIT)
# =============================================================================
class TeluguSentimentProcessor:
    def __init__(self, train_path, val_path, test_path):
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.label_encoder = None
        self.data_splits = {}
        self.task_info = {}

    def load_and_process_dataset(self):
        def load_file(path):
            for enc in ['utf-8', 'utf-8-sig', 'latin1']:
                try:
                    return pd.read_csv(path, encoding=enc)
                except:
                    continue
            raise ValueError(f"‚ùå Could not load {path}")

        print("\nüìä Loading Telugu Sentiment Data (Pre-split)...")
        df_train = load_file(self.train_path)
        df_val = load_file(self.val_path)
        df_test = load_file(self.test_path)

        # Detect text/label columns
        text_col = next((c for c in ['Text', 'Sentence', 'text', 'sentence', 'content'] if c in df_train.columns), df_train.columns[0])
        label_col = next((c for c in ['Sentiment', 'label', 'Label', 'sentiment'] if c in df_train.columns), df_train.columns[-1])
        print(f"‚úÖ Text column: {text_col}, Label column: {label_col}")

        # Combine all for encoding
        all_data = pd.concat([df_train, df_val, df_test])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(all_data[label_col].astype(str))

        # Encode labels
        for df in [df_train, df_val, df_test]:
            df['sentiment_encoded'] = self.label_encoder.transform(df[label_col].astype(str))

        # Store splits
        self.data_splits = {
            'train': {'texts': df_train[text_col].values, 'labels': df_train['sentiment_encoded'].values},
            'val': {'texts': df_val[text_col].values, 'labels': df_val['sentiment_encoded'].values},
            'test': {'texts': df_test[text_col].values, 'labels': df_test['sentiment_encoded'].values}
        }

        self.task_info = {
            'num_labels': len(self.label_encoder.classes_),
            'labels': list(self.label_encoder.classes_),
            'text_column': text_col,
            'sentiment_column': label_col
        }

        print(f"üìä Train={len(df_train)}, Val={len(df_val)}, Test={len(df_test)}")
        print(f"üè∑Ô∏è Classes: {list(self.label_encoder.classes_)}")

        return self.task_info

# =============================================================================
# DATASET
# =============================================================================
class TeluguSentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True, padding='max_length',
            max_length=self.max_length, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }

# =============================================================================
# METRICS
# =============================================================================
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    return {'accuracy': acc, 'f1': f1, 'precision': prec, 'recall': rec}

def evaluate_and_plot(model_name, labels, preds, label_encoder):
    """Generate classification report + confusion matrix"""
    report = classification_report(labels, preds, target_names=label_encoder.classes_)
    print(f"\nüìã Classification Report for {model_name}:\n{report}")

    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    img_name = f"{model_name.replace('/', '_')}_confusion_matrix.png"
    plt.savefig(img_name)
    plt.close()

    # Save report as pickle
    with open(f"{model_name.replace('/', '_')}_classification_report.pkl", "wb") as f:
        pickle.dump({'report': report, 'confusion_matrix': cm}, f)

# =============================================================================
# TRAINING FUNCTION
# =============================================================================
def train_muril(processor):
    print(f"\nüöÄ Training single model: {MODEL_NAME}")
    splits = processor.data_splits
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=processor.task_info['num_labels']
    ).to(DEVICE)

    train_ds = TeluguSentimentDataset(splits['train']['texts'], splits['train']['labels'], tokenizer, CONFIG['max_length'])
    val_ds = TeluguSentimentDataset(splits['val']['texts'], splits['val']['labels'], tokenizer, CONFIG['max_length'])
    test_ds = TeluguSentimentDataset(splits['test']['texts'], splits['test']['labels'], tokenizer, CONFIG['max_length'])

    args = TrainingArguments(
        output_dir=f"./{MODEL_NAME.replace('/', '_')}",
        num_train_epochs=CONFIG['num_epochs'],
        per_device_train_batch_size=CONFIG['batch_size'],
        per_device_eval_batch_size=CONFIG['batch_size'],
        learning_rate=CONFIG['learning_rate'],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_steps=100,
        fp16=(CONFIG['mixed_precision'] == 'fp16'),
        report_to=[]
    )

    trainer = Trainer(
        model=model, args=args,
        train_dataset=train_ds, eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate(test_ds)
    preds = np.argmax(trainer.predict(test_ds).predictions, axis=1)
    labels = splits['test']['labels']

    evaluate_and_plot(MODEL_NAME, labels, preds, processor.label_encoder)

    result = {
        'model_name': MODEL_NAME,
        'description': MODEL_DESC,
        'f1': eval_result['eval_f1'],
        'accuracy': eval_result['eval_accuracy'],
        'precision': eval_result['eval_precision'],
        'recall': eval_result['eval_recall']
    }

    # Save PKL
    with open("muril_telugu_results.pkl", "wb") as f:
        pickle.dump({
            'results': result,
            'labels': processor.label_encoder.classes_,
            'report_file': f"{MODEL_NAME.replace('/', '_')}_classification_report.pkl",
            'confusion_matrix_image': f"{MODEL_NAME.replace('/', '_')}_confusion_matrix.png"
        }, f)

    print(f"\nüíæ Saved results ‚Üí muril_telugu_results.pkl")
    print(f"‚úÖ Final: F1={result['f1']:.4f}, Accuracy={result['accuracy']:.4f}")

# =============================================================================
# MAIN
# =============================================================================
def main():
    processor = TeluguSentimentProcessor(
        train_path="/kaggle/input/nlpdataset/train.csv",
        val_path="/kaggle/input/nlpdataset/val.csv",
        test_path="/kaggle/input/nlpdataset/test.csv"
    )
    processor.load_and_process_dataset()
    train_muril(processor)

if __name__ == "__main__":
    main()


‚úÖ Using device: GPU
üöÄ Model: google/muril-base-cased ‚Äî MuRIL - Multilingual BERT trained on Indian languages

üìä Loading Telugu Sentiment Data (Pre-split)...
‚úÖ Text column: Sentence, Label column: Sentiment
üìä Train=19464, Val=2433, Test=2434
üè∑Ô∏è Classes: ['neg', 'neutral', 'pos']

üöÄ Training single model: google/muril-base-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8154,0.772269,0.726264,0.720081,0.728362,0.726264
2,0.6776,0.675668,0.732018,0.731165,0.7308,0.732018


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 52032 vs 51924