In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# The second tuple element (sub-directory names) is not needed, hence `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
"""
🎯 TELUGU SENTIMENT ANALYSIS — SINGLE MODEL VERSION
Model: google/muril-base-cased

✅ Uses pre-split train.csv, val.csv, test.csv
✅ Generates classification report + confusion matrix
✅ Saves results in a .pkl file
"""

# =============================================================================
# IMPORTS
# =============================================================================
import os, gc, time, warnings, pickle
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

warnings.filterwarnings('ignore')

# =============================================================================
# CONFIG
# =============================================================================
# Checkpoint to fine-tune, plus a short human-readable description for logs/results.
MODEL_NAME = "google/muril-base-cased"
MODEL_DESC = "MuRIL - Multilingual BERT trained on Indian languages"

# Pick the accelerator once; the device-dependent settings below derive from it.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyper-parameters and runtime settings read by the training code.
CONFIG = dict(
    device_type='GPU' if DEVICE.type == 'cuda' else 'CPU',
    device=DEVICE,
    batch_size=16,
    max_length=192,
    learning_rate=2e-5,
    num_epochs=3,
    # Mixed precision is only requested when a CUDA device is present.
    mixed_precision='fp16' if DEVICE.type == 'cuda' else None,
)

print(f"‚úÖ Using device: {CONFIG['device_type']}")
print(f"üöÄ Model: {MODEL_NAME} ‚Äî {MODEL_DESC}")

# =============================================================================
# DATA PROCESSOR (PRE-SPLIT)
# =============================================================================
class TeluguSentimentProcessor:
    """Load pre-split sentiment CSVs and integer-encode their labels.

    After ``load_and_process_dataset`` has run:
      * ``label_encoder`` is a ``LabelEncoder`` fitted on the label column of
        all three splits (values cast to ``str``),
      * ``data_splits`` maps ``'train'``/``'val'``/``'test'`` to a dict with
        ``'texts'`` and ``'labels'`` arrays,
      * ``task_info`` holds ``num_labels``, ``labels``, ``text_column`` and
        ``sentiment_column``.
    """

    # Encodings tried in order when reading a CSV.
    _ENCODINGS = ('utf-8', 'utf-8-sig', 'latin1')
    # 'latin1' maps every byte to a character, so it never fails to decode;
    # non-Latin text read this way is garbled rather than rejected.
    _FALLBACK_ENCODING = 'latin1'
    # Column names probed (in priority order) in the training frame.
    _TEXT_CANDIDATES = ('Text', 'Sentence', 'text', 'sentence', 'content')
    _LABEL_CANDIDATES = ('Sentiment', 'label', 'Label', 'sentiment')

    def __init__(self, train_path, val_path, test_path):
        """Remember the three CSV paths; nothing is read until processing."""
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.label_encoder = None
        self.data_splits = {}
        self.task_info = {}

    @classmethod
    def _load_csv(cls, path):
        """Read ``path`` with the first encoding in ``_ENCODINGS`` that works.

        Raises:
            ValueError: if the file cannot be opened or no encoding succeeds.
                The underlying error is attached as ``__cause__``.
        """
        last_error = None
        for enc in cls._ENCODINGS:
            try:
                frame = pd.read_csv(path, encoding=enc)
            except OSError as exc:
                # Missing/unreadable file: another encoding cannot help.
                raise ValueError(f"‚ùå Could not load {path}") from exc
            except Exception as exc:
                # Decode/parse failure: remember it and try the next encoding.
                # (Unlike a bare ``except``, KeyboardInterrupt/SystemExit
                # are no longer swallowed.)
                last_error = exc
                continue
            if enc == cls._FALLBACK_ENCODING:
                print(f"WARNING: {path} was decoded as {enc}; "
                      f"non-Latin text may be garbled")
            return frame
        raise ValueError(f"‚ùå Could not load {path}") from last_error

    def load_and_process_dataset(self):
        """Read the three splits, detect columns, encode labels.

        The text/label columns are chosen from the training frame: the first
        matching candidate name, else the first/last column respectively.

        Returns:
            dict: the populated ``task_info``.

        Raises:
            ValueError: if a CSV cannot be loaded.
            KeyError: if the val or test split lacks the detected columns.
        """
        print("\nüìä Loading Telugu Sentiment Data (Pre-split)...")
        df_train = self._load_csv(self.train_path)
        df_val = self._load_csv(self.val_path)
        df_test = self._load_csv(self.test_path)
        frames = {'train': df_train, 'val': df_val, 'test': df_test}

        # Detect text/label columns from the training split.
        text_col = next((c for c in self._TEXT_CANDIDATES if c in df_train.columns),
                        df_train.columns[0])
        label_col = next((c for c in self._LABEL_CANDIDATES if c in df_train.columns),
                         df_train.columns[-1])
        print(f"‚úÖ Text column: {text_col}, Label column: {label_col}")

        # Fail early with a precise message instead of a late, bare KeyError.
        for split_name, frame in frames.items():
            missing = [c for c in (text_col, label_col) if c not in frame.columns]
            if missing:
                raise KeyError(f"{split_name} split is missing column(s): {missing}")

        # Fit on the labels of every split so all of them share one id mapping.
        all_labels = pd.concat([frame[label_col] for frame in frames.values()])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(all_labels.astype(str))

        # Encode each split without mutating the loaded frames.
        self.data_splits = {
            split_name: {
                'texts': frame[text_col].values,
                'labels': self.label_encoder.transform(frame[label_col].astype(str)),
            }
            for split_name, frame in frames.items()
        }

        self.task_info = {
            'num_labels': len(self.label_encoder.classes_),
            'labels': list(self.label_encoder.classes_),
            'text_column': text_col,
            'sentiment_column': label_col
        }

        print(f"üìä Train={len(df_train)}, Val={len(df_val)}, Test={len(df_test)}")
        print(f"üè∑Ô∏è Classes: {list(self.label_encoder.classes_)}")

        return self.task_info

# =============================================================================
# DATASET
# =============================================================================
class TeluguSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one example per ``__getitem__`` call.

    ``tokenizer`` is called with the example text (coerced to ``str``) and must
    return a mapping containing ``'input_ids'`` and ``'attention_mask'``
    tensors; every sequence is padded/truncated to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = str(self.texts[idx])
        tokenized = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {key: tokenized[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

# =============================================================================
# METRICS
# =============================================================================
def compute_metrics(eval_pred):
    """Turn a ``(scores, labels)`` pair into accuracy and weighted P/R/F1.

    The predicted class for each row is the arg-max of its scores along
    axis 1. Precision, recall and F1 are support-weighted averages, with
    undefined ratios reported as 0.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision``, ``recall``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        references, predicted, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(references, predicted),
        'f1': f_score,
        'precision': precision,
        'recall': recall,
    }

def evaluate_and_plot(model_name, labels, preds, label_encoder):
    """Print a classification report and save a confusion-matrix plot + pickle.

    Args:
        model_name: used in the printed header, the plot title and (with '/'
            replaced by '_') the output file names.
        labels: true integer class ids.
        preds: predicted integer class ids.
        label_encoder: fitted encoder whose ``classes_`` supplies the class
            names; class id ``i`` is taken to correspond to ``classes_[i]``.

    Side effects:
        Writes ``<name>_confusion_matrix.png`` and
        ``<name>_classification_report.pkl`` to the working directory.
    """
    class_names = list(label_encoder.classes_)
    # Pin the full label set explicitly. Without it, a class absent from both
    # `labels` and `preds` makes `target_names` mismatch the inferred classes
    # (ValueError) and shrinks the matrix so the tick labels no longer align.
    class_ids = np.arange(len(class_names))

    report = classification_report(
        labels, preds,
        labels=class_ids, target_names=class_names,
        zero_division=0,  # same convention as compute_metrics
    )
    print(f"\nüìã Classification Report for {model_name}:\n{report}")

    cm = confusion_matrix(labels, preds, labels=class_ids)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(f"Confusion Matrix - {model_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()

    file_stem = model_name.replace('/', '_')
    fig.savefig(f"{file_stem}_confusion_matrix.png")
    plt.close(fig)  # release the figure; nothing is shown inline

    # Save report as pickle
    with open(f"{file_stem}_classification_report.pkl", "wb") as f:
        pickle.dump({'report': report, 'confusion_matrix': cm}, f)

# =============================================================================
# TRAINING FUNCTION
# =============================================================================
def train_muril(processor):
    """Fine-tune ``MODEL_NAME`` on the processor's splits and report test metrics.

    Args:
        processor: object exposing ``data_splits`` (``'train'``/``'val'``/
            ``'test'`` -> ``{'texts', 'labels'}``), ``task_info['num_labels']``
            and a fitted ``label_encoder``.

    Returns:
        dict: model name/description plus test-set ``f1``, ``accuracy``,
        ``precision`` and ``recall`` (the same dict stored in the pickle).

    Side effects:
        Writes checkpoints under ``./<model name>``, the confusion-matrix
        image and report pickle (via ``evaluate_and_plot``), and
        ``muril_telugu_results.pkl``.
    """
    print(f"\nüöÄ Training single model: {MODEL_NAME}")
    splits = processor.data_splits
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=processor.task_info['num_labels']
    ).to(DEVICE)

    def build_dataset(split_name):
        """Wrap one split in a tokenizing dataset."""
        split = splits[split_name]
        return TeluguSentimentDataset(split['texts'], split['labels'],
                                      tokenizer, CONFIG['max_length'])

    train_ds = build_dataset('train')
    val_ds = build_dataset('val')
    test_ds = build_dataset('test')

    artifact_stem = MODEL_NAME.replace('/', '_')

    args = TrainingArguments(
        output_dir=f"./{artifact_stem}",
        num_train_epochs=CONFIG['num_epochs'],
        per_device_train_batch_size=CONFIG['batch_size'],
        per_device_eval_batch_size=CONFIG['batch_size'],
        learning_rate=CONFIG['learning_rate'],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_steps=100,
        fp16=(CONFIG['mixed_precision'] == 'fp16'),
        report_to=[]
    )

    trainer = Trainer(
        model=model, args=args,
        train_dataset=train_ds, eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # One inference pass over the test set yields both the raw predictions and
    # the compute_metrics values (previously evaluate() and predict() each ran
    # the full test set). The "eval" prefix keeps the `eval_*` metric keys.
    test_output = trainer.predict(test_ds, metric_key_prefix="eval")
    test_metrics = test_output.metrics
    preds = np.argmax(test_output.predictions, axis=1)
    labels = splits['test']['labels']

    evaluate_and_plot(MODEL_NAME, labels, preds, processor.label_encoder)

    result = {
        'model_name': MODEL_NAME,
        'description': MODEL_DESC,
        'f1': test_metrics['eval_f1'],
        'accuracy': test_metrics['eval_accuracy'],
        'precision': test_metrics['eval_precision'],
        'recall': test_metrics['eval_recall']
    }

    # Save PKL
    with open("muril_telugu_results.pkl", "wb") as f:
        pickle.dump({
            'results': result,
            'labels': processor.label_encoder.classes_,
            'report_file': f"{artifact_stem}_classification_report.pkl",
            'confusion_matrix_image': f"{artifact_stem}_confusion_matrix.png"
        }, f)

    print(f"\nüíæ Saved results ‚Üí muril_telugu_results.pkl")
    print(f"‚úÖ Final: F1={result['f1']:.4f}, Accuracy={result['accuracy']:.4f}")

    return result

# =============================================================================
# MAIN
# =============================================================================
def main(data_dir="/kaggle/input/nlpdataset"):
    """Run the full pipeline: load the pre-split CSVs, then fine-tune and evaluate.

    Args:
        data_dir: directory containing ``train.csv``, ``val.csv`` and
            ``test.csv``. Defaults to the Kaggle dataset mount used by this
            notebook, so calling ``main()`` behaves as before.
    """
    processor = TeluguSentimentProcessor(
        train_path=os.path.join(data_dir, "train.csv"),
        val_path=os.path.join(data_dir, "val.csv"),
        test_path=os.path.join(data_dir, "test.csv")
    )
    processor.load_and_process_dataset()
    train_muril(processor)

if __name__ == "__main__":
    main()


‚úÖ Using device: GPU
üöÄ Model: google/muril-base-cased ‚Äî MuRIL - Multilingual BERT trained on Indian languages

üìä Loading Telugu Sentiment Data (Pre-split)...
‚úÖ Text column: Sentence, Label column: Sentiment
üìä Train=19464, Val=2433, Test=2434
üè∑Ô∏è Classes: ['neg', 'neutral', 'pos']

üöÄ Training single model: google/muril-base-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8111,0.779847,0.718455,0.712306,0.720624,0.718455
2,0.6809,0.678861,0.729552,0.728272,0.728077,0.729552
3,0.6034,0.6539,0.739827,0.739208,0.739294,0.739827



üìã Classification Report for google/muril-base-cased:
              precision    recall  f1-score   support

         neg       0.76      0.80      0.78       612
     neutral       0.76      0.74      0.75      1175
         pos       0.67      0.67      0.67       647

    accuracy                           0.74      2434
   macro avg       0.73      0.74      0.73      2434
weighted avg       0.74      0.74      0.74      2434


üíæ Saved results ‚Üí muril_telugu_results.pkl
‚úÖ Final: F1=0.7369, Accuracy=0.7371
