# Fine-Tune XLM-RoBERTa for Indic Sentiment Analysis

This notebook fine-tunes `xlm-roberta-base` for sentiment analysis (positive, negative, neutral) on Indic languages (Hindi, Tamil, Telugu, Malayalam, Kannada) using the AI4Bharat IndicSentiment dataset. The model will be integrated with a Flask API for audio transcription and sentiment analysis.

In [None]:
# Install the pinned dependency set, then restart the runtime so the new
# versions replace anything the kernel has already imported.
#
# The install goes through `sys.executable -m pip` so it always targets the
# interpreter this kernel is running, and the restart only happens when a
# package actually had to be (re)installed. Once the pins are satisfied the
# cell is a no-op, so Restart & Run All can proceed past it.
import os
import subprocess
import sys
from importlib import metadata

PINNED_PACKAGES = {
    'numpy': '1.26.4',
    'pandas': '2.2.2',
    'scikit-learn': '1.5.2',
    'transformers': '4.44.2',
    'datasets': '2.21.0',
    'torch': '2.4.1',
}


def _installed_version(package):
    """Return the installed version of `package` (local build tag stripped), or None."""
    try:
        # Builds may report a local suffix such as '+cu121'; compare on the
        # public version only so a matching install is not reinstalled.
        return metadata.version(package).split('+', 1)[0]
    except metadata.PackageNotFoundError:
        return None


_unsatisfied = [
    f"{name}=={version}"
    for name, version in PINNED_PACKAGES.items()
    if _installed_version(name) != version
]

if _unsatisfied:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *_unsatisfied])
    # Signal 9 kills the kernel process; the notebook host restarts it and the
    # next run of this cell finds every pin satisfied and skips this branch.
    os.kill(os.getpid(), 9)


In [None]:
# Colab-only: mount Google Drive at /content/drive. The training output
# directory and the final model directory used later in this notebook both
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import torch
import logging

# basicConfig() is silently ignored when the root logger already has a
# handler, which is commonly the case inside hosted notebook kernels; the
# INFO messages this notebook relies on (split sizes, test metrics,
# confusion matrix) would then never be shown. force=True removes any
# existing root handlers first so the INFO level below always takes effect.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# IPython shell escape: run nvidia-smi to display the GPU state of this runtime.
!nvidia-smi

# Marks the end of the setup cell in the log output.
logger.info("Environment setup completed")

## Cell 3: Load and Preprocess Dataset

Load the AI4Bharat IndicSentiment dataset, filter for Hindi, Tamil, Telugu, Malayalam, and Kannada, and preprocess the text.

In [None]:
from datasets import load_dataset

# NOTE(review): confirm against the dataset card that this repository exposes
# a default configuration with a 'train' split and 'language' / 'sentiment' /
# 'text' columns; the rest of this cell depends on exactly those names.
try:
    dataset = load_dataset('ai4bharat/IndicSentiment', split='train')
except Exception as e:
    logger.error(f"Failed to load dataset: {str(e)}")
    raise

# Language codes kept for fine-tuning: Hindi, Tamil, Telugu, Malayalam, Kannada.
target_languages = ['hi', 'ta', 'te', 'ml', 'kn']
df = dataset.to_pandas()
# .copy() makes the filtered frame independent of the full table, so the
# column assignments performed later do not write into a slice
# (pandas SettingWithCopyWarning).
df = df[df['language'].isin(target_languages)].copy()

# Fail here with a readable message rather than later inside the split step.
if df.empty:
    raise ValueError(
        f"No rows found for target languages {target_languages}; "
        "check the values of the 'language' column"
    )
def clean_text(text):
    """Normalise one review string before tokenisation.

    Removes URLs and punctuation/symbols and collapses runs of whitespace.
    Unicode combining marks (general category M*: vowel signs, virama,
    anusvara, nukta, ...) and the zero-width joiner / non-joiner are kept,
    because in Indic scripts they are part of the word itself; `\\w` does not
    match them, so a plain `[^\\w\\s]` filter would delete them and corrupt
    the text.

    Args:
        text: The raw text; any non-string value is treated as missing.

    Returns:
        The cleaned string, or '' when `text` is not a string.
    """
    import re
    import unicodedata

    if not isinstance(text, str):
        return ''

    def _drop_unless_script_mark(match):
        # Called for every character that is neither \w nor \s.
        ch = match.group(0)
        if ch in '\u200c\u200d' or unicodedata.category(ch).startswith('M'):
            return ch
        return ''

    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', _drop_unless_script_mark, text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Map labels to integers (positive=2, neutral=1, negative=0)
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}

# Work on an independent copy so the assignments below never write into a
# slice of another frame.
df = df.copy()
# Normalise case and surrounding whitespace before the lookup so variants such
# as 'Positive' are not silently turned into NaN and dropped.
df['label'] = df['sentiment'].astype(str).str.strip().str.lower().map(label_map)
df['text'] = df['text'].apply(clean_text)

# Drop rows with empty text or invalid labels
df = df.dropna(subset=['text', 'label'])
df = df[df['text'] != ''].copy()

if df.empty:
    raise ValueError("No usable rows remain after cleaning text and mapping labels")

# map() yields a float column as soon as one value is unmapped (NaN); cast
# back to int now that those rows are gone so the labels are integer class ids.
df['label'] = df['label'].astype(int)

# Convert to Hugging Face Dataset. preserve_index=False keeps the filtered
# pandas index from being stored as an extra '__index_level_0__' column.
dataset = Dataset.from_pandas(df[['text', 'label', 'language']], preserve_index=False)

# Split dataset: 80% train, then the held-out 20% is halved into validation
# and test (10% each). The fixed seed keeps the split reproducible.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
test_val = train_test['test'].train_test_split(test_size=0.5, seed=42)
datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
})

logger.info(f"Dataset splits: Train={len(datasets['train'])}, Validation={len(datasets['validation'])}, Test={len(datasets['test'])}")
logger.info(f"Languages: {df['language'].value_counts().to_dict()}")

## Cell 4: Tokenize Dataset

Tokenize the text using the `xlm-roberta-base` tokenizer.

In [None]:
# Load the tokenizer belonging to the checkpoint that will be fine-tuned.
try:
    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
except Exception as load_error:
    logger.error(f"Failed to load tokenizer: {str(load_error)}")
    raise


def tokenize_function(examples):
    """Tokenize a batch of examples, padding and truncating each text to 128 tokens."""
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )


# Tokenize every split, drop the columns that are no longer needed, and rename
# 'label' to 'labels' before switching the output format to torch tensors.
tokenized_datasets = (
    datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['text', 'language'])
    .rename_column('label', 'labels')
)
tokenized_datasets.set_format('torch')

logger.info("Dataset tokenized")

## Cell 5: Initialize Model

Load `xlm-roberta-base` and configure it for 3-class classification.

In [None]:
# Build the 3-class classification model on top of the pretrained encoder.
# Both label mappings are derived from one ordered tuple of class names, so
# the id <-> name pairs are the same in each direction.
try:
    model = AutoModelForSequenceClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=3,
        id2label=dict(enumerate(('negative', 'neutral', 'positive'))),
        label2id={
            name: idx
            for idx, name in enumerate(('negative', 'neutral', 'positive'))
        },
    )
except Exception as load_error:
    logger.error(f"Failed to load model: {str(load_error)}")
    raise

logger.info("Model initialized")

## Cell 6: Define Metrics

Define the function that computes accuracy and weighted F1-score for evaluation.

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: A two-item sequence of logits and reference labels.

    Returns:
        A dict with the keys 'accuracy' and 'f1'.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        'accuracy': accuracy_score(references, predicted),
        'f1': f1_score(references, predicted, average='weighted'),
    }

logger.info("Metrics defined")

## Cell 7: Configure Training

Set up training arguments for the `Trainer` API.

In [None]:
# Training configuration for the Trainer API.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/indic_sentiment_model',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): 500 warmup steps may be a large share of the run if the
    # filtered training split is small; compare against the total step count.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # 'eval_strategy' is the current name of this option in the transformers
    # version pinned by this notebook; 'evaluation_strategy' is deprecated.
    eval_strategy='epoch',
    save_strategy='epoch',
    # Checkpoints are written to Google Drive every epoch; cap how many are
    # kept. With load_best_model_at_end the best checkpoint is retained.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Log to TensorBoard only (matches logging_dir above). Left at its default
    # the Trainer reports to every installed integration, which can include
    # experiment trackers that ask for an interactive login and stall the run.
    # NOTE(review): requires the tensorboard package in the runtime.
    report_to='tensorboard'
)

logger.info("Training arguments configured")

## Cell 8: Train Model

Initialize the `Trainer` and start fine-tuning.

In [None]:
# Pick out the two splits used during fine-tuning, then hand everything to
# the Trainer together with the metric callback.
_train_split = tokenized_datasets['train']
_validation_split = tokenized_datasets['validation']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=_train_split,
    eval_dataset=_validation_split,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; any failure is logged and then re-raised unchanged.
try:
    trainer.train()
except Exception as train_error:
    logger.error(f"Training failed: {str(train_error)}")
    raise

logger.info("Training completed")

## Cell 9: Evaluate Model

Evaluate the model on the test set and display the confusion matrix.

In [None]:

try:
    # predict() returns the logits, the label ids and the computed metrics
    # from a single pass over the test split, so a separate evaluate() call
    # (a second full inference pass over the same data) is not needed.
    # Metric keys carry the 'test_' prefix.
    predictions = trainer.predict(tokenized_datasets['test'])
    eval_results = predictions.metrics
    logger.info(f"Test results: {eval_results}")

    preds = np.argmax(predictions.predictions, axis=-1)
    # Fix the class order (0=negative, 1=neutral, 2=positive) so the matrix is
    # always 3x3, even when a class is absent from both labels and predictions.
    cm = confusion_matrix(predictions.label_ids, preds, labels=[0, 1, 2])
    logger.info(f"Confusion Matrix:\n{cm}")
except Exception as e:
    logger.error(f"Evaluation failed: {str(e)}")
    raise

## Cell 10: Save Model

Save the fine-tuned model and tokenizer to Google Drive.

In [None]:
# Write the model first and the tokenizer second into the same directory, so
# the folder can later be loaded as a single self-contained checkpoint.
_final_model_dir = '/content/drive/MyDrive/indic_sentiment_model/final'
try:
    for _artifact in (model, tokenizer):
        _artifact.save_pretrained(_final_model_dir)
    logger.info("Model and tokenizer saved to Google Drive")
except Exception as save_error:
    logger.error(f"Failed to save model: {str(save_error)}")
    raise

## Cell 11: Test Model

Test the fine-tuned model with long, phone call-like sentences for each language.

In [None]:
from transformers import pipeline

# Build the inference pipeline from the directory written by the save step,
# which exercises the on-disk artifacts and not the in-memory objects.
_saved_model_dir = '/content/drive/MyDrive/indic_sentiment_model/final'
try:
    sentiment_pipeline = pipeline(
        task='sentiment-analysis',
        model=_saved_model_dir,
        tokenizer=_saved_model_dir,
    )
except Exception as pipeline_error:
    logger.error(f"Failed to load pipeline: {str(pipeline_error)}")
    raise

# Smoke-test inputs: one sentence per target language. Each entry holds the
# raw text, its language code and the expected label. All five entries are
# expected to be 'positive', so negative and neutral predictions are not
# exercised here.
test_sentences = [
    {
        'text': 'मैंने आपके ग्राहक सेवा दल के साथ अभी-अभी एक शानदार अनुभव प्राप्त किया; वे इतने मददगार और धैर्यवान थे, और मेरी समस्या को तुरंत हल कर दिया, मैं बहुत खुश हूँ!',
        'language': 'hi',
        'expected': 'positive'
    },
    {
        'text': 'நான் உங்கள் வாடிக்கையாளர் சேவைக் குழுவுடன் இப்போது ஒரு அற்புதமான அனுபவத்தைப் பெற்றேன்; அவர்கள் மிகவும் உதவிகரமாகவும் பொறுமையாகவும் இருந்தனர், எனது பிரச்சினையை உடனடியாக தீர்த்து விட்டனர், நான் மிகவும் மகிழ்ச்சியாக இருக்கிறேன்!',
        'language': 'ta',
        'expected': 'positive'
    },
    {
        'text': 'నేను మీ కస్టమర్ సర్వీస్ టీమ్‌తో ఇప్పుడే అద్భుతమైన అనుభవం పొందాను; వారు చాలా సహాయకారిగా మరియు ఓపికగా ఉన్నారు, నా సమస్యను వెంటనే పరిష్కరించారు, నేను చాలా సంతోషంగా ఉన్నాను!',
        'language': 'te',
        'expected': 'positive'
    },
    {
        'text': 'ഞാൻ ഇപ്പോൾ നിന്റെ ഉപഭോക്തൃ സേവന ടീമിനോട് ഒരു മനോഹരമായ അനുഭവം നേടി; അവർ വളരെ സഹായകരവും ക്ഷമയോടെയും ആയിരുന്നു, എന്റെ പ്രശ്നം തൽക്ഷണം പരിഹരിച്ചു, ഞാൻ വളരെ സന്തോഷവാനാണ്!',
        'language': 'ml',
        'expected': 'positive'
    },
    {
        'text': 'ನಾನು ಈಗ ತಾನೇ ನಿಮ್ಮ ಗ್ರಾಹಕ ಸೇವಾ ತಂಡದೊಂದಿಗೆ ಅದ್ಭುತವಾದ ಅನುಭವವನ್ನು ಪಡೆದೆ; ಅವರು ತುಂಬಾ ಸಹಾಯಕವಾಗಿದ್ದರು ಮತ್ತು ತಾಳ್ಮೆಯಿಂದ ಕೂಡಿದ್ದರು, ನನ್ನ ಸಮಸ್ಯೆಯನ್ನು ತಕ್ಷಣವೇ ಬಗೆಹರಿಸಿದರು, ನಾನು ತುಂಬಾ ಸಂತೋಷವಾಗಿದ್ದೇನೆ!',
        'language': 'kn',
        'expected': 'positive'
    }
]

# Run each sample through the pipeline and print the prediction next to the
# expected label. A failure on one sentence is reported and the loop moves on.
for sentence in test_sentences:
    try:
        # The model was fine-tuned on clean_text() output truncated to 128
        # tokens, so apply the same preprocessing and length limit here;
        # otherwise inference sees punctuation and lengths it never saw in
        # training. Any service that serves this model should do the same.
        model_input = clean_text(sentence['text'])
        result = sentiment_pipeline(model_input, truncation=True, max_length=128)[0]
        print(f"Language: {sentence['language']}")
        print(f"Text: {sentence['text']}")
        print(f"Sentiment: {result['label']} (score: {result['score']:.4f})")
        print(f"Expected: {sentence['expected']}")
        print('-' * 80)
    except Exception as e:
        logger.error(f"Failed to analyze '{sentence['text']}': {str(e)}")
        print(f"Error analyzing '{sentence['text']}': {str(e)}")