# In [None]:
import pandas as pd
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          Trainer, TrainingArguments)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import time

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Read the train / dev / test CSV files, in that order.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ps-dataset/PS_train.csv',
        '/kaggle/input/ps-dataset/PS_dev.csv',
        '/kaggle/input/ps-dataset/PS_test_without_lables.csv',
    )
)

# Drop, in place, every row that has a missing value in any column.
for _frame in (train_data, val_data, test_data):
    _frame.dropna(inplace=True)

# Keep a reproducible 10% sample of the training rows to shorten training.
train_data = train_data.sample(frac=0.1, random_state=42)

# Build the Indic NLP normalizer for the "ta" language code.
normalizer_factory = IndicNormalizerFactory()
normalizer = normalizer_factory.get_normalizer("ta")

def normalize_text(text):
    """Return ``text`` after passing it through the module-level normalizer."""
    normalized = normalizer.normalize(text)
    return normalized

# Normalize every split's text, then truncate long texts to 512 chars.
for _frame in (train_data, val_data, test_data):
    _frame['content'] = _frame['content'].apply(normalize_text).str[:512]

# Encode labels to integers.
# The encoder is fitted on the union of the train and dev labels: train_data
# is only a 10% sample, so a class that appears in the dev set may be absent
# from it, and LabelEncoder.transform raises ValueError on unseen labels.
# When the dev labels are a subset of the train labels the ids are unchanged.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([train_data['labels'], val_data['labels']]))
train_data['labels'] = label_encoder.transform(train_data['labels'])
val_data['labels'] = label_encoder.transform(val_data['labels'])

# Train-Test Split: hold out 10% of the sampled training rows for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['content'], train_data['labels'], test_size=0.1, random_state=42
)

# Dataset Class
class TamilDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    ``encodings`` is a mapping whose values can be indexed by sample
    position; ``labels`` is a sequence indexed the same way.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Train and Evaluate Model
def train_and_evaluate(model_name):
    """Fine-tune ``model_name`` for sequence classification and score it.

    Reads the module-level ``train_texts``/``train_labels`` and
    ``val_texts``/``val_labels`` splits, tokenizes them, trains for one epoch
    with the Hugging Face ``Trainer``, then prints the accuracy, a
    classification report and the elapsed wall-clock time.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        The predicted class ids for the validation split (argmax over axis 1
        of the raw predictions).
    """
    print(f"Training {model_name}...")
    start = time.time()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize Data
    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=256)
    val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=256)

    # Create Datasets
    train_dataset = TamilDataset(train_encodings, train_labels.tolist())
    val_dataset = TamilDataset(val_encodings, val_labels.tolist())

    # Load Model
    # Size the classification head from the fitted label encoder rather than
    # from the distinct labels seen in the training split: if a class is
    # missing from that split, counting distinct values undercounts and a
    # higher-numbered label id would fall outside the head's output range.
    num_labels = len(label_encoder.classes_)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.to(device)

    # Training Arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        logging_dir=f"./logs/{model_name}",
        logging_steps=200,
        fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
        report_to="none"
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    # Train and Evaluate
    trainer.train()
    predictions = trainer.predict(val_dataset).predictions.argmax(axis=1)
    accuracy = accuracy_score(val_labels, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(classification_report(val_labels, predictions))
    # The timer started before tokenization, so this figure also covers
    # tokenization, model loading and the prediction pass.
    print("Training Time:", time.time() - start)

    return predictions

# Checkpoint to fine-tune; chosen as a lightweight model.
model_name = "distilbert-base-multilingual-cased"

# Run the full train-and-evaluate pipeline for the chosen checkpoint.
train_and_evaluate(model_name=model_name)
