In [None]:
import pandas as pd
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          Trainer, TrainingArguments)
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import time
!pip install unidecode
from unidecode import unidecode
import re

In [None]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load Dataset

In [None]:
# Read the three CSV splits into DataFrames. Later cells read the
# 'content' and 'labels' columns of train/val and the 'content' and 'Id'
# columns of the test split.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/ps-dataset/PS_train.csv')
val_data = pd.read_csv(filepath_or_buffer='/kaggle/input/ps-dataset/PS_dev.csv')
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/ps-dataset/PS_test_without_lables.csv')

# Handle Missing Values

In [None]:
# Drop, in place, every row that has a missing value in any column.
# NOTE(review): this also removes rows from the test split, so those rows
# receive no prediction later on — confirm that is intended.
for split_frame in (train_data, val_data, test_data):
    split_frame.dropna(inplace=True)

# Preprocess Dataset

In [None]:

def preprocess_text(text):
    """Normalize a single text string before tokenization.

    The steps run in this order:
      1. pass the text through ``unidecode`` (the original author's intent
         is accent removal),
      2. lowercase it,
      3. delete every character that is neither a word character nor
         whitespace (underscores survive because ``\\w`` matches them),
      4. delete runs of digits,
      5. collapse whitespace runs to one space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    ascii_lowered = unidecode(text).lower()
    without_symbols = re.sub(r'[^\w\s]', '', ascii_lowered)
    without_digits = re.sub(r'\d+', '', without_symbols)
    return re.sub(r'\s+', ' ', without_digits).strip()

In [None]:
# Clean the 'content' column of every split with the same routine so the
# model sees identically normalized text at train, validation and test time.
for text_frame in (train_data, val_data, test_data):
    text_frame['content'] = text_frame['content'].apply(preprocess_text)

In [None]:
# Map the label values to integer class ids. The encoder is fitted on the
# training labels only; the validation labels are converted with that same
# fitted mapping.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['labels'])
train_data['labels'] = label_encoder.transform(train_data['labels'])
val_data['labels'] = label_encoder.transform(val_data['labels'])

In [None]:
# Dataset Class
class TamilDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(field, values)``
            pairs, where ``values[i]`` holds the data of sample ``i``.
        labels: Sequence with one label per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the label is
        stored under the ``'labels'`` key as a ``torch.long`` tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Train and Evaluate the Model

In [None]:
# Train and Evaluate Model
def train_and_evaluate(model_name, max_length=256):
    """Fine-tune a pretrained sequence classifier and report validation metrics.

    Reads the module-level ``train_data`` and ``val_data`` frames (their
    'content' and 'labels' columns), tokenizes both splits, trains with the
    Hugging Face ``Trainer``, then prints accuracy, a classification report
    and the elapsed wall-clock time for the validation split.

    Args:
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_length: Truncation length used when tokenizing. Defaults to 256.

    Returns:
        Tuple ``(model, tokenizer, trainer)`` for the trained model.
    """
    print(f"Training {model_name}...")
    start = time.time()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_encodings = tokenizer(list(train_data['content']), padding=True, truncation=True, max_length=max_length)
    val_encodings = tokenizer(list(val_data['content']), padding=True, truncation=True, max_length=max_length)

    train_labels = train_data['labels'].tolist()
    val_labels = val_data['labels'].tolist()
    train_dataset = TamilDataset(train_encodings, train_labels)
    val_dataset = TamilDataset(val_encodings, val_labels)

    # One output unit per distinct label present in the training split.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train_labels)))
    model.to(device)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=17,
        logging_dir=f"./logs/{model_name}",
        logging_steps=200,
        # Mixed precision needs CUDA; enabling it unconditionally breaks the
        # CPU fallback that the device-selection cell allows for.
        fp16=torch.cuda.is_available(),
        report_to="none",
        learning_rate=5e-6,
        warmup_steps=500,
        weight_decay=0.02,
        save_total_limit=1,
        # Restore the checkpoint with the lowest validation loss after training.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        gradient_accumulation_steps=8,
        seed=42,
        lr_scheduler_type="cosine",
        disable_tqdm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    trainer.train()

    # The highest-scoring class index per validation sample.
    predictions = trainer.predict(val_dataset).predictions.argmax(axis=1)
    accuracy = accuracy_score(val_labels, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(classification_report(val_labels, predictions))
    print("Training Time:", time.time() - start)

    return model, tokenizer, trainer

In [None]:
# Predict on Test Set
def predict_on_test(model, tokenizer, trainer, max_length=256,
                    output_path='/kaggle/working/mbert_predictions.csv'):
    """Predict labels for the module-level ``test_data`` and save them as CSV.

    Tokenizes ``test_data['content']``, runs ``trainer.predict``, converts the
    predicted class ids back to the original labels with the module-level
    ``label_encoder``, stores them in ``test_data['predicted_labels']`` and
    writes the 'Id' and 'predicted_labels' columns to ``output_path``.

    Args:
        model: Unused here; kept so existing calls keep working.
        tokenizer: Tokenizer used to encode the test texts.
        trainer: Trainer whose ``predict`` method produces the predictions.
        max_length: Truncation length for tokenization. Defaults to 256, the
            same length the training and validation splits are tokenized with.
        output_path: Destination CSV file for the predictions.
    """
    print("Predicting on Test Data...")

    test_encodings = tokenizer(list(test_data['content']), padding=True, truncation=True, max_length=max_length)
    # The test split has no labels; zeros are placeholders so the dataset
    # class can still build its items.
    test_dataset = TamilDataset(test_encodings, [0] * len(test_data))

    predictions = trainer.predict(test_dataset).predictions.argmax(axis=1)

    # Turn the integer class ids back into the original label values.
    predicted_labels = label_encoder.inverse_transform(predictions)

    test_data['predicted_labels'] = predicted_labels
    test_data[['Id', 'predicted_labels']].to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")


In [None]:
# Checkpoint to fine-tune.
model_name = "bert-base-multilingual-cased"

# Train and validate first, then reuse the resulting objects to label the test split.
training_outputs = train_and_evaluate(model_name)
model, tokenizer, trainer = training_outputs
predict_on_test(*training_outputs)
