<a href="https://colab.research.google.com/github/Ab-bijoy/Detecting_AI-generated-product-reviews/blob/main/Models%20/Transformer/Transformer_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [41]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)


Dataset archive (`Datasets.zip`) on Google Drive: https://drive.google.com/file/d/1vNvW3eC4mtT-Qbs6T0uTmITnRkgOmKHr/view?usp=sharing

In [None]:
!gdown --id 1vNvW3eC4mtT-Qbs6T0uTmITnRkgOmKHr


In [None]:
!unzip -q Datasets.zip


# **1. Data Loading and Preprocessing**

In [44]:
def load_and_preprocess_data(train_path, test_path):
    """Load the train/test CSV files, clean the text column and encode the labels.

    Args:
        train_path: Path to the training CSV; it must contain ``DATA`` and
            ``LABEL`` columns.
        test_path: Path to the test CSV; its ``Data``/``Label`` columns are
            renamed to ``DATA``/``LABEL`` to match the training frame.

    Returns:
        A ``(train_df, test_df, label_encoder)`` tuple. ``train_df`` keeps only
        the ``DATA`` and ``LABEL`` columns. In both frames ``DATA`` holds the
        cleaned text and ``LABEL`` the integer codes produced by
        ``label_encoder``, which is fitted on the training labels only.

    Raises:
        FileNotFoundError: If either CSV file cannot be found.
    """
    import unicodedata  # local import: only the text cleaner below needs it

    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
    except FileNotFoundError as exc:
        # Keep the original hint but surface which file was missing.
        raise FileNotFoundError(
            f"Make sure the dataset files are in the correct path. ({exc})"
        ) from exc

    test_df = test_df.rename(columns={'Data': 'DATA', 'Label': 'LABEL'})
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the frame returned by read_csv.
    train_df = train_df[['DATA', 'LABEL']].copy()

    def keep_char(ch):
        # Keep letters (L*), combining marks (M*), numbers (N*), underscores and
        # whitespace. The regex class [^\w\s] must not be used here: Python's \w
        # does not match combining marks, so it would delete the vowel signs and
        # viramas that Tamil and Malayalam words are built from.
        return ch == '_' or ch.isspace() or unicodedata.category(ch)[0] in 'LMN'

    def clean_text(text):
        text = str(text).lower()
        text = ''.join(ch for ch in text if keep_char(ch))
        # Collapse runs of whitespace and trim the ends.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    train_df['DATA'] = train_df['DATA'].apply(clean_text)
    test_df['DATA'] = test_df['DATA'].apply(clean_text)

    label_encoder = LabelEncoder()
    train_df['LABEL'] = label_encoder.fit_transform(train_df['LABEL'])
    test_df['LABEL'] = label_encoder.transform(test_df['LABEL'])

    return train_df, test_df, label_encoder

# **2. Dataset Class**

In [45]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw texts.
        labels: Sequence of integer-convertible labels, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialize as lists so __getitem__ always indexes by position, even
        # when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        # Call the tokenizer directly; `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten it
        # so each item is a 1-D tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# **3. Metric Calculation**

In [46]:
def compute_metrics(pred):
    """Turn a prediction object into accuracy, F1, precision and recall.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the last
            axis of ``predictions``.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result is unused here.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# **4. Model Setup**

In [47]:
def setup_model_and_tokenizer(model_name, num_labels):
    """Load the tokenizer and sequence-classification model for a checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained`` calls.
        num_labels: Number of output classes for the classification model.

    Returns:
        A ``(model, tokenizer)`` tuple (model first).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    return classifier, tokenizer


# **5. Training Arguments**

In [48]:
def get_training_arguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_batch_size=16,
    **overrides,
):
    """Build the ``TrainingArguments`` used for fine-tuning.

    Called without arguments, this returns the same configuration as before.
    The parameters let each run use its own output directory or a different
    schedule without editing this function.

    Args:
        output_dir: Directory passed as ``output_dir``.
        logging_dir: Directory passed as ``logging_dir``.
        num_train_epochs: Number of training epochs.
        per_device_batch_size: Batch size used for both training and evaluation.
        **overrides: Any further ``TrainingArguments`` keyword arguments; these
            take precedence over the defaults set here.

    Returns:
        A ``TrainingArguments`` instance.
    """
    settings = dict(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=logging_dir,
        logging_steps=10,
        # Evaluate and checkpoint once per epoch, keep a single checkpoint on
        # disk, and reload the checkpoint with the highest accuracy at the end.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none",
    )
    # Merge instead of passing both to the constructor so an override replaces
    # a default rather than raising a duplicate-keyword TypeError.
    settings.update(overrides)
    return TrainingArguments(**settings)


# **6. Trainer Setup**

In [49]:

def train_model(model, tokenizer, train_df, test_df, label_encoder):
    """Wrap both frames in datasets, build a ``Trainer`` and run training.

    Args:
        model: Model handed to the ``Trainer``.
        tokenizer: Tokenizer used to build both datasets.
        train_df: Frame with ``DATA`` and ``LABEL`` columns used for training.
        test_df: Frame with ``DATA`` and ``LABEL`` columns used as the
            ``Trainer``'s evaluation set.
        label_encoder: Accepted for interface compatibility; not used here.

    Returns:
        The ``Trainer`` after ``train()`` has completed.
    """
    def build_dataset(frame):
        # Both splits are built the same way; only the source frame differs.
        return TextClassificationDataset(
            texts=frame['DATA'].tolist(),
            labels=frame['LABEL'].tolist(),
            tokenizer=tokenizer,
        )

    train_split = build_dataset(train_df)
    # NOTE(review): the test split is also the Trainer's evaluation set.
    eval_split = build_dataset(test_df)

    trainer = Trainer(
        model=model,
        args=get_training_arguments(),
        train_dataset=train_split,
        eval_dataset=eval_split,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer

# **7. Evaluation**

In [50]:
def evaluate_model(trainer):
    """Run ``trainer.evaluate()`` and print the headline metrics.

    Args:
        trainer: Object whose ``evaluate()`` returns a mapping containing
            ``eval_accuracy``, ``eval_f1``, ``eval_precision`` and ``eval_recall``.

    Returns:
        The mapping returned by ``trainer.evaluate()``, unchanged.
    """
    results = trainer.evaluate()

    # (printed prefix, key in the results mapping), in display order.
    rows = (
        ("Accuracy : ", 'eval_accuracy'),
        ("F1 Score : ", 'eval_f1'),
        ("Precision: ", 'eval_precision'),
        ("Recall   : ", 'eval_recall'),
    )

    print("\n--- Evaluation Results ---")
    for prefix, key in rows:
        print(f"{prefix}{results[key]:.4f}")
    print("--------------------------")
    return results

# **Pipeline for indic-bert : https://huggingface.co/ai4bharat/indic-bert**

## **Tamil**

In [None]:
def main_pipeline(
    train_path="/content/Datasets/final_merged_augmented_data(tamil).csv",
    test_path="/content/Datasets/tamil-test.xlsx - Sheet1.csv",
    model_name="ai4bharat/indic-bert",
):
    """Run load -> model setup -> training -> evaluation for one dataset/model pair.

    The defaults reproduce this cell's configuration (indic-bert on the Tamil
    files); pass other values to reuse the pipeline without copying the cell.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.
        model_name: Checkpoint identifier used for the tokenizer and the model.

    Returns:
        The evaluation results returned by ``evaluate_model``.
    """
    print("🔹 Loading and preprocessing data...")
    train_df, test_df, label_encoder = load_and_preprocess_data(train_path, test_path)

    print("🔹 Initializing tokenizer and model...")
    model, tokenizer = setup_model_and_tokenizer(model_name, num_labels=len(label_encoder.classes_))

    print("🔹 Starting training...")
    trainer = train_model(model, tokenizer, train_df, test_df, label_encoder)

    print("🔹 Evaluating model...")
    # Return the metrics so they can be collected, not only read from stdout.
    return evaluate_model(trainer)


if __name__ == "__main__":
    main_pipeline()

## **Malayalam**

In [None]:
def main_pipeline(
    train_path="/content/Datasets/final_merged_augmented_data(Mal).csv",
    test_path="/content/Datasets/mal_test.xlsx - Sheet1.csv",
    model_name="ai4bharat/indic-bert",
):
    """Run load -> model setup -> training -> evaluation for one dataset/model pair.

    The defaults reproduce this cell's configuration (indic-bert on the
    Malayalam files); pass other values to reuse the pipeline without copying
    the cell.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.
        model_name: Checkpoint identifier used for the tokenizer and the model.

    Returns:
        The evaluation results returned by ``evaluate_model``.
    """
    print("🔹 Loading and preprocessing data...")
    train_df, test_df, label_encoder = load_and_preprocess_data(train_path, test_path)

    print("🔹 Initializing tokenizer and model...")
    model, tokenizer = setup_model_and_tokenizer(model_name, num_labels=len(label_encoder.classes_))

    print("🔹 Starting training...")
    trainer = train_model(model, tokenizer, train_df, test_df, label_encoder)

    print("🔹 Evaluating model...")
    # Return the metrics so they can be collected, not only read from stdout.
    return evaluate_model(trainer)


if __name__ == "__main__":
    main_pipeline()

# **Pipeline for muril-base-cased : https://huggingface.co/google/muril-base-cased**

## **Tamil**

In [None]:
def main_pipeline(
    train_path="/content/Datasets/final_merged_augmented_data(tamil).csv",
    test_path="/content/Datasets/tamil-test.xlsx - Sheet1.csv",
    model_name="google/muril-base-cased",
):
    """Run load -> model setup -> training -> evaluation for one dataset/model pair.

    The defaults reproduce this cell's configuration (muril-base-cased on the
    Tamil files); pass other values to reuse the pipeline without copying the
    cell.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.
        model_name: Checkpoint identifier used for the tokenizer and the model.

    Returns:
        The evaluation results returned by ``evaluate_model``.
    """
    print("🔹 Loading and preprocessing data...")
    train_df, test_df, label_encoder = load_and_preprocess_data(train_path, test_path)

    print("🔹 Initializing tokenizer and model...")
    model, tokenizer = setup_model_and_tokenizer(model_name, num_labels=len(label_encoder.classes_))

    print("🔹 Starting training...")
    trainer = train_model(model, tokenizer, train_df, test_df, label_encoder)

    print("🔹 Evaluating model...")
    # Return the metrics so they can be collected, not only read from stdout.
    return evaluate_model(trainer)


if __name__ == "__main__":
    main_pipeline()

## **Malayalam**

In [None]:
def main_pipeline(
    train_path="/content/Datasets/final_merged_augmented_data(Mal).csv",
    test_path="/content/Datasets/mal_test.xlsx - Sheet1.csv",
    model_name="google/muril-base-cased",
):
    """Run load -> model setup -> training -> evaluation for one dataset/model pair.

    The defaults reproduce this cell's configuration (muril-base-cased on the
    Malayalam files); pass other values to reuse the pipeline without copying
    the cell.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.
        model_name: Checkpoint identifier used for the tokenizer and the model.

    Returns:
        The evaluation results returned by ``evaluate_model``.
    """
    print("🔹 Loading and preprocessing data...")
    train_df, test_df, label_encoder = load_and_preprocess_data(train_path, test_path)

    print("🔹 Initializing tokenizer and model...")
    model, tokenizer = setup_model_and_tokenizer(model_name, num_labels=len(label_encoder.classes_))

    print("🔹 Starting training...")
    trainer = train_model(model, tokenizer, train_df, test_df, label_encoder)

    print("🔹 Evaluating model...")
    # Return the metrics so they can be collected, not only read from stdout.
    return evaluate_model(trainer)


if __name__ == "__main__":
    main_pipeline()

# **Pipeline for xlm-roberta : https://huggingface.co/FacebookAI/xlm-roberta-base**

## **Tamil**

In [None]:
def main_pipeline(
    train_path="/content/Datasets/final_merged_augmented_data(tamil).csv",
    test_path="/content/Datasets/tamil-test.xlsx - Sheet1.csv",
    model_name="FacebookAI/xlm-roberta-base",
):
    """Run load -> model setup -> training -> evaluation for one dataset/model pair.

    The defaults reproduce this cell's configuration (xlm-roberta-base on the
    Tamil files); pass other values to reuse the pipeline without copying the
    cell.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.
        model_name: Checkpoint identifier used for the tokenizer and the model.

    Returns:
        The evaluation results returned by ``evaluate_model``.
    """
    print("🔹 Loading and preprocessing data...")
    train_df, test_df, label_encoder = load_and_preprocess_data(train_path, test_path)

    print("🔹 Initializing tokenizer and model...")
    model, tokenizer = setup_model_and_tokenizer(model_name, num_labels=len(label_encoder.classes_))

    print("🔹 Starting training...")
    trainer = train_model(model, tokenizer, train_df, test_df, label_encoder)

    print("🔹 Evaluating model...")
    # Return the metrics so they can be collected, not only read from stdout.
    return evaluate_model(trainer)


if __name__ == "__main__":
    main_pipeline()

## **Malayalam**

In [None]:
def main_pipeline(
    train_path="/content/Datasets/final_merged_augmented_data(Mal).csv",
    test_path="/content/Datasets/mal_test.xlsx - Sheet1.csv",
    model_name="FacebookAI/xlm-roberta-base",
):
    """Run load -> model setup -> training -> evaluation for one dataset/model pair.

    The defaults reproduce this cell's configuration (xlm-roberta-base on the
    Malayalam files); pass other values to reuse the pipeline without copying
    the cell.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.
        model_name: Checkpoint identifier used for the tokenizer and the model.

    Returns:
        The evaluation results returned by ``evaluate_model``.
    """
    print("🔹 Loading and preprocessing data...")
    train_df, test_df, label_encoder = load_and_preprocess_data(train_path, test_path)

    print("🔹 Initializing tokenizer and model...")
    model, tokenizer = setup_model_and_tokenizer(model_name, num_labels=len(label_encoder.classes_))

    print("🔹 Starting training...")
    trainer = train_model(model, tokenizer, train_df, test_df, label_encoder)

    print("🔹 Evaluating model...")
    # Return the metrics so they can be collected, not only read from stdout.
    return evaluate_model(trainer)


if __name__ == "__main__":
    main_pipeline()