<a href="https://colab.research.google.com/github/Ab-bijoy/Detecting_AI-generated-product-reviews/blob/main/Data%20augmentation/Data_Augmentation_with_MuRIL(Tamil).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [None]:
!pip install tensorflow==2.12.0 transformers pandas torch

import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer
import numpy as np
import re
from tqdm import tqdm
import random

Training data archive (`Train.zip`) on Google Drive, downloaded by the next cell: https://drive.google.com/file/d/1Too1o1eIAazaM-XQXYKI_9gR8dxcoiYu/view?usp=sharing

In [None]:
!gdown --id 1Too1o1eIAazaM-XQXYKI_9gR8dxcoiYu
!unzip -q Train.zip

# **Configure Files and Column**

In [3]:
# CSV to augment, the column whose text is augmented, and where the result is written.
INPUT_CSV_PATH = '/content/Train/tam_training_data_hum_ai.csv'
TEXT_COLUMN_NAME = 'DATA'
OUTPUT_CSV_PATH = 'augmented_output.csv'

# The augmentation picks the word to mask with the `random` module; seeding it here
# makes the chosen positions repeatable when the notebook is run top to bottom.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# **Loads the MuRIL fill-mask pipeline.**

In [4]:
def load_model():
    """Build the MuRIL fill-mask pipeline, or return None when that fails.

    The pipeline is created on CUDA device 0 when torch reports a GPU and on
    the CPU (device -1) otherwise. Any exception raised along the way is
    printed and swallowed, so callers must check for a None result.
    """
    print("Loading MuRIL model")
    try:
        # A GPU makes the per-row inference considerably faster.
        gpu_available = torch.cuda.is_available()
        fill_mask = pipeline(
            'fill-mask',
            model='google/muril-base-cased',
            device=0 if gpu_available else -1,
        )
        print(" Model loaded successfully.")
        print(
            " Running on GPU for faster performance."
            if gpu_available
            else "Running on CPU. For large datasets, a GPU is recommended."
        )
        return fill_mask
    except Exception as exc:
        print(f" Error loading model: {exc}")
        return None

# **Performs contextual word substitution on a single sentence.**

In [5]:
def contextual_augmentation(sentence: str, unmasker_pipeline) -> str:
    """Replace one interior word of ``sentence`` with a fill-mask prediction.

    A random word other than the first and last is swapped for the pipeline's
    mask token, and the first usable prediction that differs from the original
    word (case-insensitively) is substituted in its place.

    Args:
        sentence: Text to augment. Non-string values (e.g. NaN) are returned as-is.
        unmasker_pipeline: Callable fill-mask pipeline exposing
            ``tokenizer.mask_token``. A falsy value disables augmentation.

    Returns:
        The augmented sentence with words joined by single spaces, or the
        original ``sentence`` unchanged when it has two words or fewer, the
        pipeline raises, or no usable replacement is predicted.
    """
    if not isinstance(sentence, str) or not unmasker_pipeline:
        return sentence

    words = sentence.split()
    if len(words) <= 2:  # Too short to leave context on both sides of the mask
        return sentence

    # Keep the first and last words so the mask always has context on both sides.
    mask_index = random.randint(1, len(words) - 2)
    original_word = words[mask_index]

    words[mask_index] = unmasker_pipeline.tokenizer.mask_token
    masked_sentence = " ".join(words)

    try:
        predictions = unmasker_pipeline(masked_sentence, top_k=5)
    except Exception:
        # Best effort: a failed prediction keeps the original text. `Exception`
        # (not a bare except) lets KeyboardInterrupt stop a long-running job.
        return sentence

    for pred in predictions:
        if not isinstance(pred, dict):
            # The Hugging Face pipeline returns one list per mask when the text
            # holds several mask tokens (i.e. the input already contained one);
            # there is no single prediction to apply, so leave the text alone.
            return sentence
        predicted_token = str(pred.get('token_str', '')).strip()
        # Skip empty predictions and WordPiece continuation pieces ("##..."),
        # which are word fragments rather than standalone words.
        if not predicted_token or predicted_token.startswith('##'):
            continue
        if predicted_token.lower() != original_word.lower():
            words[mask_index] = predicted_token
            return " ".join(words)

    return sentence  # No usable replacement among the predictions

# **Main Execution**

In [None]:

if __name__ == "__main__":
    # 1. Load the augmentation model (None when loading failed).
    unmasker = load_model()

    if unmasker:
        try:
            # 2. Read the input CSV file.
            print(f"\nReading data from '{INPUT_CSV_PATH}'...")
            # on_bad_lines='skip' drops rows that cannot be parsed instead of failing.
            df = pd.read_csv(INPUT_CSV_PATH, on_bad_lines='skip')
            print(" Input file read successfully.")

            print("Columns after reading CSV:", df.columns)

            if TEXT_COLUMN_NAME not in df.columns:
                raise ValueError(f"Column '{TEXT_COLUMN_NAME}' not found in the CSV.")

            # 3. Apply the augmentation function to the configured text column.
            print(f"Augmenting text in the '{TEXT_COLUMN_NAME}' column... This may take time.")

            # Use tqdm for a progress bar
            tqdm.pandas(desc="Augmenting rows")
            df['augmented_text'] = df[TEXT_COLUMN_NAME].progress_apply(
                lambda text: contextual_augmentation(text, unmasker)
            )

            # Merge the original text and its augmented variant into the text column.
            # The configured column name is used throughout (rather than a
            # hard-coded 'DATA') so changing TEXT_COLUMN_NAME keeps this cell working.
            print("\nMerging original and augmented text...")
            merged_text = (
                df[TEXT_COLUMN_NAME].astype(str) + " " + df['augmented_text'].astype(str)
            )
            # Rows with missing text stay missing instead of becoming the literal "nan nan".
            df[TEXT_COLUMN_NAME] = merged_text.where(df[TEXT_COLUMN_NAME].notna())
            print(" Text merged successfully.")

            # Drop the helper column now that it has been merged.
            df = df.drop(columns=['augmented_text'])
            print(" Dropped 'augmented_text' column.")

            # 4. Save the new DataFrame to an output CSV
            print(f"\nSaving augmented data to '{OUTPUT_CSV_PATH}'...")
            df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
            print(f" Process complete! Augmented file saved as '{OUTPUT_CSV_PATH}'.")

            # Display a sample of the result
            print("\n--- Sample of Augmented Data ---")
            print(df[[TEXT_COLUMN_NAME]].head())
            print("--------------------------------\n")

        except FileNotFoundError:
            print(f" ERROR: The file '{INPUT_CSV_PATH}' was not found.")
        except ValueError as ve:
            print(f" ERROR: {ve}")
        except Exception as e:
            # Best effort: report the problem and leave the notebook usable.
            print(f" An unexpected error occurred: {e}")

In [None]:
# A bare last expression renders the frame as a rich table instead of plain text.
df.head()

In [None]:
# Reload the original rows and the augmented rows, using the paths from the
# configuration cell so the two cells cannot drift apart.
original_df = pd.read_csv(INPUT_CSV_PATH, on_bad_lines='skip')
augmented_df = pd.read_csv(OUTPUT_CSV_PATH, on_bad_lines='skip')

# Fallback numbering based on the row index of the original frame.
max_original_id_index = original_df.index.max() if not original_df.empty else -1
next_id_number = max_original_id_index + 1

# Prefer continuing after the largest number already used in the original IDs:
# index-based numbering can collide with an existing ID when IDs are 1-based or
# when rows were skipped while parsing.
# NOTE(review): assumes existing IDs end in a numeric suffix - confirm against the data.
if 'ID' in original_df.columns:
    existing_id_numbers = pd.to_numeric(
        original_df['ID'].astype(str).str.extract(r'(\d+)$', expand=False),
        errors='coerce',
    ).dropna()
    if not existing_id_numbers.empty:
        next_id_number = max(next_id_number, int(existing_id_numbers.max()) + 1)

augmented_df['ID'] = 'TAM_HUAI_TR_' + (augmented_df.index + next_id_number).astype(str)

# Concatenate the original and augmented dataframes
merged_final_df = pd.concat([original_df, augmented_df], ignore_index=True)

# Display the head of the merged dataframe
print("Merged Final DataFrame Head:")
display(merged_final_df.head())


# **Merged Final DataFrame Info**

In [None]:
# Print the merged frame's summary (columns, dtypes, non-null counts).
merged_final_df.info()

# **Save the final merged dataframe**

In [None]:
# Keep the path in one place so the confirmation message names the file actually written.
FINAL_OUTPUT_CSV_PATH = 'final_merged_augmented_data(tamil).csv'
merged_final_df.to_csv(FINAL_OUTPUT_CSV_PATH, index=False, encoding='utf-8')
print(f"\nFinal merged data saved to '{FINAL_OUTPUT_CSV_PATH}'")