<a href="https://colab.research.google.com/github/Ab-bijoy/Detecting_AI-generated-product-reviews/blob/main/Data%20augmentation/Data_Augmentation_with_MuRIL(Mal).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [1]:
# Install the packages this notebook needs (runs as a shell command in the notebook).
# NOTE(review): tensorflow is pinned here but is never imported in the visible cells — confirm it is still required.
!pip install tensorflow==2.12.0 transformers pandas torch

import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer
import numpy as np
import re
from tqdm import tqdm
import random



Dataset archive (`Train.zip`) on Google Drive: https://drive.google.com/file/d/1-Jb2s5kM8hPFfjYkNj3XcLozo1uAL5jH/view?usp=sharing

In [2]:
# Download the Google Drive file with this ID, then quietly unpack Train.zip.
# NOTE(review): newer gdown releases deprecate the `--id` flag — confirm this still runs on the current runtime.
!gdown --id 1-Jb2s5kM8hPFfjYkNj3XcLozo1uAL5jH
!unzip -q Train.zip

Downloading...
From: https://drive.google.com/uc?id=1Too1o1eIAazaM-XQXYKI_9gR8dxcoiYu
To: /content/Train.zip
100% 66.9k/66.9k [00:00<00:00, 5.19MB/s]


# **Configure Files and Column**

In [3]:
# CSV file that the main cell reads as the augmentation input.
INPUT_CSV_PATH = '/content/Train/mal_training_data_hum_ai.csv'
# Name of the CSV column whose text is augmented.
TEXT_COLUMN_NAME = 'DATA'
# File the augmented DataFrame is written to.
OUTPUT_CSV_PATH = 'augmented_output.csv'

# **Loads the MuRIL fill-mask pipeline.**

In [4]:
def load_model():
    """Create the MuRIL fill-mask pipeline.

    The pipeline is placed on GPU device 0 when CUDA is available and on the
    CPU (device -1) otherwise.

    Returns:
        The fill-mask pipeline object, or None if anything went wrong while
        building it (the error is printed rather than raised).
    """
    print("Loading MuRIL model")
    try:
        # A GPU makes the per-row inference considerably faster.
        gpu_available = torch.cuda.is_available()
        fill_mask = pipeline(
            'fill-mask',
            model='google/muril-base-cased',
            device=0 if gpu_available else -1,
        )
        print(" Model loaded successfully.")
        device_note = (
            " Running on GPU for faster performance."
            if gpu_available
            else "Running on CPU. For large datasets, a GPU is recommended."
        )
        print(device_note)
        return fill_mask
    except Exception as load_error:
        # Best effort: report the problem and let the caller test for None.
        print(f" Error loading model: {load_error}")
        return None

# **Performs contextual word substitution on a single sentence.**

In [5]:
def contextual_augmentation(sentence: str, unmasker_pipeline) -> str:
    """Replace one randomly chosen interior word with a model prediction.

    The sentence is split on whitespace, one word that is neither first nor
    last is swapped for the pipeline's mask token, and the first usable
    prediction that differs (case-insensitively) from the original word is
    substituted back in.

    Args:
        sentence: Text to augment. Non-string values are returned unchanged.
        unmasker_pipeline: Fill-mask pipeline; it must be callable as
            ``pipeline(text, top_k=...)`` and expose ``tokenizer.mask_token``.
            A falsy value (e.g. None after a failed load) disables
            augmentation.

    Returns:
        The augmented sentence with words re-joined by single spaces, or the
        original ``sentence`` object when no augmentation was possible
        (non-string input, fewer than three words, model failure, or no
        suitable prediction).
    """
    if not isinstance(sentence, str) or not unmasker_pipeline:
        return sentence  # Nothing to do for non-text input or a missing model

    words = sentence.split()
    if len(words) <= 2:  # Too short to have an interior word to mask
        return sentence

    # Pick an interior position so the model sees context on both sides.
    mask_index = random.randint(1, len(words) - 2)
    original_word = words[mask_index]

    # Build the masked copy without touching `words`, which is reused below.
    masked_words = list(words)
    masked_words[mask_index] = unmasker_pipeline.tokenizer.mask_token
    masked_sentence = " ".join(masked_words)

    try:
        predictions = unmasker_pipeline(masked_sentence, top_k=5)
    except Exception:
        # Model-side failure: keep the original text. Deliberately not a bare
        # `except`, so Ctrl-C / kernel interrupts still stop a long run.
        return sentence

    for pred in predictions:
        # When the text already contained the mask token the pipeline returns
        # one list per mask instead of dicts; those entries are skipped.
        if not isinstance(pred, dict):
            continue
        predicted_token = pred['token_str'].strip()
        # Skip empty predictions and WordPiece continuation pieces ("##...");
        # a continuation piece is a word fragment, not a standalone word.
        if not predicted_token or predicted_token.startswith('##'):
            continue
        if predicted_token.lower() != original_word.lower():
            words[mask_index] = predicted_token
            return " ".join(words)

    return sentence  # No suitable replacement was found

# **Main Execution**

In [6]:

if __name__ == "__main__":
    # Seed Python's RNG so the randomly chosen mask positions are the same on
    # every Restart & Run All, which keeps the augmented file reproducible.
    random.seed(42)

    # 1. Load the augmentation model (load_model returns None on failure)
    unmasker = load_model()

    if unmasker:
        try:
            # 2. Read the input CSV file
            print(f"\nReading data from '{INPUT_CSV_PATH}'...")
            # on_bad_lines='skip' drops rows pandas cannot parse instead of failing
            df = pd.read_csv(INPUT_CSV_PATH, on_bad_lines='skip')
            print(" Input file read successfully.")

            # Show the parsed column names so a wrong TEXT_COLUMN_NAME is easy to spot
            print("Columns after reading CSV:", df.columns)

            if TEXT_COLUMN_NAME not in df.columns:
                raise ValueError(f"Column '{TEXT_COLUMN_NAME}' not found in the CSV.")

            # 3. Apply the augmentation function to the configured text column
            print(f"Augmenting text in the '{TEXT_COLUMN_NAME}' column... This may take time.")

            # Register progress_apply on pandas objects for a progress bar
            tqdm.pandas(desc="Augmenting rows")
            df['augmented_text'] = df[TEXT_COLUMN_NAME].progress_apply(
                lambda text: contextual_augmentation(text, unmasker)
            )

            # Append the augmented text to the original text in the configured
            # column. TEXT_COLUMN_NAME is used instead of a hard-coded 'DATA'
            # so the cell keeps working when the configuration changes.
            # NOTE(review): each saved row therefore holds the original text
            # followed by its augmented copy — confirm this doubling is intended.
            print("\nMerging original and augmented text...")
            df[TEXT_COLUMN_NAME] = (
                df[TEXT_COLUMN_NAME].astype(str) + " " + df['augmented_text'].astype(str)
            )
            print(" Text merged successfully.")

            # The helper column is no longer needed once it has been merged
            df = df.drop(columns=['augmented_text'])
            print(" Dropped 'augmented_text' column.")

            # 4. Save the new DataFrame to an output CSV
            print(f"\nSaving augmented data to '{OUTPUT_CSV_PATH}'...")
            df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
            print(f" Process complete! Augmented file saved as '{OUTPUT_CSV_PATH}'.")

            # Display a sample of the result
            print("\n--- Sample of Augmented Data ---")
            print(df[[TEXT_COLUMN_NAME]].head())
            print("--------------------------------\n")

        except FileNotFoundError:
            print(f" ERROR: The file '{INPUT_CSV_PATH}' was not found.")
        except ValueError as ve:
            print(f" ERROR: {ve}")
        except Exception as e:
            print(f" An unexpected error occurred: {e}")

Loading MuRIL model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Device set to use cuda:0


 Model loaded successfully.
 Running on GPU for faster performance.

Reading data from '/content/Train/mal_training_data_hum_ai.csv'...
 Input file read successfully.
Columns after reading CSV: Index(['ID', 'DATA', 'LABEL'], dtype='object')
Augmenting text in the 'DATA' column... This may take time.



Augmenting rows:   0%|          | 0/800 [00:00<?, ?it/s][A
Augmenting rows:   0%|          | 2/800 [00:00<06:12,  2.14it/s][A
Augmenting rows:   1%|          | 5/800 [00:01<02:19,  5.69it/s][A
Augmenting rows:   1%|▏         | 10/800 [00:01<01:04, 12.22it/s][AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

Augmenting rows:   2%|▏         | 15/800 [00:01<00:42, 18.42it/s][A
Augmenting rows:   2%|▏         | 19/800 [00:01<00:34, 22.47it/s][A
Augmenting rows:   3%|▎         | 25/800 [00:01<00:26, 29.59it/s][A
Augmenting rows:   4%|▍         | 30/800 [00:01<00:23, 32.62it/s][A
Augmenting rows:   4%|▍         | 36/800 [00:01<00:20, 37.86it/s][A
Augmenting rows:   5%|▌         | 41/800 [00:01<00:19, 39.13it/s][A
Augmenting rows:   6%|▌         | 47/800 [00:02<00:19, 38.15it/s][A
Augmenting rows:   6%|▋         | 52/800 [00:02<00:29, 25.53it/s][A
Augmenting rows:   7%|▋         | 56/800 [00:02<00:29, 25.56it/s][A
Augm


Merging original and augmented text...
 Text merged successfully.
 Dropped 'augmented_text' column.

Saving augmented data to 'augmented_output.csv'...
 Process complete! Augmented file saved as 'augmented_output.csv'.

--- Sample of Augmented Data ---
                                                DATA
0  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...
1  ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം ഈ ഫേസ...
2  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...
3  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...
4  ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലത...
--------------------------------



In [7]:
# Preview the frame left by the main cell; DATA now holds the original text followed by its augmented copy.
print(df.head())

                ID                                               DATA  LABEL
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...  HUMAN
1  MAL_HUAI_TR_002  ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം ഈ ഫേസ...  HUMAN
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...  HUMAN
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...  HUMAN
4  MAL_HUAI_TR_005  ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലത...  HUMAN


In [8]:
# Reload the untouched originals and the augmented rows from disk, using the
# paths from the configuration cell so both cells always read the same files.
original_df = pd.read_csv(INPUT_CSV_PATH, on_bad_lines='skip')
augmented_df = pd.read_csv(OUTPUT_CSV_PATH, on_bad_lines='skip')

# Give the augmented rows fresh IDs that continue after the highest numeric
# suffix already present in the original IDs. Deriving the offset from the row
# index instead would start at the row count, which collides with the last
# original ID whenever the original IDs are numbered from 1.
ID_PREFIX = 'MAL_HUAI_TR_'
existing_id_numbers = pd.to_numeric(
    original_df['ID'].astype(str).str.extract(r'(\d+)$', expand=False),
    errors='coerce',
)
next_id_number = (
    int(existing_id_numbers.max()) + 1 if existing_id_numbers.notna().any() else 1
)
augmented_df['ID'] = [
    f"{ID_PREFIX}{id_number:03d}"  # zero-pad to at least three digits
    for id_number in range(next_id_number, next_id_number + len(augmented_df))
]
assert not set(augmented_df['ID']) & set(original_df['ID']), "augmented IDs collide with original IDs"

# Concatenate the original and augmented dataframes
merged_final_df = pd.concat([original_df, augmented_df], ignore_index=True)

# Display the head of the merged dataframe
print("Merged Final DataFrame Head:")
display(merged_final_df.head())


Merged Final DataFrame Head:


Unnamed: 0,ID,DATA,LABEL
0,MAL_HUAI_TR_001,ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...,HUMAN
1,MAL_HUAI_TR_002,ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം,HUMAN
2,MAL_HUAI_TR_003,അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...,HUMAN
3,MAL_HUAI_TR_004,ബ്രോ ഇതെല്ലം യൂസ് ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...,HUMAN
4,MAL_HUAI_TR_005,ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ,HUMAN


# **Merged Final DataFrame Info**

In [9]:
# Summarise the combined frame: row count, columns, non-null counts and dtypes.
merged_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1600 non-null   object
 1   DATA    1600 non-null   object
 2   LABEL   1600 non-null   object
dtypes: object(3)
memory usage: 37.6+ KB


# **Save the final merged dataframe**

In [10]:
# Write the combined original + augmented rows to disk. The filename is held in
# one variable so the confirmation message always names the file actually written.
FINAL_CSV_PATH = 'final_merged_augmented_data(Mal).csv'
merged_final_df.to_csv(FINAL_CSV_PATH, index=False, encoding='utf-8')
print(f"\nFinal merged data saved to '{FINAL_CSV_PATH}'")


Final merged data saved to 'final_merged_augmented_data.csv'
