<a href="https://colab.research.google.com/github/AbeerProg/RRDS/blob/main/Augmentation%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
!pip install deep-translator




In [None]:
import logging
import multiprocessing
import random
import re
from multiprocessing.pool import ThreadPool

import nltk
import pandas as pd
from deep_translator import GoogleTranslator
from nltk.corpus import wordnet
from tqdm import tqdm

# Fetch the WordNet corpus and the Open Multilingual WordNet data that
# nltk.corpus.wordnet relies on.
for _resource in ("wordnet", "omw-1.4"):
    nltk.download(_resource)

# ------------------------------
# Define Text Augmentation Functions
# ------------------------------

def _pick_synonym(word):
    """Return the first WordNet lemma that differs from ``word``, or None."""
    lowered = word.lower()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # WordNet joins multi-word lemmas with underscores ("ice_cream").
            candidate = lemma.name().replace("_", " ")
            if candidate.lower() != lowered:
                return candidate
    return None


def synonym_replacement(text, n=2):
    """Replace up to n distinct words in the text with a WordNet synonym.

    Words are visited in random order. A word only counts as replaced when
    WordNet offers a lemma that actually differs from it (case-insensitive),
    so the limit ``n`` is spent on real changes. Every occurrence of a
    chosen word is replaced.

    Args:
        text: Input string; it is tokenised on whitespace.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            the words untouched.

    Returns:
        The words joined by single spaces, with the chosen words replaced.
    """
    words = text.split()
    if n <= 0:
        return " ".join(words)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below is reproducible under random.seed (set order is not).
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    replacements = {}
    for candidate in candidates:
        synonym = _pick_synonym(candidate)
        if synonym is None:
            continue
        replacements[candidate] = synonym
        if len(replacements) >= n:
            break

    return " ".join(replacements.get(word, word) for word in words)

def random_deletion(text, p=0.2):
    """Drop each whitespace-separated word independently with probability p.

    Args:
        text: Input string.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined by single spaces. The original ``text``
        is returned unchanged when it holds exactly one word or when every
        word would have been deleted.
    """
    tokens = text.split()
    if len(tokens) == 1:
        return text

    survivors = []
    for token in tokens:
        # A word is kept when its draw lands above the deletion threshold.
        if random.uniform(0, 1) > p:
            survivors.append(token)

    if not survivors:
        return text
    return " ".join(survivors)

def swap_words(text, n=2):
    """Exchange the positions of two randomly chosen words, n times over.

    Args:
        text: Input string; it is tokenised on whitespace.
        n: Number of swaps to perform.

    Returns:
        The shuffled words joined by single spaces, or ``text`` unchanged
        when it contains fewer than two words.
    """
    tokens = text.split()
    count = len(tokens)
    if count < 2:
        return text

    for _ in range(n):
        # Two distinct positions are drawn for every swap.
        first, second = random.sample(range(count), 2)
        held = tokens[first]
        tokens[first] = tokens[second]
        tokens[second] = held

    return " ".join(tokens)

def back_translate(text, src="en", target="fr"):
    """Translate text from src to target and back again (best effort).

    Args:
        text: The text to paraphrase.
        src: Language code of the input (and of the returned text).
        target: Language code of the intermediate translation.

    Returns:
        The round-tripped text. ``text`` itself is returned unchanged when
        it is not a non-blank string, when either translation call raises,
        or when the round trip yields an empty result.
    """
    # Missing values (None/NaN) and blank strings have nothing to translate;
    # skip the two network round trips that would only fail.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        translated = GoogleTranslator(source=src, target=target).translate(text)
        back_translated = GoogleTranslator(source=target, target=src).translate(translated)
    except Exception as exc:
        # Deliberate best-effort boundary: keep the original text, but
        # report the failure instead of hiding it.
        logging.getLogger(__name__).warning(
            "Back translation failed, keeping original text: %s", exc
        )
        return text

    # Guard against an empty or None result so callers always get usable text.
    return back_translated if back_translated else text

def parallel_back_translate(text_list, num_workers=4):
    """Back-translate every text in text_list concurrently.

    The work is network-bound, so a thread pool is used: it overlaps the
    waiting time of the requests without having to pickle the worker
    function or start extra processes.

    Args:
        text_list: Sequence of texts to pass to ``back_translate``.
        num_workers: Number of concurrent workers.

    Returns:
        A list with one result per input text, in input order.
    """
    if len(text_list) == 0:
        return []

    with ThreadPool(processes=num_workers) as pool:
        # imap yields results in input order, letting tqdm show live progress.
        return list(tqdm(pool.imap(back_translate, text_list), total=len(text_list)))

# ------------------------------
# Define Numeric Augmentation Function
# ------------------------------

def augment_numeric_features(row, numeric_cols, noise_factor=0.05):
    """Return noisy copies of the numeric values found in row.

    Each value is perturbed by a random amount drawn uniformly from
    ±(noise_factor * value).

    Args:
        row: Mapping-like object indexed by column name.
        numeric_cols: Names of the columns to read, in output order.
        noise_factor: Maximum relative size of the noise.

    Returns:
        A list of floats, one per entry of ``numeric_cols``. Values that
        are missing or cannot be converted to float are treated as 0.
    """
    new_values = []
    for col in numeric_cols:
        try:
            val = float(row[col])
        except (KeyError, TypeError, ValueError):
            # Only lookup/conversion problems fall back to 0; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            val = 0.0
        noise = random.uniform(-noise_factor, noise_factor) * val
        new_values.append(val + noise)
    return new_values

# ------------------------------
# Load Original Dataset
# ------------------------------

# Read the source spreadsheet and strip stray whitespace from the headers.
df = pd.read_excel("Final_dataset.xlsx")
df.columns = df.columns.str.strip()

all_columns = list(df.columns)
# Everything between the first and the last column is treated as numeric.
numeric_cols = all_columns[1:-1]

print("Original columns:", all_columns)
print("Numeric columns:", numeric_cols)

# ------------------------------
# Parallel Back Translation for Speed
# ------------------------------

print("Performing parallel back translation on text column...")

# Back-translate the whole text column up front and keep the results in a
# helper column, aligned row-for-row with the original texts.
source_texts = df["text"].tolist()
back_translated_texts = parallel_back_translate(source_texts, num_workers=4)
df["back_translation"] = back_translated_texts

print("Back translation complete.")

# ------------------------------
# Create Augmented Data (New File)
# ------------------------------

# We'll create a new DataFrame with augmented versions of the text.
# Each augmented row will have the same columns as the original (excluding the back_translation column).
# For each original row, we'll generate 4 augmented rows:
# 1. Synonym Replacement
# 2. Random Deletion
# 3. Word Swap
# 4. Back Translation (from our parallel output)

# Exclude 'back_translation' from column list for final augmented file.
# Output schema for the augmented file: the original columns, without the
# helper "back_translation" column.
aug_columns = [name for name in all_columns if name != "back_translation"]

augmented_rows = []

for _, record in df.iterrows():
    source_text = record["text"]
    back_text = record["back_translation"]  # precomputed by the parallel step
    label = record["label"]

    # One text variant per technique, always in this order:
    # synonym replacement, random deletion, word swap, back translation.
    text_variants = [
        synonym_replacement(source_text),
        random_deletion(source_text),
        swap_words(source_text),
        back_text,
    ]

    # Each variant gets its own independently noised copy of the numeric
    # features. Rows are laid out as [text] + [numeric features] + [label],
    # which is matched positionally against aug_columns below.
    for variant in text_variants:
        noisy_features = augment_numeric_features(record, numeric_cols)
        augmented_rows.append([variant, *noisy_features, label])

# Debug: the row width must equal the number of output columns.
print("Sample augmented row length:", len(augmented_rows[0]))
print("Expected number of columns:", len(aug_columns))

# Assemble the augmented rows into a frame and write it out.
augmented_df = pd.DataFrame(augmented_rows, columns=aug_columns)
augmented_df.to_excel("augmented_dataset.xlsx", index=False)
print("Augmented dataset saved as 'augmented_dataset.xlsx'.")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original columns: ['text', 'PM Ratio', 'FPSP Ratio', 'Review Length', 'RW Ratio', 'Sentiment', 'Generalization', 'Passive Voice', 'Total reviewer reviews', 'Account type', 'Useful votes', 'Attached Medias', 'label']
Numeric columns: ['PM Ratio', 'FPSP Ratio', 'Review Length', 'RW Ratio', 'Sentiment', 'Generalization', 'Passive Voice', 'Total reviewer reviews', 'Account type', 'Useful votes', 'Attached Medias']
Performing parallel back translation on text column...


100%|██████████| 21476/21476 [34:35<00:00, 10.35it/s]


Back translation complete.
Sample augmented row length: 13
Expected number of columns: 13
Augmented dataset saved as 'augmented_dataset.xlsx'.
