<a href="https://colab.research.google.com/github/CBaffelli/CAS-NLP_Machine-translation/blob/main/05_CAS_NLP_final_project_data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk

# **Data augmentation**

This notebook performs data augmentation using two techniques: random word swaps and synonym replacement.

In [None]:
#@title Imports and varia
import random

import nltk
import pandas as pd
from nltk.corpus import wordnet

In [None]:
#@title Mount GDrive
# Colab-only: expose the user's Google Drive under /content/drive.
# NOTE(review): the read_csv calls in the "Load data" cell use bare relative
# file names - confirm the CSV files are reachable from the working directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Load data
#Load the datasets
# The columns hold free text, so every cell must stay a string.
# dtype=str alone does not guarantee that: with pandas' default NA parsing,
# cells such as "NA", "null", "None" or an empty cell are still read as
# float NaN, and the later sentence.split() calls fail on them.
# keep_default_na=False turns that parsing off (empty cells become '').
READ_CSV_OPTIONS = {'dtype': str, 'keep_default_na': False}

italian = pd.read_csv('italian.csv', **READ_CSV_OPTIONS)
french = pd.read_csv('french.csv', **READ_CSV_OPTIONS)
spanish = pd.read_csv('spanish.csv', **READ_CSV_OPTIONS)
romanian = pd.read_csv('romanian.csv', **READ_CSV_OPTIONS)
portuguese = pd.read_csv('portuguese.csv', **READ_CSV_OPTIONS)

#Mapping for the dataset: display name -> DataFrame.
#The display name is also used as the prefix of the augmented output files.
languages = {
    'Italian': italian,
    'French': french,
    'Spanish' : spanish,
    'Romanian' : romanian,
    'Portuguese' : portuguese
}


## **1. Random swap**

In [None]:
#@title Function to swap words, creating a new word order
def random_swap(sentence, num_swaps):
    """Return ``sentence`` with randomly chosen word pairs exchanged.

    The sentence is split on whitespace. ``num_swaps`` times, two distinct
    positions are drawn at random and the words at those positions trade
    places (a later swap may undo an earlier one). Sentences with fewer than
    two words are returned without any swap.

    Args:
        sentence: Text to shuffle.
        num_swaps: Number of pairwise exchanges to perform.

    Returns:
        The words joined by single spaces.
    """
    tokens = sentence.split()

    # Nothing to exchange with fewer than two words.
    if len(tokens) < 2:
        return ' '.join(tokens)

    shuffled = list(tokens)
    positions = range(len(tokens))
    for _ in range(num_swaps):
        first, second = random.sample(positions, 2)
        shuffled[first], shuffled[second] = shuffled[second], shuffled[first]

    return ' '.join(shuffled)


In [None]:
#@title Apply the swap and create new datasets
for language_name, language_df in languages.items():
  # Source and target are shuffled independently, two swaps per sentence;
  # the source column is processed before the target column.
  swapped_columns = {
      column: language_df[column].apply(random_swap, args=(2,))
      for column in ('sourceExpression', 'targetExpression')
  }
  df_output = pd.DataFrame(swapped_columns)
  # One output file per language, written without the index column.
  df_output.to_csv(f'{language_name}_swap.csv', index=False)


## **2. Synonym replacement**

In [None]:
#@title Function to replace words with synonyms
#Download the WordNet corpus from NLTK; synonym_replacement() below looks up
#synsets in it.
nltk.download('wordnet')

def synonym_replacement(sentence, num_replacements):
    """Replace randomly chosen words of ``sentence`` with WordNet synonyms.

    ``num_replacements`` positions are drawn at random (the same position may
    be drawn more than once). For each drawn word, the candidates are the
    first lemma of every WordNet synset found for that word; one of them is
    picked at random. Candidates equal to the word itself are skipped, and a
    word without any remaining candidate is left unchanged.

    Args:
        sentence: Text to augment; split on whitespace.
        num_replacements: Number of replacement attempts.

    Returns:
        The augmented sentence with words joined by single spaces. A sentence
        that contains no words yields an empty string.
    """
    words = sentence.split()
    if not words:
        # Nothing to replace; drawing an index from an empty range would raise.
        return ''

    new_words = words.copy()

    for _ in range(num_replacements):
        idx = random.randrange(len(words))
        # Look up the original word, so a position drawn twice is replaced
        # again from scratch rather than from its previous synonym.
        word = words[idx]

        first_lemmas = (syn.lemmas()[0].name() for syn in wordnet.synsets(word))
        candidates = [
            # WordNet joins multi-word lemmas with underscores ("ice_cream").
            name.replace('_', ' ')
            for name in first_lemmas
            # A synset's first lemma can be the word itself - not a replacement.
            if name.lower() != word.lower()
        ]
        if candidates:
            new_words[idx] = random.choice(candidates)

    return ' '.join(new_words)

In [None]:
#@title Apply the synonym replacement (to source only) and create new datasets
for language_name, language_df in languages.items():
  # Only the source side is rewritten (two replacement attempts per sentence);
  # the target column is carried over as-is.
  augmented_source = language_df['sourceExpression'].apply(synonym_replacement, args=(2,))
  df_output = pd.DataFrame({
      'sourceExpression': augmented_source,
      'targetExpression': language_df['targetExpression'],
  })
  # One output file per language, written without the index column.
  df_output.to_csv(f'{language_name}_synonym.csv', index=False)