In [None]:
# Install the third-party packages used by this notebook.
# googletrans is pinned to the 4.0.0-rc1 release candidate, the version the
# import cell below asks for.
!pip install googletrans==4.0.0-rc1
# dask supplies the `bag` and `diagnostics` modules imported below.
!pip install dask
# NOTE(review): nlpaug is imported below but is not installed by this cell --
# confirm it is already available in the target environment.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from googletrans import Translator # use version 4.0.0-rc1
from dask import bag, diagnostics
import nlpaug.augmenter.word as naw
import nlpaug.flow as naf

In [None]:
def random_char_insertion(text):
    """Return ``text`` with character-level noise from nlpaug's ``KeyboardAug``.

    NOTE(review): the function name says "insertion", but the augmenter used
    here is ``KeyboardAug`` (nlpaug's keyboard-typo augmenter) -- confirm that
    this is the intended kind of noise.

    Args:
        text: The string to augment.

    Returns:
        The augmented string, or ``text`` unchanged when the augmenter
        produced no output.
    """
    # The notebook's import cell only binds ``naw`` (word augmenters) and
    # ``naf`` (flows); ``nac`` was never imported, so bind it here to avoid a
    # NameError on the first call.
    import nlpaug.augmenter.char as nac

    aug = nac.KeyboardAug()
    augmented = aug.augment(text)

    # Depending on the nlpaug release, ``augment`` returns either a list of
    # strings or a bare string; indexing a bare string would keep only its
    # first character.
    if isinstance(augmented, str):
        return augmented
    # An empty result (e.g. nothing to augment) would make ``[0]`` raise
    # IndexError, so fall back to the untouched input.
    if len(augmented) == 0:
        return text
    return augmented[0]

def random_swap(text):
    """Return ``text`` after nlpaug's ``RandomWordAug`` with ``action="swap"``.

    Args:
        text: The string to augment.

    Returns:
        The augmented string, or ``text`` unchanged when the augmenter
        produced no output.
    """
    aug = naw.RandomWordAug(action="swap")
    augmented = aug.augment(text)

    # Depending on the nlpaug release, ``augment`` returns either a list of
    # strings or a bare string; indexing a bare string would keep only its
    # first character.
    if isinstance(augmented, str):
        return augmented
    # An empty result (e.g. nothing to augment) would make ``[0]`` raise
    # IndexError, so fall back to the untouched input.
    if len(augmented) == 0:
        return text
    return augmented[0]

def synonym_replacement(text):
    """Return ``text`` after nlpaug's WordNet-backed ``SynonymAug``.

    Args:
        text: The string to augment.

    Returns:
        The augmented string, or ``text`` unchanged when the augmenter
        produced no output.
    """
    aug = naw.SynonymAug(aug_src='wordnet')
    augmented = aug.augment(text)

    # Depending on the nlpaug release, ``augment`` returns either a list of
    # strings or a bare string; indexing a bare string would keep only its
    # first character.
    if isinstance(augmented, str):
        return augmented
    # An empty result (e.g. nothing to augment) would make ``[0]`` raise
    # IndexError, so fall back to the untouched input.
    if len(augmented) == 0:
        return text
    return augmented[0]

def back_translate(sequence, target_lang):
    """Paraphrase ``sequence`` by translating it to ``target_lang`` and back.

    The language of ``sequence`` is detected first. When it is one of the
    supported codes listed below, the text is translated to ``target_lang``
    and then back to the detected language. This is a best-effort helper: if
    the language is not supported, or if any step of the translation service
    fails, the input is returned unchanged.

    Args:
        sequence: The text to paraphrase.
        target_lang: Language code of the intermediate (pivot) language,
            passed to the translator as ``dest``.

    Returns:
        The back-translated text, or ``sequence`` itself when no translation
        was performed.
    """
    # Only texts detected as one of these languages are round-tripped.
    languages = ('en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el')

    try:
        translator = Translator()

        # Remember the detected language so the text can be converted back.
        # Detection is itself a service call, so it lives inside the ``try``:
        # one failed detection must not abort a whole parallel batch.
        org_lang = translator.detect(sequence).lang

        # Detected language is outside the supported set: leave the text alone.
        if org_lang not in languages:
            return sequence

        # Forward translation into the pivot language ...
        translated = translator.translate(sequence, dest=target_lang).text
        # ... and back into the language the text was written in.
        return translator.translate(translated, dest=org_lang).text
    except Exception:
        # Best effort: any translator failure keeps the original text.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be stopped.
        return sequence

# Applies the back_translate function defined above to every text in parallel, using Dask
def back_translate_parallel(dataset, target_lang):
    """Back-translate every entry of ``dataset['text']`` through a Dask bag.

    Args:
        dataset: Frame providing a ``'text'`` column (the inputs) and a
            ``'class'`` column (carried over to the result).
        target_lang: Pivot language code forwarded to ``back_translate``.

    Returns:
        A new DataFrame with the back-translated strings under ``"text"`` and
        the ``'class'`` column of ``dataset`` under ``"class"``.
    """
    source_texts = dataset['text'].tolist()
    pending = bag.from_sequence(source_texts).map(back_translate, target_lang)

    # Show a progress bar while the bag is being evaluated.
    with diagnostics.ProgressBar():
        translated_texts = pending.compute()

    # Pair the translated texts with the labels they belong to.
    return pd.DataFrame({"text": translated_texts, "class": dataset['class']})

In [None]:
def data_augmentation(pcl_df_train_train):
    """Grow the training frame with augmented copies of its class-1 rows.

    Four stages run in order, and each appends its output (labelled class 1)
    to the frame with a fresh integer index:

    1. Back translation via French, applied in six consecutive batches of 100
       class-1 rows (row positions 0-599 of the class-1 subset).
    2. Synonym replacement on every class-1 row.
    3. Random word swap on the first 1000 class-1 rows.
    4. ``random_char_insertion`` on the first 1000 class-1 rows.

    The class-1 subset is re-selected from the already-extended frame before
    every batch and stage, so later steps also see rows added by earlier ones.

    Args:
        pcl_df_train_train: Frame with at least ``'text'`` and ``'class'``
            columns.

    Returns:
        The extended frame (original rows followed by all augmented rows).
    """

    def class_one_rows(frame):
        # Rows currently labelled as class 1.
        return frame[frame['class'] == 1]

    def append_as_class_one(frame, extra_rows):
        # Label the new rows and stack them under the existing ones.
        extra_rows['class'] = 1
        return pd.concat([frame, extra_rows], ignore_index=True)

    ## Back translation
    for start in range(0, 600, 100):
        batch = class_one_rows(pcl_df_train_train).iloc[start:start + 100].dropna()
        batch = back_translate_parallel(batch, 'fr')
        pcl_df_train_train = append_as_class_one(pcl_df_train_train, batch)

    ## Synonym replacement, random swap, random char insertion.
    ## Each entry is (augmentation function, row cap); None means "no cap".
    text_stages = (
        (synonym_replacement, None),
        (random_swap, 1000),
        (random_char_insertion, 1000),
    )
    for augment, row_cap in text_stages:
        selected = class_one_rows(pcl_df_train_train)
        if row_cap is not None:
            selected = selected[:row_cap]
        selected = selected.copy()
        selected['text'] = selected['text'].apply(augment)
        pcl_df_train_train = append_as_class_one(pcl_df_train_train, selected)

    return pcl_df_train_train