In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
import random
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet 
from random import sample
import string

from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm

In [2]:
# Load the IMDB reviews CSV from the working directory. Later cells read its
# 'review' and 'sentiment' columns.
data = pd.read_csv('./IMDB Dataset.csv')

In [3]:
# Settings kept under the RNN name (presumably consumed by training code that is
# not part of the cells visible here -- confirm before changing values).
rnn_hyper_param = dict(
    VOCAB_SIZE=50000,
    EPOCHS=150,
    BS=1024,
    LR=0.005,
    OOV_TOK="<OOV>",
    MAX_LENGTH=520,
    PADDING_TYPE="post",
    TRUNC_TYPE="post",
)

In [4]:
# Settings kept under the CNN name; identical to the RNN settings except for LR
# (presumably consumed by training code outside the visible cells -- confirm).
cnn_hyper_param = dict(
    VOCAB_SIZE=50000,
    EPOCHS=150,
    BS=1024,
    LR=0.001,
    OOV_TOK="<OOV>",
    MAX_LENGTH=520,
    PADDING_TYPE="post",
    TRUNC_TYPE="post",
)

In [5]:
# Hold out 20% of the reviews as the test set -- the same test-set size used so far --
# so we can measure how well the models generalize. The split is stratified on
# sentiment and uses a fixed random_state so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.2,
    random_state=0,
    stratify=data['sentiment'],
)

In [11]:
# Put the training reviews and their labels side by side in one frame, then preview it.
df = pd.DataFrame({'review': x_train, 'sentiment': y_train})
df.head()

Unnamed: 0,review,sentiment
38414,The notion of marital fidelity portrayed in th...,positive
24010,What a good film! Made Men is a great action m...,positive
29873,"Joe Don Baker. He was great in ""Walking Tall"" ...",negative
2868,Monarch Cove was one of the best Friday night'...,positive
15107,This film is so unbelievable; - the whole prem...,negative


# <center>Data Augmentation</center>

In [12]:
# Main method used to perform data augmentation. It calls on all methods below it
# to perform data augmentation on each sentence.

def data_augment(df, alpha):
    """Return an augmented copy of every review in ``df``.

    Each review is split into sentences on runs of periods, every sentence is
    passed through random_swap, random_insertion, synonym_replacement and
    random_deletion (in that order), and the sentences are re-joined with '. '.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'review' column (strings) and a 'sentiment' column.
    alpha : float
        Strength of the augmentation; forwarded unchanged to each helper.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ['review', 'sentiment'], one row per input
        row, in the same order, with a fresh 0..n-1 index. Sentiment values
        are carried over as-is.
    """
    augmented_rows = []
    for review_text, sentiment in zip(df['review'], df['sentiment']):
        # Swap <br /> for a space (not the empty string) so the words on either
        # side of the tag do not get fused together, then split into sentences.
        sentences = re.split(r"\.+", review_text.replace('<br />', ' '))

        # Perform data augmentation on each sentence.
        augmented_sentences = []
        for sentence in sentences:
            sentence = random_swap(sentence, alpha)
            sentence = random_insertion(sentence, alpha)
            sentence = synonym_replacement(sentence, alpha)
            sentence = random_deletion(sentence, alpha)
            augmented_sentences.append(sentence)

        # Rejoin the split sentences into a single paragraph again.
        augmented_rows.append(['. '.join(augmented_sentences), sentiment])

    # Build the frame straight from the Python lists. Going through np.array
    # would create a fixed-width unicode array (every cell padded to the longest
    # review), turn the labels into strings, and fail on an empty input.
    return pd.DataFrame(augmented_rows, columns=['review', 'sentiment'])


In [13]:
# Randomly swap two words in a sentence. This is done int(alpha * number of words) times.

def random_swap(sentence, alpha):
    """Swap random pairs of words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to augment; it is tokenised on whitespace.
    alpha : float
        Fraction of the word count that determines how many swaps are made:
        int(alpha * number_of_words).

    Returns
    -------
    str
        The words re-joined with single spaces. Sentences with fewer than two
        words are returned unchanged (apart from whitespace normalisation).
    """
    words = sentence.split()

    # A swap needs two distinct positions; nothing to do for 0 or 1 words.
    if len(words) < 2:
        return ' '.join(words)

    for _ in range(int(alpha * len(words))):
        # Sample over every index, including the last word.
        first, second = random.sample(range(len(words)), 2)
        words[first], words[second] = words[second], words[first]

    return ' '.join(words)

In [14]:
# Randomly deletes int(alpha * number of words) words from a sentence.

def random_deletion(sentence, alpha):
    """Delete randomly chosen words from ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to augment; it is tokenised on whitespace.
    alpha : float
        Fraction of the word count to delete: int(alpha * number_of_words),
        clamped to the range [0, number_of_words].

    Returns
    -------
    str
        The surviving words, in their original order, joined by single spaces.
    """
    words = sentence.split()

    # Clamp so an alpha >= 1 removes everything instead of asking random.sample
    # for more indices than exist.
    n_delete = max(0, min(int(alpha * len(words)), len(words)))

    # Sample over every index, including the last word.
    doomed = set(random.sample(range(len(words)), n_delete))

    return ' '.join(word for position, word in enumerate(words) if position not in doomed)

In [15]:
# Randomly pick int(alpha * number of words) non-stop words in a sentence (or all of them
# if there are fewer) and replace each with a random synonym of itself.

def synonym_replacement(sentence, alpha):
    """Replace randomly chosen non-stop words with one of their synonyms.

    Parameters
    ----------
    sentence : str
        Text to augment; it is tokenised on whitespace.
    alpha : float
        Fraction of the word count that determines how many words are picked
        for replacement: int(alpha * number_of_words), capped at the number of
        non-stop words.

    Returns
    -------
    str
        The words joined by single spaces, with the punctuation listed below
        stripped. Words for which get_synonym finds nothing are left as they are.
    """
    # Remove punctuations except for inverted commas and hyphens as these can have meanings.
    # This has to be a collection of single characters: testing membership in a
    # one-element list holding the whole string never matches anything.
    punctuation = frozenset(r',!#$%&\()*+./:;<=>?@[]^_`{|}~')
    words = [''.join(c for c in token if c not in punctuation) for token in sentence.split()]
    # Tokens that were nothing but punctuation are now empty; drop them.
    words = [word for word in words if word]

    stop_words = set(stopwords.words('english'))

    # Find indices of words that are not stop words
    candidates = [i for i, word in enumerate(words) if word.lower() not in stop_words]

    # If we need to replace more words than there are non-stop words, then just
    # replace all non-stop words
    n_replace = min(int(alpha * len(words)), len(candidates))

    for idx in random.sample(candidates, n_replace):
        synonym_list = get_synonym(words[idx])
        if not synonym_list:
            continue
        # Replace word with a random synonym from the list of synonyms
        words[idx] = random.choice(synonym_list)

    return ' '.join(words)

In [16]:
# Randomly inserts synonyms of non-stop words into a sentence. The number of words whose
# synonym is inserted is int(alpha * number of words), capped at the number of non-stop words.

def random_insertion(sentence, alpha):
    """Insert synonyms of randomly chosen non-stop words at random positions.

    Parameters
    ----------
    sentence : str
        Text to augment; it is tokenised on whitespace.
    alpha : float
        Fraction of the word count that determines how many source words are
        picked: int(alpha * number_of_words), capped at the number of non-stop
        words.

    Returns
    -------
    str
        The words joined by single spaces, with the punctuation listed below
        stripped and one synonym inserted for each picked word that has any.
    """
    # Remove punctuations except for inverted commas and hyphens as these can have meanings.
    # This has to be a collection of single characters: testing membership in a
    # one-element list holding the whole string never matches anything.
    punctuation = frozenset(r',!#$%&\()*+./:;<=>?@[]^_`{|}~')
    words = [''.join(c for c in token if c not in punctuation) for token in sentence.split()]
    # Tokens that were nothing but punctuation are now empty; drop them.
    words = [word for word in words if word]

    stop_words = set(stopwords.words('english'))

    # Find indices of words that are not stop words
    candidates = [i for i, word in enumerate(words) if word.lower() not in stop_words]

    # If we need more source words than there are non-stop words, then just use
    # all non-stop words
    n_insert = min(int(alpha * len(words)), len(candidates))

    # Resolve the picked indices to words up front: every insert shifts the
    # positions after it, so looking words up by index inside the loop could
    # land on a different word than the one that was sampled.
    source_words = [words[idx] for idx in random.sample(candidates, n_insert)]

    for source in source_words:
        synonym_list = get_synonym(source)
        # If no synonyms is found, continue
        if not synonym_list:
            continue
        # randint is inclusive on both ends; len(words) allows appending at the end.
        words.insert(random.randint(0, len(words)), random.choice(synonym_list))

    return ' '.join(words)

In [17]:
# Collects the WordNet synonyms of a word and returns them as a list.

def get_synonym(word):
    """Return the distinct synonyms of ``word`` found in WordNet.

    Every lemma name of every synset of the lower-cased word is collected, with
    underscores and hyphens turned into spaces and the result lower-cased. The
    word itself is excluded. The list is empty when nothing is found; its order
    is whatever the underlying set yields.
    """
    target = word.lower()

    found = {
        lemma.name().replace("_", " ").replace("-", " ").lower()
        for synset in wordnet.synsets(target)
        for lemma in synset.lemmas()
    }

    # A word is not a useful synonym of itself.
    found.discard(target)

    return list(found)

In [18]:
# Perform data augmentation on the full training dataset.

# Rebuild the un-augmented frame from the split instead of reading `df` back,
# so re-running this cell never augments data that was already augmented.
train_df = pd.DataFrame({'review': x_train, 'sentiment': y_train})

# Four independent augmentation passes over the original reviews.
aug_df_list = []
for _ in tqdm(range(4)):
    aug_df = data_augment(train_df, 0.1)
    aug_df_list.append(aug_df)

# Original rows first, then each augmented copy, concatenated in one go.
df = pd.concat([train_df, *aug_df_list], ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [39:09<00:00, 587.46s/it]


In [23]:
# Save the augmented dataset into a csv file. The data augmentation code for the other
# data subsets was run on the GPU cluster and has since been deleted there; it follows
# the same logic as the cells in this notebook.

hist_csv_file = 'data_aug_1.0.csv'
# Hand pandas the path so it opens and closes the file itself. index=False keeps
# this file in the same layout as the other data_aug_*.csv files, which are
# written without an index column.
df.to_csv(hist_csv_file, encoding='UTF-8', index=False)

In [None]:
# Perform data augmentation on all other subsets of the dataset.
# NOTE(review): `data_dict` is not defined in any cell visible here; it is read as a
# mapping from a size label to a DataFrame with 'review'/'sentiment' columns -- confirm.

for size, dataset in data_dict.items():
    print(f"Data Augmenting on dataset of size: {size}")
    print("Current size of dataset:", len(dataset))

    # Smaller subsets get more augmentation passes at a lower alpha.
    if len(dataset) <= 500:
        alpha, iterate = 0.05, 16
    elif len(dataset) <= 2000:
        alpha, iterate = 0.05, 8
    else:
        alpha, iterate = 0.1, 4

    # Every pass augments the original subset, never an already-augmented one.
    _dataset_list = [data_augment(dataset, alpha) for _ in tqdm(range(iterate))]

    # Original rows first, then each augmented copy, concatenated in one go.
    dataset = pd.concat([dataset, *_dataset_list], ignore_index=True)

    # Rebinding an existing key does not resize the dict, so this is safe mid-iteration.
    data_dict[size] = dataset
    print("Augmented Size of Dataset:", len(data_dict[size]))
    print()

In [None]:
# Write each augmented subset to its own csv file, named after its key in data_dict.

for key in data_dict:
    data_dict[key].to_csv(f'data_aug_{key}.csv', encoding='utf-8', index=False)