# Textual Data Amplification (TDA) Experiments - Prototype 7

## Paraphrase Generation - Noise Injection

## Movie Reviews Data

## Sentiment polarity prediction task

### Python Basic Packages

In [92]:
import os
import re
import sys
import numpy as np
import pandas as pd
import matplotlib
import nltk
import IPython
import sklearn

# Versions of the packages, with a sanity check on the minimum Python / NumPy versions.
# Versions are compared as integer tuples: adding minor/10 to the major number
# mis-orders versions whose minor part has two digits (e.g. "0.16" becomes 1.6).
def _major_minor(version_string):
    """Return the (major, minor) part of a dotted version string as a tuple of ints.

    Only the leading digits of each component are used, so suffixes such as
    'rc1' do not break the parsing.
    """
    major, minor = version_string.split('.')[:2]
    return (int(re.match(r'\d+', major).group()), int(re.match(r'\d+', minor).group()))

print('Python: ', sys.version, sys.version_info[:2] >= (2, 6))
print('NumPy: ', np.__version__, _major_minor(np.__version__) >= (1, 6))
print('Pandas: ', pd.__version__)
print('Matplotlib: ', matplotlib.__version__)
print('NLTK: ', nltk.__version__)
print('IPython: ', IPython.__version__)
print('skikit-learn', sklearn.__version__)

Python:  3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 03:03:55) 
[GCC 4.2.1 (Apple Inc. build 5666) (dot 3)] True
NumPy:  1.16.2 True
Pandas:  0.24.2
Matplotlib:  3.0.3
NLTK:  3.4
IPython:  7.4.0
skikit-learn 0.20.3


### Data

#### Movie Review Data
http://www.cs.cornell.edu/people/pabo/movie-review-data/

#### Download link
http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

#### Data files structure

Create a directory for the data (DATA), then create sub-directories inside it for the negative reviews (neg) and the positive reviews (pos).

You should also create two sub-directories inside DATA for the augmented data (noiseinjection_aug_neg and noiseinjection_aug_pos).

## Text Data Augmentation Object
### Generic Class Definition

In [93]:
import re
from nltk import wordpunct_tokenize
from nltk import sent_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from os import listdir
import random
import timeit
import numpy as np
from scipy.stats import norm

class TextDataAugmentation(object):
    """Generic text data augmentation (TDA) engine.

    A document is split into sentences; every sentence gets a dictionary of
    paraphrases (the sentence itself plus whatever ``generate_paraphrases``
    returns). New documents ("paratexts") are produced by sampling one
    paraphrase per sentence. Sub classes provide the actual paraphrasing by
    overriding ``generate_paraphrases``.

    Directory arguments are concatenated as plain strings
    (``data_dir + neg_rev + filename``), so each must end with a path
    separator.
    """

    # Constructor
    def __init__(self, data_dir,
                 neg_rev, pos_rev, aug_neg_rev, aug_pos_rev,
                 paraphr_nbr_max=5, paratext_nbr_max=5
                ):
        """Store the data locations and the sampling limits.

        Args:
            data_dir: root data directory (with trailing separator).
            neg_rev, pos_rev: sub directories of the original reviews.
            aug_neg_rev, aug_pos_rev: sub directories receiving the augmented reviews.
            paraphr_nbr_max: value stored in PARAPHRASES_NUMBER_MAX.
            paratext_nbr_max: upper bound on paratexts generated per document.
        """
        self.DATA_DIR = data_dir
        self.NEG_REVIEWS = neg_rev
        self.POS_REVIEWS = pos_rev
        self.AUGMENTED_NEG_REVIEWS = aug_neg_rev
        self.AUGMENTED_POS_REVIEWS = aug_pos_rev
        self.name = "TextDataAugmentation"
        self.encoding = 'utf-8'
        self.SENTENCE_LENGTH_MAX = 1000
        self.PARAPHRASES_NUMBER_MAX = paraphr_nbr_max
        self.PARATEXTS_NUMBER_MAX = paratext_nbr_max
        # Sampling stops once more than this fraction of all combinations has been drawn
        self.PARATEXT_ITERATION_LIMIT = 0.50
        self.TRACE_RANDOM_KEY = False
        self.TRACE_COMBINATIONS = False

    # Returns the name stored by the constructor
    def getName(self):
        return self.name

    # To check if this object is TextDataAugmentation class
    def isTextDataAugmentation(self):
        return True

    # load doc into memory
    def load_doc(self, file_path_with_name):
        """Return the whole content of the file, decoded with ``self.encoding``."""
        with open(file_path_with_name, 'r', encoding=self.encoding) as input_file:
            return input_file.read()

    # save doc to file
    def save_doc(self, file_name_with_path, text):
        """Write ``text`` to the file (overwriting it), encoded with ``self.encoding``."""
        with open(file_name_with_path, 'w', encoding=self.encoding) as output_file:
            output_file.write(text)

    def tokenizeTextString(self, text_string):
        """Strip a few punctuation characters, then tokenize with ``wordpunct_tokenize``."""
        cleaned = (text_string.replace('_', ' ').replace('(', ' ').replace(')', ' ')
                   .replace("'", ' ').replace('"', '').replace('/', ' ')
                   .replace("\\", " ").replace('  ', ' '))
        return wordpunct_tokenize(cleaned)

    def standardize_text(self, raw_input_text):
        """Lower-case the text, re-attach "n't" / "'s", and return the tokens joined by spaces."""
        lowered = raw_input_text.lower().replace(" n't", "n't").replace(" 's", "'s")
        return ' '.join(self.tokenizeTextString(lowered))

    def sentence_length(self, sentence):
        """Number of pieces obtained by splitting the sentence on single spaces."""
        return len(sentence.split(" "))

    def get_heuristic_weigth(self, original_sentence, new_paraphrase):
        """Weight of a paraphrase; this implementation gives every paraphrase weight 1."""
        return 1

    def add_new_paraphrase(self, paraphrases, original_sentence, new_paraphrase):
        """Return ``{new_paraphrase: weight}`` if it is not yet a key of ``paraphrases``, else ``{}``."""
        if new_paraphrase in paraphrases:
            return dict()
        return {new_paraphrase: self.get_heuristic_weigth(original_sentence, new_paraphrase)}

    def splitted(self, document):
        """True when the document contains at least one newline."""
        return '\n' in document

    def sentences_splitting(self, document):
        """Split on newlines when the document has any, otherwise use ``sent_tokenize``."""
        if self.splitted(document):
            # returns a list of sentences already splitted
            # and marked by a \n tag
            return document.split('\n')
        return sent_tokenize(document)

    def evaluate_combinations_number(self, paratext_dict):
        """Product of the number of paraphrases available for each sentence."""
        combinations_nbr = 1
        for paraphrases in paratext_dict.values():
            combinations_nbr *= len(paraphrases)
        return combinations_nbr

    # Normalizing the replacement weights list in order to get probabilities
    def normalize_weights_list(self, weights_list):
        """Divide every weight by the sum of the weights (by 1 when the sum is 0)."""
        weights_sum = sum(weights_list)
        # Avoid division by 0
        if weights_sum == 0:
            weights_sum = 1
        return [a_weight/weights_sum for a_weight in weights_list]

    @staticmethod
    def get_an_integer_normaly_draw_from_n_range(n):
        """Draw an integer in [1, n] from a normal law (mean n/2, std n/5); 0 when n <= 0.

        Declared as a static method so that it can be called both through an
        instance (as ``get_variants_using_random_distribution`` does) and
        through the class.
        """
        if n <= 0:
            return 0
        if n == 1:
            # 1 is the only admissible value: skip the rejection loop
            return 1
        mean = n/2.0
        std = n/5.0
        while True:
            draw_number = list(norm.ppf(np.random.random(1), loc=mean, scale=std).astype(int))[0]
            # Reject draws outside [1, n] (both bounds are checked)
            if 1 <= draw_number <= n:
                return draw_number

    # draw a random number of replacements among all possible replacements
    # distribution: 'left_skewed', 'right_skewed', 'uniform'
    def get_variants_using_random_distribution(self, paratext_dict, distribution='uniform'):
        """Pick, without replacement, which sentences will be replaced by a variant.

        Only sentences owning more than one paraphrase are candidates.
        Returns the list of chosen 1-based sentence indexes (empty when no
        sentence has a variant).
        """
        variants_index = [n for n in range(1, len(paratext_dict)+1) if len(paratext_dict[str(n)]) > 1]
        if len(variants_index) == 0:
            return []
        variants_quantity = self.get_an_integer_normaly_draw_from_n_range(len(variants_index))
        if distribution == 'left_skewed':
            distribution_weights = [2*n for n in range(1, len(variants_index)+1)]
        elif distribution == 'right_skewed':
            distribution_weights = [2/n for n in range(1, len(variants_index)+1)]
        else: # uniform
            distribution_weights = [1 for n in range(1, len(variants_index)+1)]
        normalized_distribution_weights = self.normalize_weights_list(distribution_weights)
        if self.TRACE_RANDOM_KEY:
            print("len(variants_index):",len(variants_index),"variants_quantity:",variants_quantity)
        # Defensive clamp: np.random.choice(replace=False) cannot draw more than the population
        if variants_quantity > len(variants_index):
            variants_quantity = len(variants_index)
            if self.TRACE_RANDOM_KEY:
                print("New variants_quantity:",variants_quantity)
        randomized_variants = np.random.choice(variants_index,
                                               size=variants_quantity,
                                               replace=False,
                                               p=normalized_distribution_weights)
        if self.TRACE_RANDOM_KEY:
            print("*** variants_quantity: ", variants_quantity)
            print("*** randomized_variants: ",randomized_variants)
            print("distribution_weights: ",distribution_weights)
            print("normalized_distribution_weights: ",normalized_distribution_weights)
            print("variants_quantity:",variants_quantity,"len(paratext_dict):",len(paratext_dict),"len(variants_index)",len(variants_index))
        return list(randomized_variants)

    # Choose randomly one variant for a part among all possible variants based on heuristic weights
    def get_one_variant_using_heuristic_weights(self, paraphrase_dict, part_index):
        """Return the position (0-based) of one paraphrase of sentence ``part_index``."""
        # Choose the replacement among all the possible replacements based on heuristic weights
        variants = list(range(0, len(paraphrase_dict[str(part_index)])))
        variants_heuristic_weights = list(paraphrase_dict[str(part_index)].values())
        if len(variants_heuristic_weights) > 1:
            return random.choices(population=variants,
                                  weights=variants_heuristic_weights, k=1)[0]
        return variants[0]

    def generate_sampling_random_key(self, paratext_dict, distribution='uniform'):
        """Build a key "1-v1_2-v2_..." giving, for each sentence, the chosen paraphrase position.

        Sentences not selected for replacement get position 0.
        """
        # Choose randomly the variants based on a distribution
        variants_indexes = self.get_variants_using_random_distribution(paratext_dict, distribution=distribution)
        if self.TRACE_RANDOM_KEY:
            print("variants_indexes: ",variants_indexes)
        key_parts = []
        for part_index in range(1, len(paratext_dict)+1):
            if part_index in variants_indexes:
                if self.TRACE_RANDOM_KEY:
                    print("part_index found: ",part_index)
                # Choose randomly one variant for a precise part among all possible variants based on heuristic weights
                a_variant = self.get_one_variant_using_heuristic_weights(paratext_dict, part_index)
                if self.TRACE_RANDOM_KEY:
                    print("a_variant: ",a_variant)
                key_parts.append(str(part_index) + "-" + str(a_variant))
            else:
                key_parts.append(str(part_index) + "-0")
        return "_".join(key_parts)

    def generate_new_unique_key(self, paratext_dict, keys_memory, distribution="uniform"):
        """Draw keys until one is absent from ``keys_memory``; record and return it.

        Returns the tuple ``(keys_memory, random_key)``; ``keys_memory`` is
        updated in place.
        """
        random_key = self.generate_sampling_random_key(paratext_dict, distribution)
        while random_key in keys_memory:
            random_key = self.generate_sampling_random_key(paratext_dict, distribution)
        keys_memory.add(random_key)
        return (keys_memory, random_key)

    def generate_one_random_paratext(self, paratext_dict, random_key):
        """Translate a sampling key into the list of paraphrases it designates."""
        paratext = []
        for index, key in enumerate(random_key.split("_")):
            sub_keys = key.split("-")
            paraphrase = list(paratext_dict[str(index+1)].keys())[int(sub_keys[1])]
            paratext.append(paraphrase)
        return paratext

    def generate_paraphrases(self, new_sentence):
        """Hook overridden by sub classes: return a dict ``{paraphrase: weight}``.

        The generic class produces no paraphrase, hence the empty dict (the
        caller merges the result with ``dict.update``).
        """
        return {}

    def create_paratext_dict(self, sentences_list):
        """Map each 1-based sentence index (as a string) to its paraphrases dictionary.

        Each sentence is lower-cased and reduced to its words and punctuation
        marks; the cleaned sentence is always the first key, with weight 1.
        """
        paratext_dict = {}
        for sentence_index, sentence in enumerate(sentences_list):
            new_sentence = ' '.join(re.findall(r"[a-zA-Z'-]+|[.,;!?\'\’]+", sentence.lower()))
            paraphrases_dict = {new_sentence:1}
            if len(new_sentence) > 0:
                paraphrases_dict.update(self.generate_paraphrases(new_sentence))
            paratext_dict[str(sentence_index+1)] = paraphrases_dict
        return paratext_dict

    def generate_sampling(self, original_document, augmentation_factor):
        """Generate up to ``augmentation_factor`` distinct paratexts for a document.

        The requested number is capped by the number of possible combinations
        and by ``PARATEXTS_NUMBER_MAX``. Returns a list of paratexts, each one
        being a list of sentences.
        """
        splitted_document = self.sentences_splitting(original_document)
        paratext_dict = self.create_paratext_dict(splitted_document)
        print("Paratext Dict created for the document")
        combinations_number_max = self.evaluate_combinations_number(paratext_dict)
        if augmentation_factor > combinations_number_max:
            print("*** WARNING! Desired paratexts_sample number: " + str(augmentation_factor) + " exceeds Combinations Number Max, which is: ", combinations_number_max)
            augmentation_factor = combinations_number_max
        if augmentation_factor > self.PARATEXTS_NUMBER_MAX:
            print("*** WARNING! Desired paratexts_sample number: " + str(augmentation_factor) + " exceeds PARATEXTS_NUMBER_MAX, which is: ", self.PARATEXTS_NUMBER_MAX)
            augmentation_factor = self.PARATEXTS_NUMBER_MAX
        paratexts_sample = []
        keys_memory = set()
        while len(paratexts_sample) < augmentation_factor:
            # Stop before the unique-key search becomes too slow
            if len(keys_memory) > combinations_number_max * self.PARATEXT_ITERATION_LIMIT:
                break
            keys_memory, random_key = self.generate_new_unique_key(paratext_dict, keys_memory, distribution='uniform')
            paratexts_sample.append(self.generate_one_random_paratext(paratext_dict, random_key))
        print("Text Data Augmentation done for the document")
        return paratexts_sample

    @staticmethod
    def get_datafile_id(filename):
        """Last three characters of the second-to-last "_"-separated part of the file name."""
        return filename.split("_")[-2][-3:]

    def augment_text_data(self, augmentation_factor=5):
        """Augment every training review and save the paratexts in the augmented directories.

        For each ``.txt`` file of the negative and positive review directories
        that does not start with 'cv9', the generated paratexts are written as
        ``ampl_<original name>_<index>.txt``.
        """
        is_train = True
        # WARNING - many lines of code below are data source specific !
        for sub_directory in [self.NEG_REVIEWS, self.POS_REVIEWS]:
            # Use the directory given to the constructor, not a notebook global
            data_directory = self.DATA_DIR + sub_directory
            if sub_directory == self.NEG_REVIEWS:
                augmented_directory = self.DATA_DIR + self.AUGMENTED_NEG_REVIEWS
            else:
                augmented_directory = self.DATA_DIR + self.AUGMENTED_POS_REVIEWS
            for filename in listdir(data_directory):
                start_time = timeit.default_timer()
                # skip files that do not have the right extension
                # WARNING - data source specific
                if not filename.endswith(".txt"):
                    continue
                # Processing training set, so skip any reviews in the test set
                # which start with 'cv9' - WARNING - data source specific
                if is_train and filename.startswith('cv9'):
                    continue
                # Processing test set, so skip any reviews in the training set
                # which not start with 'cv9' - WARNING - data source specific
                if not is_train and not filename.startswith('cv9'):
                    continue
                # create the full path of the file to open
                file_path = data_directory + filename
                # load the original document
                original_text = self.load_doc(file_path)
                print("Augmenting...", file_path)
                paratexts = self.generate_sampling(original_text, augmentation_factor=augmentation_factor)
                for index, paraphrases in enumerate(paratexts):
                    new_text = " \n".join(paraphrases)
                    new_filename = "ampl_" + filename.split(".")[0] + "_" + str(index) + '.txt'
                    new_file_path = augmented_directory + new_filename
                    self.save_doc(new_file_path, new_text)
                    if (index % 10) == 0:
                        print("Saved:",new_file_path)
                        end_time = timeit.default_timer()
                        print("Elapsed time: {}".format(end_time - start_time))
                        print("-----")

## Noise Injection TDA Object
### Sub Class Definition

In [94]:
import re
import numpy as np
import random
import string

# Inherited or Sub class (TextDataAugmentation) 
# Inherited or Sub class (TextDataAugmentation)
class NoiseInjection(TextDataAugmentation):
    """Text data augmentation by character-level noise injection.

    Paraphrases of a sentence are produced by randomly corrupting some of its
    tokens longer than 3 characters: one character deletion, insertion,
    replacement, or (when an errors table is selected) a substitution taken
    from that table.
    """
    # Based on work done in collaboration with Simon ROQUETTE at Polytechnique Montreal
    # https://github.com/simonroquette/CORAP

    # Constructor
    def __init__(self,data_dir, neg_rev, pos_rev,aug_neg_rev, aug_pos_rev,
                 paraphr_nbr_max=5, paratext_nbr_max=5,
                 noise_percent=0.1,error_table="no_errors_table"):
        """Set the noise parameters, build the errors dictionary, then init the base class.

        Args:
            data_dir, neg_rev, pos_rev, aug_neg_rev, aug_pos_rev,
            paraphr_nbr_max, paratext_nbr_max: forwarded to TextDataAugmentation.
            noise_percent: probability that a token longer than 3 characters is noised.
            error_table: "keyboard", "ocr" or "no_errors_table".

        Raises:
            ValueError: if ``error_table`` is not one of the three supported names.
        """
        self.NOISE_PERCENT_PER_LONG_WORD = noise_percent
        self.MAX_ITERATIONS = 1000
        self.MAX_WORD_LENGTH = 50
        self.TRACE = False
        #TODO Should space be part of our alphabet ??? probably if want deal spaced out words
        # The backslash is escaped explicitly ("\@" is an invalid escape sequence); same characters as before
        self.ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,:;'*!?`$%&(){}[]-/\\@_#"
        # ERRORS_TABLE and ERRORS_DICT should be build at the beginning, only one time !
        # Don't put the same char in two equivalence tables. It will be overwritten.
        # Indeed, equivalence is a transitive property, if A looks like B and B looks like C, A looks like C
        # And therefore they should be in the same table
        # TYPICAL OCR (Optical Character Recognition) ERRORS
        self.OCR_ERRORS_TABLE = [['i', 'j', 'l'],
                                 ['n', 'r'],
                                 ['m', 'nn', 'rn', 'nr'],
                                 ['mm', 'nnm', 'mnn', 'nnn', 'nnnn', 'rnm', 'nrm', 'mrn'],
                                 ['u', 'v', ],
                                 ['w', 'vv'],
                                 ['o', 'a'],
                                 #['s', 'z'],  # s and z are more for handwriting
                                 #['x', 'ae', 'oe', 'oc'],  # Maybe too much
                                 ['g', 'q'],
                                 #['k', 'le', 'lR'], # Maybe too much
                                 ['c', 'e'],
                                 ['h', 'b']
                                 #['d', 't']  # Maybe too much
                                ]
        # QWERTY KEYBOARD ERRORS (based on key physical proximity)
        # Each row is: a key, followed by the keys considered close to it.
        self.KEYBOARD_ERRORS_TABLE = [
            ['q','1','2','3','w','s','a'],
            ['w','2','3','4','e','d','s','a','q'],
            ['e','3','4','5','r','f','d','s','w'],
            ['r','4','5','6','t','g','f','d','e'],
            ['t','5','6','7','y','h','g','f','r'],
            ['y','6','7','8','u','j','h','g','t'],
            ['u','7','8','9','i','k','j','h','y'],
            ['i','8','9','0','o','l','k','j','u'],
            ['o','9','0','-','p',';','l','k','i'],
            ['p','0','-','=','[',"'",';','l','o'],
            ['a','q','w','s','x','z','\\'],
            ['s','q','w','e','d','c','x','z','a'],
            ['d','w','e','r','f','v','c','x','s'],
            ['f','e','r','t','g','b','v','c','d'],
            ['g','r','t','y','h','n','b','v','f'],
            ['h','t','y','u','j','m','n','b','g'],
            ['j','y','u','i','k',',','m','n','h'],
            ['k','u','i','o','l','.',',','m','j'],
            ['l','i','o','p',';','/','.',',','k'],
            ['z','a','s','x',' ','\\'],
            ['x','a','s','d','c',' ','z'],
            ['c','s','d','f','v',' ','x'],
            ['v','d','f','g','b',' ','c'],
            ['b','f','g','h','n',' ','v'],
            ['n','g','h','j','m',' ','b'],
            ['m','h','j','k',',',' ','n'],
        ]
        # Above and below someone could add new specific ERRORS_TABLE if needed
        if error_table == "keyboard":
            self.ERRORS_TABLE = self.KEYBOARD_ERRORS_TABLE
        elif error_table == "ocr":
            self.ERRORS_TABLE = self.OCR_ERRORS_TABLE
        elif error_table == "no_errors_table":
            self.ERRORS_TABLE = []
        else:
            # Fail early with a clear message instead of a later AttributeError
            raise ValueError("Unknown error_table: " + repr(error_table)
                             + " (expected 'keyboard', 'ocr' or 'no_errors_table')")

        # Build dictionnary once, because it is faster when looking for equivalences
        ERRORS_DICT = dict()
        if error_table == "keyboard":
            # Keyboard rows are neighbour lists, not equivalence classes: the same
            # character appears in many rows, so only the first entry is used as key.
            for row in self.ERRORS_TABLE:
                ERRORS_DICT[row[0]] = list(row[1:])
        else:
            # Equivalence classes: every short entry maps to the other entries of its row
            for row in self.ERRORS_TABLE:
                for position, entry in enumerate(row):
                    if len(entry) <= 2:  # We don't care entries that are bigger than 2
                        ERRORS_DICT[entry] = row[:position] + row[position + 1:]
        # MANUALLY ADD SOME ERRORS HERE THAT ARE ONLY ONE WAY: ERRORS_DICT[to_add] = [add1, add2]
        self.ERRORS_DICT = ERRORS_DICT
        super(NoiseInjection, self).__init__(data_dir,
                                             neg_rev, pos_rev,
                                             aug_neg_rev, aug_pos_rev,
                                             paraphr_nbr_max, paratext_nbr_max
                                            )

    # To check if this object is NoiseInjection class
    def isNoiseInjection(self):
        return True

    def hasnum(self,word):
        """True when at least one character of ``word`` is a digit."""
        return any(c_i.isdigit() for c_i in word)

    def _apply_errors_table(self, w):
        """Replace one character (or character pair) of ``w`` using ERRORS_DICT.

        Positions are tried in random order; at each position the pair starting
        there is looked up first, then the pair ending there, then the single
        character. ``w`` is returned unchanged when no position matches.
        """
        choices = list(range(len(w)))
        while len(choices) > 0:
            idx = random.randint(0, len(choices) - 1)
            i = choices[idx]
            # None when the pair would fall outside the word
            next_pair = w[i:i + 2] if i + 1 < len(w) else None
            previous_pair = w[i - 1:i + 1] if i > 0 else None
            if next_pair in self.ERRORS_DICT:
                return w[:i] + random.choice(self.ERRORS_DICT[next_pair]) + w[i + 2:]
            if previous_pair in self.ERRORS_DICT:
                return w[:i - 1] + random.choice(self.ERRORS_DICT[previous_pair]) + w[i + 1:]
            if w[i] in self.ERRORS_DICT:
                return w[:i] + random.choice(self.ERRORS_DICT[w[i]]) + w[i + 1:]
            # Nothing applicable at this position: do not try it again
            del choices[idx]
        return w

    def noise_char(self,w):
        """Return ``w`` with at most one random error injected.

        Tokens containing a digit, and non-alphabetic tokens shorter than 3
        characters (punctuation such as "!" or "),"), are returned unchanged.
        """
        # only one error per word
        # in order to make ERRORS_TABLE more probable
        # noise_op = random.choice(['DELETE', 'INSERT', 'REPLACE', 'ERRORS_TABLE', 'ERRORS_TABLE', 'ERRORS_TABLE'])
        if self.ERRORS_TABLE == []:
            # ONLY DELETE, INSERT or REPLACE, NO ERRORS TABLE
            noise_op = random.choice(['DELETE', 'INSERT', 'REPLACE'])
        else:
            noise_op = random.choice(['DELETE', 'INSERT', 'REPLACE', 'ERRORS_TABLE'])

        # Events with no noise here : - numbers
        # - ponctuation like "!" "??" or "),"
        if self.hasnum(w) or ((not w.isalpha()) and len(w) < 3):
            return w

        if noise_op == "DELETE":
            if len(w) > 1: # Words of length 1 don't overgo deletion...
                idx = random.randint(0, len(w) - 1)
                w = w[:idx] + w[idx + 1:]
        elif noise_op == "INSERT":
            ins_idx = random.randint(0, len(w) - 1)
            ins_char = string.ascii_lowercase[np.random.randint(0, len(string.ascii_lowercase))]
            w = w[:ins_idx] + ins_char + w[ins_idx:]
        elif noise_op == "REPLACE":
            target_idx = random.randint(0, len(w) - 1)
            rep_char = string.ascii_lowercase[np.random.randint(0, len(string.ascii_lowercase))]
            w = w[:target_idx] + rep_char + w[target_idx + 1:]
        elif noise_op == "ERRORS_TABLE":
            # Choose randomly the character to replace
            # if fusion of two character is more probable should be modified...
            w = self._apply_errors_table(w)
        return w

    def inject_noise(self, source_sentence):
        """Tokenize the sentence and noise each token longer than 3 characters
        with probability NOISE_PERCENT_PER_LONG_WORD; return the tokens joined by spaces."""
        replacements = []
        for token in self.tokenizeTextString(source_sentence):
            if len(token) > 3 and random.uniform(0, 1) <= self.NOISE_PERCENT_PER_LONG_WORD:
                token = self.noise_char(token)
            replacements.append(token)
        return " ".join(replacements)

    def generate_paraphrases(self, source_sentence):
        """Return a dict ``{noisy sentence: weight}`` of distinct noisy versions.

        Generation stops once PARAPHRASES_NUMBER_MAX distinct sentences are
        collected or after MAX_ITERATIONS attempts. A draw that injects no
        noise yields the source sentence itself, which then counts as one entry.
        """
        paraphrases = {}
        iterations = 0
        while (len(paraphrases) < self.PARAPHRASES_NUMBER_MAX) and (iterations < self.MAX_ITERATIONS):
            noisy_sentence = self.inject_noise(source_sentence)
            paraphrases.update(self.add_new_paraphrase(paraphrases, source_sentence, noisy_sentence))
            iterations += 1
        return paraphrases

### Local data directories setup

In [95]:
# Root data directory; the classes build paths by string concatenation,
# so every directory name keeps its trailing "/".
data_dir = "DATA/"
# Original reviews
neg_rev, pos_rev = "neg/", "pos/"
# Destination of the noise-injected (augmented) reviews
aug_neg_rev, aug_pos_rev = "noiseinjection_aug_neg/", "noiseinjection_aug_pos/"

### Experiment - Creation of TDA Noise Injection object 

In [144]:
# Experiment settings: sampling limits, per-token noise probability, errors table name
paraphr_nbr_max, paratext_nbr_max = 5, 5
noise_percent = 0.1
errors_table = 'no_errors_table'
my_tda = NoiseInjection(
    data_dir, neg_rev, pos_rev, aug_neg_rev, aug_pos_rev,
    paraphr_nbr_max=paraphr_nbr_max,
    paratext_nbr_max=paratext_nbr_max,
    noise_percent=noise_percent,
    error_table=errors_table,
)
my_tda.isNoiseInjection()

True

In [145]:
# Method inherited from TextDataAugmentation; it always returns True.
my_tda.isTextDataAugmentation()

True

In [146]:
# `name` is assigned in TextDataAugmentation.__init__; NoiseInjection does not override it.
my_tda.name

'TextDataAugmentation'

### Experiment - word level noise injection

In [147]:
# Show the original word, then three independent noisy versions of it
word = 'augmentation'
print(word)
for _ in range(3):
    w = my_tda.noise_char(word)
    print(w)

augmentation
augmenhation
augomentation
atgmentation


### Experiment - sentence level noise injection

In [175]:
# Probability used by inject_noise to noise a token longer than 3 characters.
my_tda.NOISE_PERCENT_PER_LONG_WORD

0.1

In [176]:
# Sample sentence used as input of the noise injection experiments.
original_text = "to be or not to be that is the question"

In [177]:
# One random noisy version of the sentence (the result changes from run to run).
my_tda.inject_noise(original_text)

'to be or not to be hat is the question'

### Experiment - paraphrases generation

In [178]:
# Dict {sentence: weight} holding at most PARAPHRASES_NUMBER_MAX distinct noisy sentences.
my_tda.generate_paraphrases(original_text)

{'to be or not to be that is the question': 1,
 'to be or not to be thadt is the question': 1,
 'to be or not to be thdat is the question': 1,
 'to be or not to be tha is the question': 1,
 'to be or not to be that is the queswtion': 1}

### Experiment - Text Data Augmentation by extensive paratexts generation using Noise injection

In [None]:
# Augments the training reviews with the default augmentation_factor (5) and
# writes the generated files into the augmented sub-directories.
my_tda.augment_text_data()

    Augmenting... DATA/neg/cv676_22202.txt
    Paratext Dict created for the document
    Text Data Augmentation done for the document
    Saved: DATA/noiseinjection_aug_neg/ampl_cv676_22202_0.txt
    Elapsed time: 0.054947464959695935
    -----
    Augmenting... DATA/neg/cv839_22807.txt
    Paratext Dict created for the document
    Text Data Augmentation done for the document
    Saved: DATA/noiseinjection_aug_neg/ampl_cv839_22807_0.txt
    Elapsed time: 0.034176019951701164
    -----
    Augmenting... DATA/neg/cv155_7845.txt
    Paratext Dict created for the document
    Text Data Augmentation done for the document
    Saved: DATA/noiseinjection_aug_neg/ampl_cv155_7845_0.txt
    Elapsed time: 0.011110538151115179
    ...