# Textual Data Amplification (TDA) Experiments - Prototype 8

## Paraphrase Generation - Backtranslation

## Movie Reviews Data

## Sentiment polarity prediction task

### Python Basic Packages

In [35]:
import os
import re
import sys
import numpy as np
import pandas as pd
import matplotlib
import nltk
import IPython
import sklearn

# Versions of the packages, with minimum-version checks for Python (>= 2.6)
# and NumPy (>= 1.6).
# Versions are compared as integer tuples: folding "major + minor/10" into a
# float mis-ranks versions (0.16 would become 1.6, and a minor of 10 would
# count as a whole extra major version).
print('Python: ', sys.version, sys.version_info >= (2, 6))
print('NumPy: ', np.__version__, tuple(int(part) for part in np.__version__.split('.')[:2]) >= (1, 6))
print('Pandas: ', pd.__version__)
print('Matplotlib: ', matplotlib.__version__)
print('NLTK: ', nltk.__version__)
print('IPython: ', IPython.__version__)
print('skikit-learn', sklearn.__version__)

Python:  3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 03:03:55) 
[GCC 4.2.1 (Apple Inc. build 5666) (dot 3)] True
NumPy:  1.16.2 True
Pandas:  0.24.2
Matplotlib:  3.0.3
NLTK:  3.4
IPython:  7.4.0
skikit-learn 0.20.3


### Data

#### Movie Review Data
http://www.cs.cornell.edu/people/pabo/movie-review-data/

#### Download link
http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

#### Data files structure

Create a directory for the data (DATA), then create sub-directories for the negative reviews (neg) and the positive reviews (pos).

You should also create two sub-directories for the augmented data (backtranslation_aug_neg and backtranslation_aug_pos) and a cutoff_model sub-directory which contains CuttOff_LogisticRegression_model.pkl

## Text Data Augmentation Object
### Generic Class Definition

In [36]:
import re
from nltk import wordpunct_tokenize
from nltk import sent_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from os import listdir
import random
import timeit
import numpy as np
from scipy.stats import norm

class TextDataAugmentation(object):
    """Generic base class for textual data augmentation by paraphrasing.

    A document is split into sentences; each sentence gets a dictionary of
    candidate paraphrases (the original sentence is always candidate 0);
    new documents ("paratexts") are then sampled by picking one candidate
    per sentence.

    Sub classes are expected to override ``generate_paraphrases`` to supply
    the candidates; the base implementation supplies none.
    """

    # Constructor
    def __init__(self, data_dir,
                 neg_rev, pos_rev, aug_neg_rev, aug_pos_rev,
                 paraphr_nbr_max=5, paratext_nbr_max = 5
                ):
        """Store directory names and sampling limits.

        data_dir         -- root data directory (path prefix, concatenated as-is)
        neg_rev, pos_rev -- sub-directories of the original reviews
        aug_neg_rev, aug_pos_rev -- sub-directories receiving the augmented reviews
        paraphr_nbr_max  -- maximum number of paraphrases kept per sentence
        paratext_nbr_max -- maximum number of paratexts generated per document
        """
        self.DATA_DIR = data_dir
        self.NEG_REVIEWS = neg_rev
        self.POS_REVIEWS = pos_rev
        self.AUGMENTED_NEG_REVIEWS = aug_neg_rev
        self.AUGMENTED_POS_REVIEWS = aug_pos_rev
        self.name = "TextDataAugmentation"
        self.encoding = 'utf-8'
        self.SENTENCE_LENGTH_MAX = 1000
        self.PARAPHRASES_NUMBER_MAX = paraphr_nbr_max
        # Fixed: this limit used to be set from paraphr_nbr_max, so the
        # paratext_nbr_max argument was silently ignored.
        self.PARATEXTS_NUMBER_MAX = paratext_nbr_max
        # Sampling stops once the number of keys already drawn exceeds this
        # fraction of all possible combinations (see generate_sampling).
        self.PARATEXT_ITERATION_LIMIT = 0.50
        self.TRACE_RANDOM_KEY = False
        self.TRACE_COMBINATIONS = False

    # To get the name of this object
    def getName(self):
        """Return the ``name`` attribute set by the constructor."""
        return self.name

    # To check if this object is TextDataAugmentation class
    def isTextDataAugmentation(self):
        """Return True (marker method for this class and its sub classes)."""
        return True

    # load doc into memory
    def load_doc(self, file_path_with_name):
        """Read and return the whole content of a text file (``self.encoding``)."""
        # open the file as read only
        with open(file_path_with_name, 'r', encoding=self.encoding) as input_file:
            # read all text
            text = input_file.read()
        return text

    # save doc to file
    def save_doc(self,file_name_with_path,text):
        """Write ``text`` to a file (``self.encoding``), replacing any existing content."""
        # open the file as write only
        with open(file_name_with_path, 'w', encoding=self.encoding) as ouput_file:
            # write text
            ouput_file.write(text)

    def tokenizeTextString(self, text_string):
        """Replace a few punctuation characters by spaces (double quotes are
        removed), then tokenize with NLTK ``wordpunct_tokenize``."""
        return wordpunct_tokenize(text_string.replace('_',' ').replace('(',' ').replace(')',' ').replace("'",' ').replace('"','').replace('/',' ').replace("\\", " ").replace('  ',' '))

    def standardize_text(self, raw_input_text):
        """Lower-case the text, re-attach " n't" and " 's" to the preceding
        word, tokenize it and return the tokens joined by single spaces."""
        raw_input_text = raw_input_text.lower().replace(" n't","n't").replace(" 's","'s")
        return ' '.join(self.tokenizeTextString(raw_input_text))

    def sentence_length(self, sentence):
        """Return the number of space-separated pieces in ``sentence``."""
        return len(sentence.split(" "))

    def get_heuristic_weigth(self, original_sentence, new_paraphrase):
        """Return the sampling weight of a paraphrase; constant 1 here."""
        return 1

    def add_new_paraphrase(self, paraphrases, original_sentence, new_paraphrase):
        """Return ``{new_paraphrase: weight}`` when the paraphrase is not yet a
        key of ``paraphrases``, otherwise an empty dict. ``paraphrases`` itself
        is not modified."""
        if new_paraphrase in paraphrases:
            return {}
        return {new_paraphrase: self.get_heuristic_weigth(original_sentence,new_paraphrase)}

    def splitted(self, document):
        """Return True when the document contains at least one newline."""
        return '\n' in document

    def sentences_splitting(self, document):
        """Return the list of sentences of ``document``."""
        if self.splitted(document):
            # returns a list of sentences already splitted
            # and marked by a \n tag
            return document.split('\n')
        else:
            return sent_tokenize(document)

    def evaluate_combinations_number(self, paratext_dict):
        """Return the product of the number of candidates of every sentence,
        i.e. the number of distinct paratexts that can be assembled."""
        combinations_nbr = 1
        for sentence_index in paratext_dict.keys():
            combinations_nbr *= len(paratext_dict[sentence_index])
        return combinations_nbr

    # Normalizing the replacement weights list in order to get probabilities
    def normalize_weights_list(self, weights_list):
        """Divide every weight by the sum of the weights."""
        weights_sum = sum(weights_list)
        # Avoid division by 0
        if weights_sum == 0:
            weights_sum = 1
        return [a_weight/weights_sum for a_weight in weights_list]

    @staticmethod
    def get_an_integer_normaly_draw_from_n_range(n):
        """Draw an integer in [1, n] from a normal law (mean n/2, std n/5),
        rejecting out-of-range draws; return 0 when ``n`` is not positive.

        Fixed: the method had no ``self`` parameter although it is called as
        ``self.get_an_integer_normaly_draw_from_n_range(...)`` (TypeError); a
        static method keeps both instance and class-level calls working.
        Fixed: the rejection test ``not (d >= 1) and (d <= n)`` did not
        enforce the upper bound because ``not`` binds tighter than ``and``.
        """
        if n <= 0:
            return 0
        if n == 1:
            # 1 is the only admissible value; return it directly rather than
            # rejecting almost every draw from N(0.5, 0.2).
            return 1
        mean = n/2.0
        std = n/5.0
        while True:
            # Inverse-CDF sampling of the normal law, truncated to an integer.
            draw_number = int(norm.ppf(np.random.random(1), loc=mean, scale=std).astype(int)[0])
            if 1 <= draw_number <= n:
                return draw_number

    # draw a random number of replacements among all possible replacements
    # distribution: 'left_skewed', 'right_skewed', 'uniform'
    def get_variants_using_random_distribution(self, paratext_dict,distribution='uniform'):
        """Choose which sentences will be replaced by a paraphrase.

        Only sentences having more than one candidate are eligible. The number
        of sentences chosen is drawn with
        ``get_an_integer_normaly_draw_from_n_range``; the sentences are then
        drawn without replacement, with position weights growing with the
        position ('left_skewed'), decreasing with it ('right_skewed') or
        constant (any other value). Returns a list of 1-based sentence indexes.
        """
        variants_index = [n for n in range(1,len(paratext_dict)+1) if len(paratext_dict[str(n)])>1]
        if len(variants_index) > 0:
            variants_quantity = self.get_an_integer_normaly_draw_from_n_range(len(variants_index))
            if distribution == 'left_skewed':
                distribution_weights = [2*n for n in range(1,len(variants_index)+1)]
            elif distribution == 'right_skewed':
                distribution_weights = [2/n for n in range(1,len(variants_index)+1)]
            else: # uniform
                distribution_weights = [1 for n in range(1,len(variants_index)+1)]
            normalized_distribution_weights = self.normalize_weights_list(distribution_weights)
            if self.TRACE_RANDOM_KEY:
                print("len(variants_index):",len(variants_index),"variants_quantity:",variants_quantity)
            # Safety net: never ask for more items than are available
            if variants_quantity > len(variants_index):
                variants_quantity = len(variants_index)
                if self.TRACE_RANDOM_KEY:
                    print("New variants_quantity:",variants_quantity)
            randomized_variants = np.random.choice(variants_index,\
                                                   size=variants_quantity,\
                                                   replace=False,\
                                                   p=normalized_distribution_weights)
            if self.TRACE_RANDOM_KEY:
                print("*** variants_quantity: ", variants_quantity)
                print("*** randomized_variants: ",randomized_variants)
                print("distribution_weights: ",distribution_weights)
                print("normalized_distribution_weights: ",normalized_distribution_weights)
                print("variants_quantity:",variants_quantity,"len(paratext_dict):",len(paratext_dict),"len(variants_index)",len(variants_index))
        else:
            randomized_variants = []
        return list(randomized_variants)

    # Choose randomly one variant for a part among all possible variants based on heuristic weights
    def get_one_variant_using_heuristic_weights(self, paraphrase_dict,part_index):
        """Return the position (0-based) of one candidate of sentence
        ``part_index``, drawn with the candidates' heuristic weights."""
        # Choose the replacement among all the possible replacements based on heuristic weights
        variants = list(range(0,len(paraphrase_dict[str(part_index)])))
        variants_heuristic_weights = list(paraphrase_dict[str(part_index)].values())
        if len(variants_heuristic_weights)> 1:
            a_variant = random.choices(population=variants,\
                                       weights=variants_heuristic_weights,k=1)[0]
        else:
            a_variant = variants[0]
        return a_variant

    def generate_sampling_random_key(self, paratext_dict,distribution='uniform'):
        """Build a random key of the form "1-v1_2-v2_..." giving, for every
        sentence index, the position of the candidate to use (0 = original)."""
        random_part_key = ""
        # Choose randomly the variants based on a distribution
        variants_indexes = self.get_variants_using_random_distribution(paratext_dict,distribution=distribution)
        if self.TRACE_RANDOM_KEY:
            print("variants_indexes: ",variants_indexes)
        for part_index in range(1,len(paratext_dict)+1):
            if part_index in variants_indexes:
                if self.TRACE_RANDOM_KEY:
                    print("part_index found: ",part_index)
                # Choose randomly one variant for a precise part among all possible variants based on heuristic weights
                a_variant = self.get_one_variant_using_heuristic_weights(paratext_dict,part_index)
                if self.TRACE_RANDOM_KEY:
                    print("a_variant: ",a_variant)
                random_part_key = random_part_key + str(part_index) + "-" + str(a_variant) + "_"
            else:
                random_part_key = random_part_key + str(part_index) + "-0" + "_"
        # drop the trailing "_"
        return random_part_key[:-1]

    def generate_new_unique_key(self, paratext_dict,keys_memory,distribution="uniform"):
        """Draw random keys until one is not in ``keys_memory``; add it to the
        set (modified in place) and return ``(keys_memory, key)``."""
        random_key = self.generate_sampling_random_key(paratext_dict,distribution)
        while random_key in keys_memory:
            random_key = self.generate_sampling_random_key(paratext_dict,distribution)
        keys_memory.add(random_key)
        return (keys_memory,random_key)

    def generate_one_random_paratext(self, paratext_dict, random_key):
        """Decode a key produced by ``generate_sampling_random_key`` into the
        list of sentences (one per key part) it designates."""
        paratext = []
        for index,key in enumerate(random_key.split("_")):
            sub_keys = key.split("-")
            paraphrase = list(paratext_dict[str(index+1)].keys())[int(sub_keys[1])]
            paratext.append(paraphrase)
        return paratext

    def generate_paraphrases(self, new_sentence):
        """Return a dict ``{paraphrase: weight}`` for ``new_sentence``.

        The generic class produces no paraphrase; sub classes override this.
        Fixed: the method had no ``self`` parameter and returned the sentence
        itself, which ``create_paratext_dict`` then passed to ``dict.update``.
        """
        return {}

    def create_paratext_dict(self, sentences_list):
        """Return ``{"1": candidates, "2": candidates, ...}`` where each value
        maps the cleaned original sentence (weight 1) and its paraphrases to
        their weights. Keys are 1-based sentence positions as strings."""
        sentences_number = len(sentences_list)
        paratext_dict = {}
        for sentence_index in range(sentences_number):
            # keep only words and basic punctuation, separated by spaces
            new_sentence = ' '.join(re.findall(r"[a-zA-Z'-]+|[.,;!?\'\’]+",sentences_list[sentence_index].lower()))
            paraphrases_dict = {new_sentence:1}
            if len(new_sentence) > 0:
                new_paraphrases_dict = self.generate_paraphrases(new_sentence)
                paraphrases_dict.update(new_paraphrases_dict)
            paratext_dict[str(sentence_index+1)] = paraphrases_dict
        return paratext_dict

    def generate_sampling(self, original_document, augmentation_factor):
        """Return up to ``augmentation_factor`` distinct paratexts of the
        document, each a list of sentences.

        The requested number is capped by the number of possible combinations
        and by ``PARATEXTS_NUMBER_MAX``; sampling also stops early once more
        than ``PARATEXT_ITERATION_LIMIT`` of all combinations have been drawn.
        """
        splitted_document = self.sentences_splitting(original_document)
        paratext_dict = self.create_paratext_dict(splitted_document)
        print("Paratext Dict created for the document")
        combinations_number_max = self.evaluate_combinations_number(paratext_dict)
        if augmentation_factor > combinations_number_max:
            print("*** WARNING! Desired paratexts_sample number: " + str(augmentation_factor) + " exceeds Combinations Number Max, which is: ", combinations_number_max)
            augmentation_factor = combinations_number_max
        if augmentation_factor > self.PARATEXTS_NUMBER_MAX:
            # Fixed: the message referenced the bare name PARATEXTS_NUMBER_MAX (NameError)
            print("*** WARNING! Desired paratexts_sample number: " + str(augmentation_factor) + " exceeds PARATEXTS_NUMBER_MAX, which is: ", self.PARATEXTS_NUMBER_MAX)
            augmentation_factor = self.PARATEXTS_NUMBER_MAX
        paratexts_sample = []
        keys_memory = set()
        while len(paratexts_sample) < augmentation_factor:
            # Stop before unused keys become too rare to be found by random draws
            if len(keys_memory) > combinations_number_max * self.PARATEXT_ITERATION_LIMIT:
                break
            keys_memory, random_key = self.generate_new_unique_key(paratext_dict,keys_memory,distribution='uniform')
            new_paratext = self.generate_one_random_paratext(paratext_dict,random_key)
            paratexts_sample.append(new_paratext)
        print("Text Data Augmentation done for the document")
        return paratexts_sample

    @staticmethod
    def get_datafile_id(filename):
        """Return the last three characters of the second-to-last
        "_"-separated part of ``filename``.

        Fixed: declared without ``self``; a static method makes it callable
        from an instance as well as from the class.
        """
        return filename.split("_")[-2][-3:]

    def augment_text_data(self, augmentation_factor=5):
        """Generate and save paratexts for every training review.

        For each ".txt" file of the negative and positive review directories
        whose name does not start with 'cv9', up to ``augmentation_factor``
        paratexts are written to the matching augmented directory as
        "ampl_<original name>_<index>.txt".
        """
        is_train = True
        # WARNING - many lines of code below are data source specific !
        for sub_directory in [self.NEG_REVIEWS,self.POS_REVIEWS]:
            # Fixed: used the notebook-level global data_dir instead of the
            # directory given to the constructor.
            data_directory = self.DATA_DIR + sub_directory
            for filename in listdir(data_directory):
                start_time = timeit.default_timer()
                # skip files that do not have the right extension
                # WARNING - data source specific
                if not filename.endswith(".txt"):
                    continue
                # Processing training set, so skip any reviews in the test set
                # which start with 'cv9' - WARNING - data source specific
                if is_train and filename.startswith('cv9'):
                    continue
                # Processing test set, so skip any reviews in the training set
                # which not start with 'cv9' - WARNING - data source specific
                if not is_train and not filename.startswith('cv9'):
                    continue
                # create the full path of the file to open
                file_path = data_directory + filename
                # load the original document
                original_text = self.load_doc(file_path)
                print("Augmenting...", file_path)
                paratexts = self.generate_sampling(original_text,augmentation_factor=augmentation_factor)
                for index,paraphrases in enumerate(paratexts):
                    new_text = " \n".join(paraphrases)
                    new_filename = "ampl_" + filename.split(".")[0] + "_" + str(index)+ '.txt'
                    if sub_directory == self.NEG_REVIEWS:
                        new_file_path = self.DATA_DIR+self.AUGMENTED_NEG_REVIEWS+new_filename
                        self.save_doc(new_file_path,new_text)
                    else:
                        new_file_path = self.DATA_DIR+self.AUGMENTED_POS_REVIEWS+new_filename
                        self.save_doc(new_file_path,new_text)
                    # progress report every 10th paratext of a document
                    if (index % 10) == 0:
                        print("Saved:",new_file_path)
                        end_time = timeit.default_timer()
                        print("Elapsed time: {}".format(end_time - start_time))
                        print("-----")

## Backtranslation TDA Object
### Sub Class Definition

IMPORTANT: Before running the next cell, obtain your own Google Translate developer API key and put it in place of the placeholder string.

The line of code to edit is: self.api_key = 'YOUR OWN GOOGLE TRANSLATE DEVELOPER KEY'

In [37]:
from sklearn.externals import joblib
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import html
import urllib
import numpy as np


# Inherited or Sub class (TextDataAugmentation) 
class Backtranslation(TextDataAugmentation): 
    """Text data augmentation by backtranslation.

    A sentence is translated to a pivot language and back to the source
    language; the result is kept as a paraphrase when it differs from the
    source and passes a cut-off test (logistic regression model, BLEU
    similarity or length similarity).
    """

    # Constructor 
    def __init__(self,cutoff_model_filepath,
                 data_dir,neg_rev, pos_rev,aug_neg_rev, aug_pos_rev,
                 paraphr_nbr_max=5, paratext_nbr_max = 5,): 
        """Load the cut-off model and set up translation settings.

        cutoff_model_filepath -- path of the serialized cut-off model, loaded
                                 with joblib
        The other arguments are forwarded to the TextDataAugmentation
        constructor.
        """
        # Thresholds used by the "length_similarity" and "bleu_similarity" cut-off methods
        self.SIMILARITY_LENGTH_MIN = 0.6
        self.SIMILARITY_BLEU_MIN = 0.6
        self.CUTOFF_MODEL_FILEPATH = cutoff_model_filepath
        # Less accurate translation languages were commented
        self.Google_Translate_languages = {
            #'Afrikaans':'af',
            #'Albanian':'sq',
            #'Arabic':'ar',
            'Belarusian':'be',
            'Bulgarian':'bg',
            #'Catalan':'ca',
            'ChineseSimplified':'zh-CN',
            'ChineseTraditional':'zh-TW',
            #'Croatian':'hr',
            #'Czech':'cs',
            'Danish':'da',
            'Dutch':'nl',
            #'English':'en',
            #'Estonian':'et',
            #'Filipino':'tl',
            #'Finnish':'fi',
            'French':'fr',
            'Galician':'gl',
            'German':'de',
            #'Greek':'el',
            'Hebrew':'iw',
            #'Hindi':'hi',
            #'Hungarian':'hu',
            #'Icelandic':'is',
            #'Indonesian':'id',
            #'Irish':'ga',
            'Italian':'it',
            'Japanese':'ja',
            #'Korean':'ko',
            #'Latvian':'lv',
            #'Lithuanian':'lt',
            #'Macedonian':'mk',
            #'Malay':'ms',
            #'Maltese':'mt',
            'Norwegian':'no',
            #'Persian':'fa',
            'Polish':'pl',
            'Portuguese':'pt',
            'Romanian':'ro',
            'Russian':'ru',
            #'Serbian':'sr',
            #'Slovak':'sk',
            'Slovenian':'sl',
            'Spanish':'es',
            #'Swahili':'sw',
            'Swedish':'sv',
            # 'Thai':'th',
            'Turkish':'tr',
            'Ukrainian':'uk',
            'Vietnamese':'vi',
            #'Welsh':'cy',
            #'Yiddish':'yi',
            } 
        self.language_codes = list(self.Google_Translate_languages.values())
        # Translate API key
        self.api_key = 'YOUR OWN GOOGLE TRANSLATE DEVELOPER KEY'
        # NOTE(review): this header dict is defined but not used by translate()
        self.agent = {
            'User-Agent':"Mozilla/4.0 (\
            compatible;\
            MSIE 6.0;\
            Windows NT 5.1;\
            SV1;\
            .NET CLR 1.1.4322;\
            .NET CLR 2.0.50727;\
            .NET CLR 3.0.04506.30\
            )"}
        # WARNING: joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files from a trusted source.
        self.logreg_model = joblib.load(self.CUTOFF_MODEL_FILEPATH)
        # Integer code of each language tag, used as a feature of the cut-off model.
        # NOTE(review): 'zh+AC0-CN' / 'zh+AC0-TW' look like UTF-7-escaped forms of
        # 'zh-CN' / 'zh-TW'; presumably kept to match the codes the model was
        # trained with - TODO confirm before changing.
        self.conversion_langtag_to_int = {'be': 0, 'bg': 1, 'zh+AC0-CN': 2, 'zh+AC0-TW': 3, 'da': 4,\
                                          'nl': 5, 'fr': 6,'gl': 7, 'de': 8,\
                                          'iw': 9, 'it': 10, 'ja': 11, 'no': 12,\
                                          'pl': 13, 'pt': 14, 'ro': 15, 'ru': 16,\
                                          'sl': 17, 'es': 18,'sv': 19, 'tr': 20,\
                                          'uk': 21, 'vi': 22, 'zh-CN': 23, 'zh-TW': 24
                                         }
        super(Backtranslation, self).__init__(data_dir, 
                                              neg_rev, pos_rev, 
                                              aug_neg_rev, aug_pos_rev,
                                              paraphr_nbr_max, paratext_nbr_max
                                             )

    # To check if this object is Backtranslation class 
    def isBacktranslation(self): 
        """Return True (marker method for this class)."""
        return True

    # To get ModelPath 
    def getModelPath(self): 
        """Return the path of the cut-off model given to the constructor.

        Fixed: used to return ``self.model_path``, an attribute that is never
        set (AttributeError).
        """
        return self.CUTOFF_MODEL_FILEPATH 

    # http://docs.python-requests.org/en/master/user/quickstart/#raw-response-content
    # Requests is an elegant and simple HTTP library for Python, built for human beings. 
    # https://www.peterbe.com/plog/best-practice-with-retries-with-requests
    def requests_retry_session(self,session=None):
        """Return a requests session (the given one, or a new one) whose http
        and https adapters retry up to 3 times, with back-off, on connection
        and read errors and on status codes 500, 502 and 504."""
        session = session or requests.Session()
        retry = Retry(
            total = 3,
            read = 3,
            connect = 3,
            backoff_factor = 0.3,
            status_forcelist = (500, 502, 504),
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session

    def translate(self, text, to_language="auto", from_language="auto"):
        """Translate ``text`` by querying the Google Translate mobile page.

        The translation is scraped from the first ``class="t0"`` element of
        the response and HTML-unescaped. Returns "" when no such element is
        found.
        """
        text = urllib.parse.quote(text)
        link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s&key=%s" % (to_language, from_language, text, self.api_key)
        session = requests.Session()
        # NOTE(review): the basic-auth credentials and the 'x-test' header look
        # like placeholders copied from an example - confirm whether they are needed.
        session.auth = ('user', 'pass')
        session.headers.update({'x-test': 'true'})
        response = self.requests_retry_session(session=session).get(link)
        re_result = re.findall(r'class="t0">(.*?)<', response.text)
        if (len(re_result) == 0):
            result = ""
        else:
            result = html.unescape(re_result[0])
        return (result)

    def get_BLEU_similarity(self, sentence1, sentence2):
        """Return the smoothed (method4) sentence-level BLEU score of
        ``sentence2`` against the reference ``sentence1``; both sentences are
        split on whitespace."""
        bleu_score = sentence_bleu([sentence1.split()], sentence2.split(), smoothing_function=SmoothingFunction().method4)
        return bleu_score

    def get_simple_similarity(self, sentence1, sentence2):
        """Return 1 - |len2 - len1| / len1, the lengths being counted in
        whitespace-separated words.

        Fixed: an empty ``sentence1`` raised ZeroDivisionError; it now yields
        1.0 when ``sentence2`` is empty as well and 0.0 otherwise.
        """
        length_sentence1 = len(sentence1.split())
        length_sentence2 = len(sentence2.split())
        if length_sentence1 == 0:
            return 1.0 if length_sentence2 == 0 else 0.0
        return 1- abs(length_sentence2 - length_sentence1)/length_sentence1

    def get_prediction_LogisticRegression_model(self, target_language_tag, source_sentence, backtranslation):
        """Return the cut-off model prediction for one backtranslation.

        The feature vector is [language code, source length, backtranslation
        length, BLEU score]. Raises KeyError when the language tag is not in
        ``conversion_langtag_to_int``.
        """
        target_language_int = self.conversion_langtag_to_int[target_language_tag]
        source_length = len(source_sentence.split())
        backtranslation_length = len(backtranslation.split())
        bleu_score = self.get_BLEU_similarity(source_sentence,backtranslation)
        new_data_array = np.array([target_language_int,source_length,backtranslation_length,bleu_score])
        # the model expects a 2-D array: one row, four features
        return self.logreg_model.predict(new_data_array.reshape(1, -1))[0]

    # Decide whether a backtranslation is good enough to be kept
    def _passes_cutoff(self, cut_off_method, target_language_code, standardized_source, standardized_back_translation):
        """Return True when the backtranslation passes the chosen cut-off
        method; an unknown method accepts nothing."""
        if cut_off_method == "logistic_model":
            return self.get_prediction_LogisticRegression_model(target_language_code,standardized_source,standardized_back_translation) > 0
        if cut_off_method == "bleu_similarity":
            return self.get_BLEU_similarity(standardized_source, standardized_back_translation) >  self.SIMILARITY_BLEU_MIN
        if cut_off_method == "length_similarity":
            return self.get_simple_similarity(standardized_source, standardized_back_translation) >  self.SIMILARITY_LENGTH_MIN
        return False

    def generate_paraphrases(self, source_sentence, source_language_code="en", cut_off_method="logistic_model"):
        """Return ``{paraphrase: weight}`` obtained by backtranslating
        ``source_sentence`` through the configured pivot languages.

        cut_off_method -- "logistic_model", "bleu_similarity" or
                          "length_similarity"
        At most ``PARAPHRASES_NUMBER_MAX`` paraphrases are returned; sentences
        of ``SENTENCE_LENGTH_MAX`` pieces or more yield none. Any exception
        raised while handling one language is reported and that language is
        skipped.
        """
        standardized_source_sentence = self.standardize_text(source_sentence)
        paraphrases = {}
        # The length test does not depend on the language: do it once
        if self.sentence_length(source_sentence) >= self.SENTENCE_LENGTH_MAX:
            return paraphrases
        for target_language_code in self.language_codes:
            # Stop as soon as the quota is reached, before spending two more
            # translation requests on a result that would be discarded.
            if len(paraphrases) >= self.PARAPHRASES_NUMBER_MAX:
                break
            try:
                forward_translation = self.translate(source_sentence, target_language_code, source_language_code)
                if not forward_translation:
                    # nothing could be scraped: there is nothing to translate back
                    continue
                back_translation = self.translate(forward_translation, source_language_code, target_language_code)
                standardized_back_translation = self.standardize_text(back_translation)
                # An empty result (failed scrape) is never a valid paraphrase
                if not standardized_back_translation:
                    continue
                if standardized_source_sentence == standardized_back_translation:
                    continue
                if self._passes_cutoff(cut_off_method,target_language_code,standardized_source_sentence,standardized_back_translation):
                    paraphrases.update(self.add_new_paraphrase(paraphrases,standardized_source_sentence,standardized_back_translation))
            except Exception:
                print("Google Translate server exception",target_language_code)
                continue
        return paraphrases

### Local data directories setup

In [38]:
# Root data directory; the sub-directory names below are appended to it as-is,
# so every directory name must end with "/".
data_dir = "DATA/"
# Original negative / positive reviews
neg_rev = "neg/"
pos_rev = "pos/"
# Destination of the augmented negative / positive reviews
aug_neg_rev = "backtranslation_aug_neg/"
aug_pos_rev = "backtranslation_aug_pos/"
# Location and file name of the serialized cut-off model
model_dir = "DATA/cutoff_model/"
model_file_name = "CuttOff_LogisticRegression_model.pkl"

### Experiment - Creation of TDA Backtranslation object 

In [39]:
# Upper bounds passed to the constructor: paraphrases per sentence and
# paratexts per document.
paraphr_nbr_max = 5
paratext_nbr_max = 5
# The constructor loads the cut-off model from model_dir+model_file_name.
my_tda = Backtranslation(model_dir+model_file_name,data_dir,
                         neg_rev, pos_rev, aug_neg_rev, aug_pos_rev, 
                         paraphr_nbr_max, paratext_nbr_max)
# Marker method defined on the sub class
my_tda.isBacktranslation()

True

In [40]:
# Marker method inherited from the TextDataAugmentation base class
my_tda.isTextDataAugmentation()

True

In [41]:
# `name` is assigned by the base-class constructor, so the sub-class instance
# also reports "TextDataAugmentation".
my_tda.name

'TextDataAugmentation'

### Experiment - Cutoff Metrics

In [42]:
# Compare the three cut-off measures on one pair of sentences.
# The first sentence is used as the reference, the second as the candidate.
first_sentence = "this is a first sentence"
second_sentence = "this is a second test sentence"
print("Simple Length Similarity Cutoff:",my_tda.get_simple_similarity(first_sentence,second_sentence))
print("BLEU Metric Similarity Cutoff:",my_tda.get_BLEU_similarity(first_sentence,second_sentence))
# 'fr' is the pivot-language tag given to the logistic regression model
print("Logistic Regression Model Cuttoff:",my_tda.get_prediction_LogisticRegression_model('fr',first_sentence,second_sentence))

Simple Length Similarity Cutoff: 0.8
BLEU Metric Similarity Cutoff: 0.32756475929865714
Logistic Regression Model Cuttoff: 1


### Experiment - sentence translation

In [43]:
# Single forward translation, English -> French (requires network access)
original_text = "to be or not to be that is the question"
my_tda.translate(original_text, to_language="fr", from_language="en")

'être ou ne pas être telle est la question'

In [44]:
# Backtranslate the sentence through the configured pivot languages and keep
# the results accepted by the logistic regression cut-off model.
my_tda.generate_paraphrases(original_text, source_language_code='en', cut_off_method="logistic_model")

{'be or not to be , it is a question': 1,
 'be it or not this is the question': 1,
 'yes or no that question': 1,
 'to be or not to be , that is the question': 1,
 'to be or not to be such is the question': 1}

### Experiment - Text Data Augmentation by extensive paratexts generation using Backtranslation

This step is slow (several minutes per document in the log below); the work should be distributed over many CPUs / GPUs to reduce the processing time.

In [None]:
# Augment every training review with the default augmentation factor (5);
# the generated files are written to the augmented-data directories.
my_tda.augment_text_data()

    Augmenting... DATA/neg/cv676_22202.txt
    Google Translate server exception ja
    Google Translate server exception ru
    Google Translate server exception tr
    Paratext Dict created for the document
    Text Data Augmentation done for the document
    Saved: DATA/backtranslation_aug_neg/ampl_cv676_22202_0.txt
    Elapsed time: 707.6035030048806
    -----
    Augmenting... DATA/neg/cv839_22807.txt
    Google Translate server exception ru
    Paratext Dict created for the document
    Text Data Augmentation done for the document
    Saved: DATA/backtranslation_aug_neg/ampl_cv839_22807_0.txt
    Elapsed time: 406.81371479993686
    -----
    Augmenting... DATA/neg/cv155_7845.txt
    Paratext Dict created for the document
    Text Data Augmentation done for the document
    Saved: DATA/backtranslation_aug_neg/ampl_cv155_7845_0.txt
    Elapsed time: 247.3113233819604
    -----
    ...