In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from torchtext.legacy import data
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy.datasets import IMDB
from torchtext.legacy.data import Field, LabelField, BucketIterator

In [4]:
import googletrans
from googletrans import Translator

In [1]:
from tqdm import tqdm
import random
import sys
import pandas as pd
import numpy as np
import os
import random
import torch, torchtext
import os
import googletrans
from googletrans import Translator

In [26]:
import os
# Walk the dataset directory tree and print the path of every file found in it.
# os.walk yields (directory, sub-directories, file names); the sub-directory
# list is not needed here, hence the throw-away `_`.
for dirname, _, filenames in os.walk('stanfordSentimentTreebank'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

stanfordSentimentTreebank/SOStr.txt
stanfordSentimentTreebank/sentiment_labels.txt
stanfordSentimentTreebank/README.txt
stanfordSentimentTreebank/original_rt_snippets.txt
stanfordSentimentTreebank/datasetSplit.txt
stanfordSentimentTreebank/dictionary.txt
stanfordSentimentTreebank/STree.txt
stanfordSentimentTreebank/datasetSentences.txt
stanfordSentimentTreebank/.ipynb_checkpoints/README-checkpoint.txt
stanfordSentimentTreebank/.ipynb_checkpoints/datasetSplit-checkpoint.txt
stanfordSentimentTreebank/.ipynb_checkpoints/datasetSentences-checkpoint.txt
stanfordSentimentTreebank/.ipynb_checkpoints/sentiment_labels-checkpoint.txt


In [2]:
def get_phrase_sentiments(base_directory):
    """Build a per-phrase sentiment table from the treebank files.

    Reads ``dictionary.txt`` (renamed to columns ``phrase``/``id``) and
    ``sentiment_labels.txt`` (renamed to ``id``/``sentiment``) from
    ``base_directory``, joins them on ``id`` and adds two label columns:

    * ``fine``   -- the sentiment score cut into five 0.2-wide named bins
    * ``coarse`` -- ``fine`` collapsed to "negative" / "neutral" / "positive"

    Returns the joined DataFrame, indexed by ``id``.
    """
    coarse_of = {
        "very negative": "negative",
        "negative": "negative",
        "positive": "positive",
        "very positive": "positive",
    }

    def group_labels(label):
        # Anything that is not one of the four polar labels counts as neutral.
        return coarse_of.get(label, "neutral")

    def load_indexed(file_name, column_names):
        # NOTE(review): read_csv treats the first line of the file as a header
        # before the columns are renamed -- confirm both files really start
        # with a header row, otherwise their first record is dropped.
        frame = pd.read_csv(os.path.join(base_directory, file_name), sep="|")
        frame.columns = column_names
        return frame.set_index("id")

    phrases = load_indexed("dictionary.txt", ["phrase", "id"])
    scores = load_indexed("sentiment_labels.txt", ["id", "sentiment"])
    phrase_sentiments = phrases.join(scores)

    bin_edges = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    bin_names = ["very negative", "negative", "neutral", "positive", "very positive"]
    phrase_sentiments["fine"] = pd.cut(phrase_sentiments["sentiment"], bin_edges,
                                       include_lowest=True, labels=bin_names)
    phrase_sentiments["coarse"] = phrase_sentiments["fine"].apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    """Return the treebank sentences together with the columns of the split file.

    ``datasetSentences.txt`` (tab separated) and ``datasetSplit.txt`` (comma
    separated) are both read from ``base_directory`` with ``sentence_index``
    as index; the split columns are then joined onto the sentence rows.
    """
    sentences_path = os.path.join(base_directory, "datasetSentences.txt")
    sentence_frame = pd.read_csv(sentences_path, sep="\t", index_col="sentence_index")

    splits_path = os.path.join(base_directory, "datasetSplit.txt")
    split_frame = pd.read_csv(splits_path, index_col="sentence_index")

    return sentence_frame.join(split_frame)


def partition(base_directory):
    """Attach phrase-level sentiment to every sentence and group by split.

    Sentences are matched to phrases by exact text (``sentence`` joined on the
    ``phrase`` index). Rows without a split label are assigned label 1 before
    the column is cast to ``int``. Returns a pandas GroupBy keyed on
    ``splitset_label``.
    """
    phrases = get_phrase_sentiments(base_directory).reset_index(level=0)
    sentences = get_sentence_partitions(base_directory)

    phrase_lookup = phrases.set_index("phrase")
    # noinspection PyUnresolvedReferences
    merged = sentences.join(phrase_lookup, on="sentence")

    split_labels = merged["splitset_label"].fillna(1)
    merged["splitset_label"] = split_labels.astype(int)
    # Disabled clean-up step kept from the original for reference:
    # data["sentence"] = data["sentence"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return merged.groupby("splitset_label")

In [14]:
def back_translate(sequence, lab, PROB=1, translator=None):
    """Paraphrase ``sequence`` by translating it to a random language and back.

    Args:
        sequence: Text to augment.
        lab: Label that belongs to ``sequence``; returned unchanged.
        PROB: Probability of applying the augmentation. The sample is
            augmented when a uniform draw from [0, 1) is ``<= PROB``.
        translator: Optional object providing ``detect(text).lang`` and
            ``translate(text, dest=...).text``. A new ``Translator()`` is
            created when omitted; pass one in to reuse a client across calls.

    Returns:
        ``(text, lab)`` where ``text`` is the back-translated sequence, or the
        original ``sequence`` when the draw rejects the sample or the detected
        language is not in the supported list.
    """
    languages = ['en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el']

    # Decide first whether to augment at all, so a rejected sample does not
    # trigger any translation calls.
    if not (np.random.uniform(0, 1) <= PROB):
        return sequence, lab

    if translator is None:
        translator = Translator()

    # Remember the source language so we can translate back into it.
    org_lang = translator.detect(sequence).lang

    # If the detected language is not in our list of languages, do nothing.
    if org_lang not in languages:
        return sequence, lab

    # Compare by value (`!=`), not identity (`is not`): two equal strings are
    # not guaranteed to be the same object, so the identity test could let the
    # source language through as the "random" pivot language.
    candidates = [lang for lang in languages if lang != org_lang]
    random_lang = str(np.random.choice(candidates))

    # Translate to the pivot language and back to the original language.
    translated = translator.translate(sequence, dest=random_lang).text
    translated_back = translator.translate(translated, dest=org_lang).text

    return translated_back, lab


def random_deletion(words, lab, p=0.5):
    """Drop each token independently with probability ``p``.

    Args:
        words: List of tokens.
        lab: Label that belongs to the tokens; returned unchanged.
        p: Deletion probability per token (a token is kept when a uniform
            draw from [0, 1] is greater than ``p``).

    Returns:
        ``(tokens, lab)``. Inputs with zero or one token are returned as is.
        If every token would be deleted, a single randomly chosen token is
        kept so the result is never empty for a non-empty input.
    """
    # Nothing sensible to delete. Always return the (tokens, label) pair so
    # callers can unpack the result regardless of input length.
    if len(words) <= 1:
        return words, lab

    remaining = [word for word in words if random.uniform(0, 1) > p]

    if not remaining:  # everything was dropped: fall back to one random token
        return [random.choice(words)], lab
    return remaining, lab

def random_swap(sentence, lab, n=5):
    """Swap two randomly chosen token positions, ``n`` times.

    Args:
        sentence: List of tokens.
        lab: Label that belongs to the tokens; returned unchanged.
        n: Number of swaps to perform.

    Returns:
        ``(tokens, lab)`` where ``tokens`` is a new list; the caller's list is
        left untouched. Inputs with fewer than two tokens are returned as an
        unchanged copy because there is nothing to swap.
    """
    # Work on a copy so the caller's token list is not modified.
    shuffled = list(sentence)

    # random.sample needs at least two positions to draw from.
    if len(shuffled) < 2:
        return shuffled, lab

    positions = range(len(shuffled))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        shuffled[idx1], shuffled[idx2] = shuffled[idx2], shuffled[idx1]
    return shuffled, lab

In [3]:
# Write one CSV per dataset split (train.csv / test.csv / dev.csv) into the
# output directory.
base_directory, output_directory = 'stanfordSentimentTreebank','./'
os.makedirs(output_directory, exist_ok=True)

SPLIT_NAMES = {1: "train", 2: "test", 3: "dev"}

# The loop variable is deliberately NOT called `partition`: reusing that name
# would rebind the partition() function to a DataFrame and make this cell (and
# any later call to partition) fail on a second run.
for splitset, split_frame in partition(base_directory):
    split_name = SPLIT_NAMES[splitset]
    filename = os.path.join(output_directory, "%s.csv" % split_name)
    # The split label is constant within a group, so it is dropped before saving.
    split_frame.drop(columns=["splitset_label"]).to_csv(filename)

In [4]:
def discretize_label(label):
    """Map a numeric label onto one of five class names.

    The label is compared against ascending, inclusive upper bounds
    (``0.05*100``, ``0.1*100``, ``0.15*100``, ``0.2*100``); the first bound it
    does not exceed decides the class. Values above every bound -- and values
    such as NaN that fail every comparison -- fall into ``'Class5'``.
    """
    upper_bounds = (
        (0.05*100, 'Class1'),
        (0.1*100, 'Class2'),
        (0.15*100, 'Class3'),
        (0.2*100, 'Class4'),
    )
    for bound, class_name in upper_bounds:
        if label <= bound:
            return class_name
    return 'Class5'

In [5]:
# Load the three per-split CSV files from the working directory. No index
# column is specified, so each frame gets a fresh 0..n-1 index and every CSV
# column (including any saved index) is read as regular data.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
dev_data = pd.read_csv('dev.csv')

In [6]:
# Stack the train and test splits into a single frame (dev_data is left out).
# NOTE(review): this folds the test split into the frame that is later used
# for augmentation -- confirm that is intended.
Train_df = pd.concat([train_data,test_data])

In [8]:
# Renumber the rows 0..n-1 and discard the old index values left over from
# the concatenation.
Train_df = Train_df.reset_index(drop=True)

In [9]:
# Show (rows, columns) of the combined frame as the cell output.
Train_df.shape

(10754, 6)

In [12]:
def random_pick(df):
    """Return the ``(sentence, sentiment)`` pair of one uniformly random row.

    Args:
        df: DataFrame with ``sentence`` and ``sentiment`` columns.

    Returns:
        Tuple ``(sentence, sentiment)`` taken from the same randomly chosen row.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if df.shape[0] == 0:
        raise ValueError("random_pick() needs a DataFrame with at least one row")

    # Draw the row position at random. The previous version returned from the
    # first iteration of a loop and therefore always yielded row 0.
    num = np.random.randint(0, df.shape[0])

    # Positional access so the pick also works when the index is not 0..n-1.
    return df.sentence.iloc[num], df.sentiment.iloc[num]

In [17]:
# Load 'all_transforms.csv', using its first column as the index.
# NOTE(review): presumably the rows produced by an earlier augmentation run,
# since the augmentation loop appends new rows to this frame -- confirm.
df2 = pd.read_csv('all_transforms.csv',index_col=[0])

In [None]:
# Augment sentences with three transforms (back-translation, random deletion,
# random swap). Every iteration adds three (sentence, label) rows to the rows
# already held in df2, and the result is checkpointed to OUTPUT_CSV.
RESUME_FROM = 5492+1057   # NOTE(review): presumably the iterations finished by earlier runs -- confirm
CHECKPOINT_EVERY = 25     # iterations between two CSV checkpoints
OUTPUT_CSV = 'all_transforms_1.csv'

df = Train_df.copy()
pbar = tqdm(range(RESUME_FROM,df.shape[0]))
count = len(df)
print (f'Before the shape was :{len(df)}' )

aug_data = []
aug_label = []

previous_rows = df2   # rows that existed before this run started
new_frames = []       # one three-row frame per finished iteration

try:
    for i in pbar:
        # Updated inside the loop so the bar shows the iteration in progress
        # (it used to run only once, after the loop had finished).
        pbar.set_description(desc = f'Loop:{i}')

        word,val = random_pick(df)
        word1,val1 = back_translate(word,val)

        word,val = random_pick(df)
        word = word.split()
        word2,val2 = random_deletion(word,val)
        word2 = ' '.join(word2)

        word,val = random_pick(df)
        word = word.split()
        word3,val3 = random_swap(word,val)
        word3 = ' '.join(word3)

        ins = {'sentence':[word1,word2,word3],'label':[val1,val2,val3]}
        # Collect the small frames and concatenate them in one go instead of
        # growing df2 (and rewriting the whole CSV) on every iteration.
        new_frames.append(pd.DataFrame(ins))

        if len(new_frames) % CHECKPOINT_EVERY == 0:
            df2 = pd.concat([previous_rows, *new_frames])
            df2.to_csv(OUTPUT_CSV)
finally:
    # Runs on normal completion as well as on errors or a manual interrupt, so
    # the rows of every finished iteration end up in df2 and on disk.
    if new_frames:
        df2 = pd.concat([previous_rows, *new_frames])
        df2.to_csv(OUTPUT_CSV)
    

  0%|          | 0/4205 [00:00<?, ?it/s]

Before the shape was :10754


 47%|████▋     | 1979/4205 [57:19<1:05:38,  1.77s/it]

In [None]:
  0%|          | 0/4205 [00:00<?, ?it/s]
Before the shape was :10754
 30%|██▉       | 1258/4205 [36:24<1:25:02,  1.73s/it]