### Importing Important Library

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy.datasets import IMDB
from torchtext.legacy.data import Field, LabelField, BucketIterator,Example

In [3]:
from tqdm import tqdm
import random
import sys
import pandas as pd
import numpy as np
import os
import random
import torch, torchtext
import os
import googletrans
from googletrans import Translator

### Version chcek

In [4]:
f'Torch CUDA Version :{torch.version.cuda}'
f'Torch Version :{torch.__version__}'
f'Python Version :{sys.version}'

'Torch CUDA Version :10.2'

'Torch Version :1.8.1'

'Python Version :3.8.10 (default, May 19 2021, 18:05:58) \n[GCC 7.3.0]'

### GPU Checker 

In [5]:
def gpu_check(seed_val = 1):
    print('The Seed is set to {}'.format(seed_val))
    if torch.cuda.is_available():
        print('Model will Run on CUDA.')
        print ("Type 'watch nvidia-smi' to monitor GPU\n")
        torch.cuda.manual_seed(seed_val)
        device = 'cuda'
    else:
        torch.manual_seed(seed_val)
        print ('Running in CPU')
        device = 'cpu'
    cuda = torch.cuda.is_available()
    return cuda,seed_val,device

In [6]:
cuda,SEED,device = gpu_check(seed_val=1234)

The Seed is set to 1234
Running in CPU


In [7]:
def get_merged_dataset(sst_dir):
    sentiment_labels = pd.read_csv(os.path.join(sst_dir, "sentiment_labels.txt"), sep="|")
    sentence_ids = pd.read_csv(os.path.join(sst_dir, "datasetSentences.txt"), sep="\t")
    dictionary = pd.read_csv(os.path.join(sst_dir, "dictionary.txt"), sep="|", names=['phrase', 'phrase ids'])
    train_test_split = pd.read_csv(os.path.join(sst_dir, "datasetSplit.txt"))
    sentence_phrase_merge = pd.merge(sentence_ids, dictionary, left_on='sentence', right_on='phrase')
    sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on='sentence_index')
    return pd.merge(sentence_phrase_split, sentiment_labels, on='phrase ids').sample(frac=1)

def discretize_label(label):
    if label <= 0.2: return 'Class1'
    if label <= 0.4: return 'Class2'
    if label <= 0.6: return 'Class3'
    if label <= 0.8: return 'Class4'
    return 'Class5'

In [8]:
sst_dir = 'stanfordSentimentTreebank/'
df = get_merged_dataset(sst_dir)

In [9]:
df.head()

Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values
2257,2351,Muccino seems to be exploring the idea of why ...,Muccino seems to be exploring the idea of why ...,67693,1,0.61111
7536,7892,Is the time really ripe for a warmed-over Jame...,Is the time really ripe for a warmed-over Jame...,146377,3,0.375
3081,3219,Based on Dave Barry 's popular book of the sam...,Based on Dave Barry 's popular book of the sam...,64257,1,0.73611
6530,6830,If you 're looking to rekindle the magic of th...,If you 're looking to rekindle the magic of th...,146230,1,0.26389
4611,4823,"If Signs is a good film , and it is , the esse...","If Signs is a good film , and it is , the esse...",106575,1,0.51389


In [10]:
df['label'] = df['sentiment values'].apply(discretize_label)

In [11]:
df.shape
df.label.value_counts()

(11286, 7)

Class2    2971
Class4    2966
Class3    2144
Class5    1773
Class1    1432
Name: label, dtype: int64

In [12]:
df_data = df[['sentence','label']]

In [13]:
df = df_data.reset_index(drop = True)

In [14]:
df.shape

(11286, 2)

In [15]:
df.sentence[2733]
df.label[2733]

'... begins with promise , but runs aground after being snared in its own tangled plot .'

'Class2'

In [16]:
df.head()

Unnamed: 0,sentence,label
0,Muccino seems to be exploring the idea of why ...,Class4
1,Is the time really ripe for a warmed-over Jame...,Class2
2,Based on Dave Barry 's popular book of the sam...,Class4
3,If you 're looking to rekindle the magic of th...,Class2
4,"If Signs is a good film , and it is , the esse...",Class3


In [17]:
df.head(2)

Unnamed: 0,sentence,label
0,Muccino seems to be exploring the idea of why ...,Class4
1,Is the time really ripe for a warmed-over Jame...,Class2


In [18]:
TEXT = Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
LABEL = LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)



In [19]:
fields = [('text', TEXT),('labels',LABEL)]

In [20]:
def back_translate(sequence,lab, PROB = 1):
    languages = ['en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el']
    
    #instantiate translator
    translator = Translator()
    
    #store original language so we can convert back
    org_lang = translator.detect(sequence).lang
    
    #randomly choose language to translate sequence to  
    random_lang = np.random.choice([lang for lang in languages if lang is not org_lang])
    #print(random_lang)
    if org_lang in languages:
        #translate to new language and back to original
        translated = translator.translate(sequence, dest = random_lang).text
        #translate back to original language
        translated_back = translator.translate(translated, dest = org_lang).text
        #print(translated,translated_back)
        #apply with certain probability
        if np.random.uniform(0, 1) <= PROB:
            output_sequence = translated_back
        else:
            output_sequence = sequence
            
    #if detected language not in our list of languages, do nothing
    else:
        output_sequence = sequence
    
    return output_sequence,lab


def random_deletion(words,lab, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] ,lab
    else:
        return remaining,lab

def random_swap(sentence,lab, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence,lab

In [21]:
def random_pick(train_data):
    num = np.random.randint(0,len(train_data.examples))
    return train_data.examples[num].text,train_data.examples[num].labels

In [32]:
example = [Example.fromlist([df.iloc[i].sentence,df.iloc[i].label], fields) for i in range(df.shape[0])] 

CustomDataset = data.Dataset(example, fields)

(train_data, valid_data) = CustomDataset.split(split_ratio=[0.85, 0.15], random_state=random.seed(SEED))

In [23]:
pbar = tqdm(range(0,1000))
count = len(train_data.examples)
print (f'Before the shape was :{len(train_data.examples)}' )

aug_data = []
aug_label = []
for i in pbar:
    
    word,val = random_pick(train_data)
    word_l = ' '.join(i for i in word)
    word1,val1 = back_translate(word_l,val)
    word1 = word1.split(' ')
    
    aug_data.append(word1)
    aug_label.append(val1)

    word,val = random_pick(train_data)
    word2,val2 = random_deletion(word,val)
    
    aug_data.append(word2)
    aug_label.append(val2)
    
    word,val = random_pick(train_data)
    word3,val3 = random_swap(word,val)
    
    aug_data.append(word3)
    aug_label.append(val3)
    
#     ins = {'sentence':[word1,word2,word3],'label':[val,val,val]}
#     df2 = pd.concat([df2,pd.DataFrame(ins)])
    
pbar.set_description(desc = f'Loop:{i}')
    

  0%|          | 0/1000 [00:00<?, ?it/s]

Before the shape was :9593


100%|██████████| 1000/1000 [42:07<00:00,  2.53s/it]


In [24]:
train_data.examples[0].labels

'Class2'

In [38]:
len(train_data.examples)

9593

In [39]:
 train_data.examples.extend({'text':d,'labels':l})

In [40]:
len(train_data.examples)

9595

In [51]:
train_data.examples[9592].text

['Since',
 'Dahmer',
 'resorts',
 'to',
 'standard',
 'slasher',
 'flick',
 'thrills',
 'when',
 'it',
 'should',
 'be',
 'most',
 'in',
 'the',
 'mind',
 'of',
 'the',
 'killer',
 ',',
 'it',
 'misses',
 'a',
 'major',
 'opportunity',
 'to',
 'be',
 'truly',
 'revelatory',
 'about',
 'his',
 'psyche',
 '.']

In [43]:
d
l

['`',
 'In',
 'this',
 'poor',
 'remake',
 'of',
 'such',
 'a',
 'well',
 'loved',
 'classic',
 ',',
 'Parker',
 'exposes',
 'the',
 'limitations',
 'of',
 'his',
 'skill',
 'and',
 'the',
 'basic',
 'flaws',
 'in',
 'his',
 'vision',
 '.',
 "'"]

'Class1'

In [37]:
count = 9593
for d,l in zip(aug_data,aug_label):
#     train_data.examples[count].text = d
#     train_data.examples[count]. = l
    break
    count+= 1

In [None]:
count

In [None]:
len(train_data.examples[])

In [None]:
(len(train_data), len(valid_data))

In [None]:
tra

In [None]:
vars(train_data.examples[10])

### Visualizing the data

In [None]:
train_data.examples[0].labels

In [None]:
line = ''
filenumber = np.random.randint(1000)

for i in train_data.examples[filenumber].text:
    line += i + ' '
line

train_data.examples[filenumber].labels

###  Build vocabulary for source and target from training data
 

In [None]:
train_data.fields

In [None]:
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [None]:
print('Size of input vocab : ', len(TEXT.vocab))
print('Size of label vocab : ', len(LABEL.vocab))
print('Top 10 words appreared repeatedly :', list(TEXT.vocab.freqs.most_common(10)))
print('Labels : ', LABEL.vocab.stoi)

In [None]:
df.label.value_counts()

### train and test iteartor

In [None]:
BATCH_SIZE = 128

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True, # necessary for packed_padded_sequence
    device = device)

In [None]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(TEXT.vocab.stoi, tokens)

### Creating Model

In [None]:
# Model class
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self, input_dim, output_dim,emb_dim, hidden_dim, n_layers, dropout):
        # input_dim <--- vocabulary size
        # output_dim <--- len ([positive, negative]) == 2 
        # emb_dim <--- embedding dimension of embedding matrix

        super(Model, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout,batch_first=True)

        self.fc1 = nn.Linear(hidden_dim, 512)
        self.fc2 = nn.Linear(512, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src,Len):
        # shape: [source_len, batch_size]
        embedded = self.dropout(self.embedding(src)) # shape: [src_len, batch_size, embed_dim]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, Len.to('cpu'),batch_first=True)
        output, (hidden, cell) = self.encoder(packed) 
        # output shape -> [batch, hidden_dim]
        # hiddden shape -> [n_layers, batch, hidden_dim]
        # cell shape -> [n_layers, batch, hidden_dim]
        output = self.fc2(self.fc1(hidden))
        output = F.softmax(output, dim=1)
        return output.squeeze(0)

In [None]:
#initializing variables and hyper parameters
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

EMBEDDING_DIM = 128
HIDDEN_DIM = 256

N_LAYERS = 1
DROPOUT = 0.4

# initializing our model
model = Model(INPUT_DIM, OUTPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT).to(device)



In [None]:
train_loss = []
train_accuracy = []
test_loss = []
test_accuracy = []
validation_loss = []
validation_accuracy = []

In [None]:
# loop and train our model
optimizer = optim.Adam(model.parameters(), lr=1e-2)

# defining learnig rate scheduler (optional)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)

criterion = nn.CrossEntropyLoss()


### Model training function

In [None]:

def train(EPOCH,model, iterator, optimizer=optimizer, criterion=criterion, clip=1,):
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_count = 0
    pbar = tqdm(iterator)
    for i, batch in enumerate(pbar):
        src,data_len = batch.text
        src = src.to(device)
        trg = batch.labels.to(device)
        trg = trg.long()
        optimizer.zero_grad()
        output = model(src,data_len)
        
        total_correct += torch.sum(torch.eq(output.argmax(1), trg))
        total_count+=len(trg)
        
        loss = criterion(output, trg)
        
        loss.backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        pbar.set_description(desc= f'Epoch {EPOCH} Train data Batch No : {i} Loss : {loss.item():.3f} Accuracy : {total_correct/total_count * 100 :.2f}% ' )
    
    train_accuracy.append(total_correct/total_count)
    mean_loss = epoch_loss / len(iterator)
    train_loss.append(mean_loss)
    
    scheduler.step(mean_loss)


### Model Validation function

In [None]:

def evaluate(EPOCH,model, iterator, criterion,typ_loader):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    pbar  = tqdm(iterator)
    with torch.no_grad():
        
        for i,batch in enumerate(pbar):
            src,data_len = batch.text
            src = src.to(device)
            trg = batch.labels.to(device)
            trg = trg.long()
            predictions = model(src,data_len)
            
            loss = criterion(predictions, trg)
            
            acc = binary_accuracy(predictions, trg)

            epoch_loss += loss.item()
            epoch_acc += acc
            if typ_loader == 'Valid data':
                validation_loss.append(loss)
                validation_accuracy.append(acc)
            elif typ_loader == 'Test data':
                test_loss.append(loss)
                test_accuracy.append(acc)
            pbar.set_description(desc= f'Epoch {EPOCH} {typ_loader} Batch No : {i} Loss : {loss.item():.3f} | Accuracy : {epoch_acc / len(iterator)* 100 :.2f}% ' )

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds.argmax(1) == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc.item()

In [None]:
total_epoch = 100
for epoch in range(total_epoch):
    result = train(epoch,model=model, iterator=train_iterator)
    evaluate(epoch,model,valid_iterator,criterion,'Valid data')
#     evaluate(epoch,model,test_iterator,criterion,'Test data')

In [None]:
predict('Very good') # predict funciton will predict if this is positive or negative review.

# predict('i recommend to watch the movie once. It is mindblowing') # predict funciton will predict if this is positive or negative review.

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_accuracy,'green')
plt.title('train_acc')

In [None]:
plt.plot(train_loss,'green')
plt.title('train_loss')

In [None]:
plt.plot(test_accuracy,'red')
plt.title('test_acc')

In [None]:
plt.plot(test_loss,'red')
plt.title('test_loss')

In [None]:
plt.plot(validation_accuracy,'blue')
plt.title('test_acc')

In [None]:
plt.plot(validation_loss,'blue')
plt.title('test_loss')