### Importing Important Library

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy.datasets import IMDB
from torchtext.legacy.data import Field, LabelField, BucketIterator,Example

In [3]:
from tqdm import tqdm
import random
import sys
import pandas as pd
import numpy as np
import os
import random
import torch, torchtext
import os
import googletrans
from googletrans import Translator

### Version chcek

In [4]:
f'Torch CUDA Version :{torch.version.cuda}'
f'Torch Version :{torch.__version__}'
f'Python Version :{sys.version}'

'Torch CUDA Version :10.2'

'Torch Version :1.8.1'

'Python Version :3.8.10 (default, May 19 2021, 18:05:58) \n[GCC 7.3.0]'

### GPU Checker 

In [5]:
def gpu_check(seed_val = 1):
    print('The Seed is set to {}'.format(seed_val))
    if torch.cuda.is_available():
        print('Model will Run on CUDA.')
        print ("Type 'watch nvidia-smi' to monitor GPU\n")
        torch.cuda.manual_seed(seed_val)
        device = 'cuda'
    else:
        torch.manual_seed(seed_val)
        print ('Running in CPU')
        device = 'cpu'
    cuda = torch.cuda.is_available()
    return cuda,seed_val,device

In [6]:
cuda,SEED,device = gpu_check(seed_val=1234)

The Seed is set to 1234
Running in CPU


In [7]:
def get_merged_dataset(sst_dir):
    sentiment_labels = pd.read_csv(os.path.join(sst_dir, "sentiment_labels.txt"), sep="|")
    sentence_ids = pd.read_csv(os.path.join(sst_dir, "datasetSentences.txt"), sep="\t")
    dictionary = pd.read_csv(os.path.join(sst_dir, "dictionary.txt"), sep="|", names=['phrase', 'phrase ids'])
    train_test_split = pd.read_csv(os.path.join(sst_dir, "datasetSplit.txt"))
    sentence_phrase_merge = pd.merge(sentence_ids, dictionary, left_on='sentence', right_on='phrase')
    sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on='sentence_index')
    return pd.merge(sentence_phrase_split, sentiment_labels, on='phrase ids').sample(frac=1)

def discretize_label(label):
    if label <= 0.2: return 'Class1'
    if label <= 0.4: return 'Class2'
    if label <= 0.6: return 'Class3'
    if label <= 0.8: return 'Class4'
    return 'Class5'

In [8]:
sst_dir = 'stanfordSentimentTreebank/'
df = get_merged_dataset(sst_dir)

In [9]:
df.head()

Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values
10811,11355,And that should tell you everything you need t...,And that should tell you everything you need t...,222377,1,0.41667
3371,3529,` Moore is like a progressive bull in a china ...,` Moore is like a progressive bull in a china ...,71201,1,0.34722
11237,11801,Grating and tedious .,Grating and tedious .,223531,1,0.36111
10247,10750,Dismally dull sci-fi comedy .,Dismally dull sci-fi comedy .,183481,1,0.083333
8390,8785,While Super Troopers is above Academy standard...,While Super Troopers is above Academy standard...,150910,2,0.26389


In [10]:
df['label'] = df['sentiment values'].apply(discretize_label)

In [11]:
df.shape
df.label.value_counts()

(11286, 7)

Class2    2971
Class4    2966
Class3    2144
Class5    1773
Class1    1432
Name: label, dtype: int64

In [12]:
df_data = df[['sentence','label']]

In [13]:
df = df_data.reset_index(drop = True)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train,test = train_test_split(df,train_size=.80)

In [17]:
df.shape

(11286, 2)

In [18]:
df.sentence[2733]
df.label[2733]

'Dense and enigmatic ... elusive ... stagy and stilted'

'Class3'

In [19]:
df.head()

Unnamed: 0,sentence,label
0,And that should tell you everything you need t...,Class3
1,` Moore is like a progressive bull in a china ...,Class2
2,Grating and tedious .,Class2
3,Dismally dull sci-fi comedy .,Class1
4,While Super Troopers is above Academy standard...,Class2


In [20]:
df.head(2)

Unnamed: 0,sentence,label
0,And that should tell you everything you need t...,Class3
1,` Moore is like a progressive bull in a china ...,Class2


In [21]:
TEXT = Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
LABEL = LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)



In [22]:
fields = [('text', TEXT),('labels',LABEL)]

In [23]:
def back_translate(sequence,lab, PROB = 1):
    languages = ['en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el']
    
    #instantiate translator
    translator = Translator()
    
    #store original language so we can convert back
    org_lang = translator.detect(sequence).lang
    
    #randomly choose language to translate sequence to  
    random_lang = np.random.choice([lang for lang in languages if lang is not org_lang])
    #print(random_lang)
    if org_lang in languages:
        #translate to new language and back to original
        translated = translator.translate(sequence, dest = random_lang).text
        #translate back to original language
        translated_back = translator.translate(translated, dest = org_lang).text
        #print(translated,translated_back)
        #apply with certain probability
        if np.random.uniform(0, 1) <= PROB:
            output_sequence = translated_back
        else:
            output_sequence = sequence
            
    #if detected language not in our list of languages, do nothing
    else:
        output_sequence = sequence
    
    return output_sequence,lab


def random_deletion(words,lab, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] ,lab
    else:
        return remaining,lab

def random_swap(sentence,lab, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence,lab

In [24]:
def random_pick(train_data):
    num = np.random.randint(0,len(train_data.examples))
    return train_data.examples[num].text,train_data.examples[num].labels

In [25]:
example = [Example.fromlist([df.iloc[i].sentence,df.iloc[i].label], fields) for i in range(df.shape[0])] 

CustomDataset = data.Dataset(example, fields)

(train_data, valid_data) = CustomDataset.split(split_ratio=[0.85, 0.15], random_state=random.seed(SEED))

In [26]:
train_data.examples[0].labels

'Class2'

In [27]:
len(train_data.examples)

9593

In [28]:
train_data.examples[0].text
train_data.examples[0].labels

['Ray',
 'Liotta',
 'and',
 'Jason',
 'Patric',
 'do',
 'some',
 'of',
 'their',
 'best',
 'work',
 'in',
 'their',
 'underwritten',
 'roles',
 ',',
 'but',
 'do',
 "n't",
 'be',
 'fooled',
 ':',
 'Nobody',
 'deserves',
 'any',
 'prizes',
 'here',
 '.']

'Class2'

### Visualizing the data

In [31]:
train_data.examples[0].labels

'Class2'

In [32]:
line = ''
filenumber = np.random.randint(1000)

for i in train_data.examples[filenumber].text:
    line += i + ' '
line

train_data.examples[filenumber].labels

'Very much a home video , and so devoid of artifice and purpose that it appears not to have been edited at all . '

'Class1'

###  Build vocabulary for source and target from training data
 

In [33]:
train_data.fields

{'text': <torchtext.legacy.data.field.Field at 0x7fd0770008b0>,
 'labels': <torchtext.legacy.data.field.LabelField at 0x7fd077000910>}

In [34]:
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [35]:
print('Size of input vocab : ', len(TEXT.vocab))
print('Size of label vocab : ', len(LABEL.vocab))
print('Top 10 words appreared repeatedly :', list(TEXT.vocab.freqs.most_common(10)))
print('Labels : ', LABEL.vocab.stoi)

Size of input vocab :  17994
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 9029), (',', 7943), ('the', 6812), ('of', 4948), ('and', 4899), ('a', 4895), ('to', 3392), ('-', 3138), ('is', 2809), ("'s", 2805)]
Labels :  defaultdict(None, {'Class2': 0, 'Class4': 1, 'Class3': 2, 'Class5': 3, 'Class1': 4})


In [36]:
df.label.value_counts()

Class2    2971
Class4    2966
Class3    2144
Class5    1773
Class1    1432
Name: label, dtype: int64

### train and test iteartor

In [37]:
BATCH_SIZE = 128

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True, # necessary for packed_padded_sequence
    device = device)

In [38]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(TEXT.vocab.stoi, tokens)

### Creating Model

In [39]:
# Model class
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self, input_dim, output_dim,emb_dim, hidden_dim, n_layers, dropout):
        # input_dim <--- vocabulary size
        # output_dim <--- len ([positive, negative]) == 2 
        # emb_dim <--- embedding dimension of embedding matrix

        super(Model, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout,batch_first=True)

        self.fc1 = nn.Linear(hidden_dim, 512)
        self.fc2 = nn.Linear(512, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src,Len):
        # shape: [source_len, batch_size]
        embedded = self.dropout(self.embedding(src)) # shape: [src_len, batch_size, embed_dim]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, Len.to('cpu'),batch_first=True)
        output, (hidden, cell) = self.encoder(packed) 
        # output shape -> [batch, hidden_dim]
        # hiddden shape -> [n_layers, batch, hidden_dim]
        # cell shape -> [n_layers, batch, hidden_dim]
        output = self.fc2(self.fc1(hidden))
        output = F.softmax(output, dim=1)
        return output.squeeze(0)

In [40]:
#initializing variables and hyper parameters
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

EMBEDDING_DIM = 128
HIDDEN_DIM = 256

N_LAYERS = 1
DROPOUT = 0.4

# initializing our model
model = Model(INPUT_DIM, OUTPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT).to(device)





In [41]:
train_loss = []
train_accuracy = []
test_loss = []
test_accuracy = []
validation_loss = []
validation_accuracy = []

In [42]:
# loop and train our model
optimizer = optim.Adam(model.parameters(), lr=1e-2)

# defining learnig rate scheduler (optional)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)

criterion = nn.CrossEntropyLoss()


### Model training function

In [43]:

def train(EPOCH,model, iterator, optimizer=optimizer, criterion=criterion, clip=1,):
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_count = 0
    pbar = tqdm(iterator)
    for i, batch in enumerate(pbar):
        src,data_len = batch.text
        src = src.to(device)
        trg = batch.labels.to(device)
        trg = trg.long()
        optimizer.zero_grad()
        output = model(src,data_len)
        
        total_correct += torch.sum(torch.eq(output.argmax(1), trg))
        total_count+=len(trg)
        
        loss = criterion(output, trg)
        
        loss.backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        pbar.set_description(desc= f'Epoch {EPOCH} Train data Batch No : {i} Loss : {loss.item():.3f} Accuracy : {total_correct/total_count * 100 :.2f}% ' )
    
    train_accuracy.append(total_correct/total_count)
    mean_loss = epoch_loss / len(iterator)
    train_loss.append(mean_loss)
    
    scheduler.step(mean_loss)


### Model Validation function

In [None]:
git config --global user.email "you@example.com"
git config --global user.name "Your Name"

In [44]:

def evaluate(EPOCH,model, iterator, criterion,typ_loader):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    pbar  = tqdm(iterator)
    with torch.no_grad():
        
        for i,batch in enumerate(pbar):
            src,data_len = batch.text
            src = src.to(device)
            trg = batch.labels.to(device)
            trg = trg.long()
            predictions = model(src,data_len)
            
            loss = criterion(predictions, trg)
            
            acc = binary_accuracy(predictions, trg)

            epoch_loss += loss.item()
            epoch_acc += acc
            if typ_loader == 'Valid data':
                validation_loss.append(loss)
                validation_accuracy.append(acc)
            elif typ_loader == 'Test data':
                test_loss.append(loss)
                test_accuracy.append(acc)
            pbar.set_description(desc= f'Epoch {EPOCH} {typ_loader} Batch No : {i} Loss : {loss.item():.3f} | Accuracy : {epoch_acc / len(iterator)* 100 :.2f}% ' )

In [45]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds.argmax(1) == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc.item()

In [None]:
total_epoch = 100
for epoch in range(total_epoch):
    result = train(epoch,model=model, iterator=train_iterator)
    evaluate(epoch,model,valid_iterator,criterion,'Valid data')
#     evaluate(epoch,model,test_iterator,criterion,'Test data')

Epoch 0 Train data Batch No : 74 Loss : 1.610 Accuracy : 14.65% : 100%|██████████| 75/75 [00:18<00:00,  4.13it/s]
Epoch 0 Valid data Batch No : 13 Loss : 1.614 | Accuracy : 23.12% : 100%|██████████| 14/14 [00:00<00:00, 29.58it/s]
Epoch 1 Train data Batch No : 74 Loss : 1.605 Accuracy : 17.79% : 100%|██████████| 75/75 [00:18<00:00,  4.10it/s]
Epoch 1 Valid data Batch No : 13 Loss : 1.609 | Accuracy : 22.79% : 100%|██████████| 14/14 [00:00<00:00, 31.90it/s]
Epoch 2 Train data Batch No : 74 Loss : 1.607 Accuracy : 21.80% : 100%|██████████| 75/75 [00:35<00:00,  2.14it/s]
Epoch 2 Valid data Batch No : 13 Loss : 1.581 | Accuracy : 26.67% : 100%|██████████| 14/14 [00:00<00:00, 28.22it/s]
Epoch 3 Train data Batch No : 74 Loss : 1.611 Accuracy : 26.04% : 100%|██████████| 75/75 [00:22<00:00,  3.38it/s]
Epoch 3 Valid data Batch No : 13 Loss : 1.601 | Accuracy : 26.78% : 100%|██████████| 14/14 [00:00<00:00, 28.90it/s]
Epoch 4 Train data Batch No : 74 Loss : 1.610 Accuracy : 24.57% : 100%|█████████

Epoch     7: reducing learning rate of group 0 to 1.0000e-03.


Epoch 6 Valid data Batch No : 13 Loss : 1.609 | Accuracy : 27.03% : 100%|██████████| 14/14 [00:00<00:00, 32.64it/s]
Epoch 7 Train data Batch No : 74 Loss : 1.609 Accuracy : 21.92% : 100%|██████████| 75/75 [00:41<00:00,  1.80it/s]
Epoch 7 Valid data Batch No : 13 Loss : 1.610 | Accuracy : 26.67% : 100%|██████████| 14/14 [00:00<00:00, 30.07it/s]
Epoch 8 Train data Batch No : 74 Loss : 1.611 Accuracy : 21.51% : 100%|██████████| 75/75 [00:43<00:00,  1.74it/s]
Epoch 8 Valid data Batch No : 13 Loss : 1.615 | Accuracy : 26.39% : 100%|██████████| 14/14 [00:00<00:00, 30.06it/s]
Epoch 9 Train data Batch No : 74 Loss : 1.610 Accuracy : 21.67% : 100%|██████████| 75/75 [00:43<00:00,  1.71it/s]
Epoch 9 Valid data Batch No : 13 Loss : 1.651 | Accuracy : 26.72% : 100%|██████████| 14/14 [00:00<00:00, 30.85it/s]
Epoch 10 Train data Batch No : 74 Loss : 1.611 Accuracy : 21.69% : 100%|██████████| 75/75 [00:42<00:00,  1.78it/s]
Epoch 10 Valid data Batch No : 13 Loss : 1.639 | Accuracy : 26.91% : 100%|█████

In [None]:
predict('Very good') # predict funciton will predict if this is positive or negative review.

# predict('i recommend to watch the movie once. It is mindblowing') # predict funciton will predict if this is positive or negative review.

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_accuracy,'green')
plt.title('train_acc')

In [None]:
plt.plot(train_loss,'green')
plt.title('train_loss')

In [None]:
plt.plot(test_accuracy,'red')
plt.title('test_acc')

In [None]:
plt.plot(test_loss,'red')
plt.title('test_loss')

In [None]:
plt.plot(validation_accuracy,'blue')
plt.title('test_acc')

In [None]:
plt.plot(validation_loss,'blue')
plt.title('test_loss')