In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext.legacy.datasets import IMDB
from torchtext.legacy.data import Field, LabelField, BucketIterator

In [3]:
from tqdm import tqdm
import random
import sys


In [4]:
# Show the CUDA build, PyTorch and Python versions this notebook runs against.
'Torch CUDA Version :{}'.format(torch.version.cuda)
'Torch Version :{}'.format(torch.__version__)
'Python Version :{}'.format(sys.version)

'Torch CUDA Version :10.2'

'Torch Version :1.8.1'

'Python Version :3.8.8 (default, Feb 24 2021, 21:46:12) \n[GCC 7.3.0]'

In [5]:
def gpu_check(seed_val = 1):
    """Seed PyTorch's random number generators and pick the compute device.

    Args:
        seed_val: Integer seed applied to the default (CPU) generator and,
            when CUDA is available, to the CUDA generator as well.

    Returns:
        A ``(cuda, seed_val, device)`` tuple: ``cuda`` is True when a CUDA
        device is available, ``seed_val`` is echoed back unchanged, and
        ``device`` is the string ``'cuda'`` or ``'cpu'``.
    """
    print('The Seed is set to {}'.format(seed_val))
    cuda = torch.cuda.is_available()
    # Always seed the default generator: the model is built on the CPU and
    # only then moved with .to(device), so seeding CUDA alone leaves the
    # weight initialisation unseeded.
    torch.manual_seed(seed_val)
    if cuda:
        print('Model will Run on CUDA.')
        print ("Type 'watch nvidia-smi' to monitor GPU\n")
        torch.cuda.manual_seed(seed_val)
        device = 'cuda'
    else:
        print ('Running in CPU')
        device = 'cpu'
    return cuda,seed_val,device

In [6]:
# Seed the generators once and remember which device tensors should live on.
cuda, SEED, device = gpu_check(seed_val=1234)

The Seed is set to 1234
Model will Run on CUDA.
Type 'watch nvidia-smi' to monitor GPU



In [7]:
# Shell out to nvidia-smi to show the GPU's current utilisation and memory use.
!nvidia-smi

Sun May 23 20:40:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 960M    Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P0    N/A /  N/A |   2447MiB /  4046MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
import json

In [9]:
# Review text: tokenized with spaCy's English model and lower-cased.
# `tokenize='spacy'` is required for `tokenizer_language` to take effect;
# without it the Field falls back to plain str.split, which would also
# disagree with the spaCy tokenizer used by predict().
TEXT = Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm', lower = True)
# Sentiment label, stored as a float tensor.
LABEL = LabelField(dtype = torch.float)

# Map each JSON key to (attribute name on the Example, Field that processes it).
fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

In [10]:
# NOTE(review): this repeats the `fields` mapping already defined in the
# previous cell (JSON key -> (Example attribute name, Field)); one of the two
# definitions can be removed.
fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

In [11]:
from torchtext.legacy.data import TabularDataset

In [12]:
# Load the train/test JSON files from the `.data` directory, parsing every
# record through the `fields` mapping.
train_data, test_data = TabularDataset.splits(
    path='.data', train='train.json', test='test.json',
    format='json', fields=fields,
)

In [13]:
# Seed Python's global RNG, then carve a validation set out of the training
# data. random.seed() returns None, so random_state is None here exactly as
# in the nested form; presumably the split then draws from the freshly seeded
# global RNG -- TODO confirm.
# NOTE(review): `train_data` is rebound, so re-running this cell splits the
# already reduced training set again.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=None)

In [15]:
# display the length of the training, validation and test data
for split_name, split in (('training', train_data),
                          ('validation', valid_data),
                          ('testing', test_data)):
    print(f"Number of {split_name} examples: {len(split.examples)}")


Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [None]:
# display the attributes of the single example at index 0
print(train_data.examples[0].__dict__)

In [None]:
filenumber = 11

# Rebuild the review from its tokens: every token is followed by one space.
line = ''.join(token + ' ' for token in train_data.examples[filenumber].text)
line

train_data.examples[filenumber].label

In [None]:
# Build the text and label vocabularies from the training split only

TEXT.build_vocab(train_data, max_size=25_000)  # vocabulary capped via max_size; no pretrained vectors are requested here
LABEL.build_vocab(train_data)


In [None]:

# Report the size of the text ("source") and label ("TRG") vocabularies.
for caption, vocab in (('source', TEXT.vocab), ('TRG', LABEL.vocab)):
    print(f"Unique tokens in {caption} vocabulary: {len(vocab)}")

In [None]:

BATCH_SIZE = 32

# train, validation and test iterators
# No sort_key is defined for these datasets in this notebook, while the
# validation/test iterators sort their examples by default, so a key has to be
# supplied here. Sorting by review length (also inside each training batch)
# groups reviews of similar length and keeps padding small.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda example: len(example.text),
    sort_within_batch = True,
    device = device
)

In [None]:
# Model class
class Model(nn.Module):
    """LSTM classifier: embedding -> LSTM -> linear layer.

    Args:
        input_dim: vocabulary size (number of rows of the embedding matrix).
        output_dim: number of classes to score.
        emb_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between the LSTM layers.
    """

    def __init__(self, input_dim, output_dim,emb_dim, hidden_dim, n_layers, dropout):
        super(Model, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer has no effect and emits a UserWarning,
        # so it is forwarded only when it can actually be used.
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers,
                           dropout=dropout if n_layers > 1 else 0)

        self.fc1 = nn.Linear(hidden_dim, output_dim)
        # Not used by forward(); kept so the module's attributes are unchanged.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Score a batch of token-index sequences.

        Args:
            src: integer tensor of token indices, shape [src_len, batch_size].

        Returns:
            Tensor of unnormalised class scores, shape [batch_size, output_dim].
        """
        embedded = self.dropout(self.embedding(src))  # [src_len, batch_size, emb_dim]
        output, (hidden, cell) = self.rnn(embedded)
        # output: [src_len, batch_size, hidden_dim]  (top layer, every time step)
        # hidden: [n_layers, batch_size, hidden_dim]
        # cell:   [n_layers, batch_size, hidden_dim]
        # Classify from the top layer's output at the last time step.
        # NOTE(review): if batches are padded, the last step of a short
        # sequence is presumably a pad token -- confirm and consider packing.
        return self.fc1(output[-1])

In [None]:
# hyper parameters
INPUT_DIM = len(TEXT.vocab)    # vocabulary size
OUTPUT_DIM = len(LABEL.vocab)  # number of label classes
EMBEDDING_DIM = 100
DEC_EMB_DIM = 100              # NOTE(review): not referenced anywhere else in the visible notebook
HIDDEN_DIM = 256
N_LAYERS = 1
DROPOUT = 0.6

# build the model and move it to the selected device
model = Model(
    input_dim=INPUT_DIM,
    output_dim=OUTPUT_DIM,
    emb_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
).to(device)



In [None]:
# Per-epoch history that train() appends to: mean loss and accuracy.
train_loss, train_accuracy = [], []

In [None]:
# loss, optimizer and (optional) learning-rate scheduler used for training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=3e-3)
# plateau-based scheduler with a patience of 3; it is stepped inside train()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)


In [None]:

# Model training function
def train(model, iterator, optimizer=optimizer, criterion=criterion, clip=1):
    """Train ``model`` for one epoch over ``iterator``.

    Appends the epoch's accuracy and mean loss to the module-level
    ``train_accuracy`` / ``train_loss`` lists and steps the module-level
    ``scheduler`` with the mean loss.

    Args:
        model: network called as ``model(batch.text)``.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer stepped once per batch; the default is the
            module-level ``optimizer`` as it existed when this function was
            defined.
        criterion: loss called as ``criterion(output, labels)``; the default
            is the module-level ``criterion`` at definition time.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``(mean_loss, accuracy)`` for the epoch as plain Python numbers.
    """
    model.train()
    epoch_loss = 0.0
    total_correct = 0
    total_count = 0
    pbar = tqdm(iterator)
    for i, batch in enumerate(pbar):
        src = batch.text.to(device)
        # Labels are cast to integer class indices before the loss is computed.
        trg = batch.label.to(device).long()

        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # .item() converts the count to a Python number so neither the running
        # totals nor the history lists keep device tensors alive.
        total_correct += torch.sum(torch.eq(output.argmax(1), trg)).item()
        total_count += len(trg)
        epoch_loss += loss.item()
        pbar.set_description(desc= f'Train data Batch No : {i} Loss : {loss.item():.3f} Accuracy : {total_correct/total_count * 100 :.2f}% ' )

    # max(..., 1) keeps an empty iterator from raising ZeroDivisionError.
    accuracy = total_correct / max(total_count, 1)
    mean_loss = epoch_loss / max(len(iterator), 1)
    train_accuracy.append(accuracy)
    train_loss.append(mean_loss)

    scheduler.step(mean_loss)
    return mean_loss, accuracy
    

In [None]:
def evaluate(model, iterator, criterion,typ_loader):
    """Evaluate ``model`` on every batch of ``iterator`` without updating it.

    Args:
        model: network called as ``model(batch.text)``.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss called as ``criterion(predictions, labels)``.
        typ_loader: label shown at the start of the progress-bar description.

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch loss and accuracy
        averaged over the batches processed (``(0.0, 0.0)`` if there were
        none).
    """
    epoch_loss = 0
    epoch_acc = 0
    n_batches = 0

    model.eval()
    pbar  = tqdm(iterator)
    with torch.no_grad():

        for i,batch in enumerate(pbar):
            src = batch.text.to(device)
            trg = batch.label.to(device)
            trg = trg.long()
            predictions = model(src)

            loss = criterion(predictions, trg)

            acc = binary_accuracy(predictions, trg)

            epoch_loss += loss.item()
            epoch_acc += acc
            n_batches = i + 1
            # Average over the batches seen so far; dividing by len(iterator)
            # under-reported the accuracy until the final batch.
            pbar.set_description(desc= f'{typ_loader} Batch No : {i} Loss : {loss.item():.3f} Accuracy : {epoch_acc / n_batches * 100 :.2f}% ' )

    if n_batches == 0:
        return 0.0, 0.0
    return epoch_loss / n_batches, epoch_acc / n_batches

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: tensor of class scores, shape [batch_size, n_classes].
        y: tensor of integer class indices, shape [batch_size].

    Returns:
        Fraction of rows whose highest-scoring class equals ``y``, as a
        Python float.
    """
    # Take the arg-max of the raw scores. Rounding sigmoid(preds) first
    # collapses every score to 0 or 1, so a row whose scores share a sign
    # becomes a tie and the arg-max no longer points at the larger score.
    predicted = preds.argmax(1)
    correct = (predicted == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    return acc.item()

In [None]:

total_epoch = 5
for epoch in range(total_epoch):
    # one pass over the training data, then report on the held-out splits
    result = train(model=model, iterator=train_iterator)
    for split_name, split_iterator in (('Valid data', valid_iterator),
                                       ('Test data', test_iterator)):
        evaluate(model, split_iterator, criterion, split_name)
    print(f'Epoch {epoch}')

In [None]:
# function to experiment with movie review sentences
import spacy
sp = spacy.load('en_core_web_sm')


def predict(sentence):
    """Classify a review as positive or negative with the global ``model``.

    Args:
        sentence: a raw review string (tokenized here with spaCy) or an
            iterable of already tokenized words.

    Returns:
        "---->> Positive Review" when the predicted class index is greater
        than 0, otherwise '---->> Negative Review'.

    Raises:
        ValueError: if ``sentence`` contains no tokens.
    """
    if isinstance(sentence, str):
        tokens = [word.text for word in sp.tokenizer(sentence)]
    else:
        tokens = list(sentence)
    if not tokens:
        # Fail with a clear message instead of an indexing error in the model.
        raise ValueError('predict() needs at least one token')

    # Look up vocabulary indices and shape them as [src_len, 1]: one sentence per batch.
    indices = [TEXT.vocab.stoi[word.lower()] for word in tokens]
    input_data = torch.tensor(indices, dtype=torch.int64).unsqueeze(1).to(device)

    model.eval()
    with torch.no_grad():  # inference only, no gradients needed
        output = model(input_data)
    predicted_class = output.argmax(1).squeeze(0)
    print(output)

    # NOTE(review): assumes class index 0 is the negative label -- confirm
    # against LABEL.vocab.stoi.
    if predicted_class>0:
        return "---->> Positive Review"
    else:
        return '---->> Negative Review'

In [None]:
predict('Very bad') # predict() reports whether this is a positive or a negative review.