In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [3]:
!ls /content/imdb_data/

imdb


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import IMDB
from torchtext.legacy.data import Field, LabelField, BucketIterator

In [5]:
from tqdm import tqdm
import random
import sys

In [6]:
f'Torch CUDA Version :{torch.version.cuda}'
f'Torch Version :{torch.__version__}'
f'Python Version :{sys.version}'

'Torch CUDA Version :10.1'

'Torch Version :1.8.1+cu101'

'Python Version :3.7.10 (default, May  3 2021, 02:48:31) \n[GCC 7.5.0]'

In [7]:
def gpu_check(seed_val = 1):
    print('The Seed is set to {}'.format(seed_val))
    if torch.cuda.is_available():
        print('Model will Run on CUDA.')
        print ("Type 'watch nvidia-smi' to monitor GPU\n")
        torch.cuda.manual_seed(seed_val)
        device = 'cuda'
    else:
        torch.manual_seed(seed_val)
        print ('Running in CPU')
        device = 'cpu'
    cuda = torch.cuda.is_available()
    return cuda,seed_val,device

In [8]:
cuda,SEED,device = gpu_check(seed_val=1234)

The Seed is set to 1234
Model will Run on CUDA.
Type 'watch nvidia-smi' to monitor GPU



In [9]:
!nvidia-smi

Sun May 23 14:47:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    43W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
def tokenize(s):
    return s.split(' ')

In [11]:
# TEXT = Field(tokenize = 'spacy',tokenizer_language = 'en_core_web_sm', lower = True)
TEXT = Field(tokenize = tokenize,tokenizer_language = 'en_core_web_sm', lower = True)
LABEL = LabelField(dtype = torch.float)

In [12]:
train_data, test_data = IMDB.splits(TEXT, LABEL,root ='/content/imdb_data/')

In [13]:
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [14]:
# display lenght of test and traing data
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 17500
Number of testing examples: 25000


In [15]:
# display single example at index 0
print(vars(train_data.examples[0]))

{'text': ['wow.', 'i', 'went', 'to', 'the', 'video', 'store', 'tonight', 'because', 'i', 'was', 'in', 'the', 'mood', 'for', 'a', 'bad', 'b', 'horror', 'movie', 'and', 'i', 'found', 'this', 'gem.', 'i', 'looked', 'at', 'the', 'cover', 'and', 'i', 'thought', 'it', 'looked', 'like', 'just', 'the', 'movie', 'for', 'my', 'mood.', 'i', 'brought', 'it', 'home', 'and', 'put', 'it', 'on.<br', '/><br', '/>this', 'movie', 'was', 'not', 'the', 'b', 'horror', 'movie', 'that', 'i', 'had', 'in', 'mind.', 'this', 'was', 'much', 'worse.', 'i', 'wanted', 'a', 'bad', 'movie', 'but', 'what', 'i', 'got,', 'i', "didn't", 'know', 'that', 'crap', 'like', 'this', 'existed', 'amongst', 'man.', 'this', 'movie', 'seemed', 'like', 'a', '5', 'year', 'old', 'wrote', 'and', 'directed', 'it', 'and', 'that', 'is', 'being', 'nice', 'about', 'it.<br', '/><br', '/>i', 'am', 'an', 'aspiring', 'director', 'and', 'this', 'movie', 'made', 'me', 'so', 'mad', 'that', 'someone', 'out', 'there', 'is', 'actually', 'paying', 'this'

In [16]:
line = ''
filenumber = 11

for i in train_data.examples[filenumber].text:
    line += i + ' '
line

train_data.examples[filenumber].label

"kureishi hasn't exactly been blessed with movies that justify the quality of his writing. recent adapted travesty's like 'intimacy' have ruined great writing. but the mother surpasses all his previous incarnations, eclipsing even my beautiful laundrette. a middle-aged woman overcomes widow-hood by having a very carnal relationship with the boyfriend of her emotionally-weak daughter. the fact that you believe all this is credit to the quality of the acting as it is to the finite gift of the writing. and in daniel craig we have a strutting, brash, gruff anti-hero who denies the audience to ever question why a young stud would contemplate bedding a sagging grandmother. beautifully shot, the film fails only in the weak depiction of the peripheral characters, but as a study of inconceivable lust, it's a winner. "

'pos'

In [17]:
# Build vocabulary for source and target from training data

TEXT.build_vocab(train_data, max_size=25_000)  # using pretrained word embedding
LABEL.build_vocab(train_data)


In [18]:

# print(vars(TEXT.vocab))
print(f"Unique tokens in source vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in TRG vocabulary: {len(LABEL.vocab)}")

Unique tokens in source vocabulary: 25002
Unique tokens in TRG vocabulary: 2


In [19]:

BATCH_SIZE = 32

# train and test iteartor
train_iterator,valid_iterator ,test_iterator = BucketIterator.splits(
      (train_data, valid_data,test_data), 
      batch_size = BATCH_SIZE, 
      device = device
    )

In [20]:
# Model class
class Model(nn.Module):
    def __init__(self, input_dim, output_dim,emb_dim, hidden_dim, n_layers, dropout):
        # input_dim <--- vocabulary size
        # output_dim <--- len ([positive, negative]) == 2 
        # emb_dim <--- embedding dimension of embedding matrix

        super(Model, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)

#         self.fc1 = nn.Linear(hidden_dim, hidden_dim//2)
#         self.fc2 = nn.Linear(hidden_dim//2, output_dim)
        self.fc1 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # shape: [source_len, batch_size]
        embedded = self.dropout(self.embedding(src)) # shape: [src_len, batch_size, embed_dim]
        output, (hidden, cell) = self.rnn(embedded) 
        # output shape -> [batch, hidden_dim]
        # hiddden shape -> [n_layers, batch, hidden_dim]
        # cell shape -> [n_layers, batch, hidden_dim]
        output = self.fc1(output[-1])
#         output = self.fc2(self.relu(output))
        return output

In [21]:
#initializing variables and hyper parameters
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

EMBEDDING_DIM = 100
DEC_EMB_DIM = 100
HIDDEN_DIM = 256

N_LAYERS = 1
DROPOUT = 0.6

# initializing our model
model = Model(INPUT_DIM, OUTPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT).to(device)



  "num_layers={}".format(dropout, num_layers))


In [22]:
train_loss = []
train_accuracy = []

In [23]:
# loop and train our model
optimizer = optim.Adam(model.parameters(), lr=3e-3)

# defining learnig rate scheduler (optional)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

criterion = nn.CrossEntropyLoss()


In [24]:

# Model training function
def train(EPOCH,model, iterator, optimizer=optimizer, criterion=criterion, clip=1,):
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_count = 0
    pbar = tqdm(iterator)
    for i, batch in enumerate(pbar):
        src = batch.text.to(device)
        trg = batch.label.to(device)
        trg = trg.long()
        optimizer.zero_grad()
        output = model(src)
        
        total_correct += torch.sum(torch.eq(output.argmax(1), trg))
        total_count+=len(trg)
        
        loss = criterion(output, trg)
        
        loss.backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        pbar.set_description(desc= f'Epoch {EPOCH} Train data Batch No : {i} Loss : {loss.item():.3f} Accuracy : {total_correct/total_count * 100 :.2f}% ' )
    
    train_accuracy.append(total_correct/total_count)
    mean_loss = epoch_loss / len(iterator)
    train_loss.append(mean_loss)
    
    scheduler.step(mean_loss)
    

In [25]:
def evaluate(EPOCH,model, iterator, criterion,typ_loader):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    pbar  = tqdm(iterator)
    with torch.no_grad():
        
        for i,batch in enumerate(pbar):
            src = batch.text.to(device)
            trg = batch.label.to(device)
            trg = trg.long()
            predictions = model(src)
            
            loss = criterion(predictions, trg)
            
            acc = binary_accuracy(predictions, trg)

            epoch_loss += loss.item()
            epoch_acc += acc
            pbar.set_description(desc= f'Epoch {EPOCH} {typ_loader} Batch No : {i} Loss : {loss.item():.3f} Accuracy : {epoch_acc / len(iterator)* 100 :.2f}% ' )

In [26]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds.argmax(1) == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc.item()

In [27]:

total_epoch = 5
for epoch in range(total_epoch):
    result = train(epoch,model=model, iterator=train_iterator)
    evaluate(epoch,model,valid_iterator,criterion,'Valid data')
    evaluate(epoch,model,test_iterator,criterion,'Test data')
    

Epoch 0 Train data Batch No : 546 Loss : 0.701 Accuracy : 49.77% : 100%|██████████| 547/547 [00:37<00:00, 14.55it/s]
Epoch 0 Valid data Batch No : 234 Loss : 0.692 Accuracy : 50.31% : 100%|██████████| 235/235 [00:02<00:00, 86.82it/s]
Epoch 0 Test data Batch No : 781 Loss : 0.678 Accuracy : 37.39% : 100%|██████████| 782/782 [00:09<00:00, 86.53it/s]
Epoch 1 Train data Batch No : 546 Loss : 0.705 Accuracy : 49.93% : 100%|██████████| 547/547 [00:37<00:00, 14.44it/s]
Epoch 1 Valid data Batch No : 234 Loss : 0.686 Accuracy : 49.99% : 100%|██████████| 235/235 [00:02<00:00, 89.57it/s]
Epoch 1 Test data Batch No : 781 Loss : 0.704 Accuracy : 57.17% : 100%|██████████| 782/782 [00:08<00:00, 88.91it/s]
Epoch 2 Train data Batch No : 546 Loss : 0.718 Accuracy : 50.41% : 100%|██████████| 547/547 [00:37<00:00, 14.41it/s]
Epoch 2 Valid data Batch No : 234 Loss : 0.692 Accuracy : 51.52% : 100%|██████████| 235/235 [00:02<00:00, 89.56it/s]
Epoch 2 Test data Batch No : 781 Loss : 0.629 Accuracy : 46.12% : 

In [30]:
# function to experiment movie review sentences
import spacy
sp = spacy.load('en_core_web_sm')


def predict(sentence):
    if type(sentence) == str:
        tokanized_sentence = [word.text for word in sp.tokenizer(sentence)]
    else:
        tokanized_sentence = sentence


    input_data = [TEXT.vocab.stoi[word.lower()] for word in tokanized_sentence]
    input_data = torch.tensor(input_data, dtype=torch.int64).unsqueeze(1).to(device)


    model.eval()
    output = model(input_data)
    # print(output)
    predict = output.argmax(1)
    predict = predict.squeeze(0)
    print(output)

    if predict>0:
        return "---->> Positive Review"
    else:
        return '---->> Negative Review'

In [31]:
predict('Very bad') # predict funciton will predict if this is positive or negative review.

tensor([[ 0.9696, -0.5756]], device='cuda:0', grad_fn=<AddmmBackward>)


'---->> Negative Review'

In [32]:
predict('Very good') # predict funciton will predict if this is positive or negative review.

tensor([[0.0022, 0.0456]], device='cuda:0', grad_fn=<AddmmBackward>)


'---->> Positive Review'

In [34]:
predict('i recommend to watch the movie once. It is mindblowing') # predict funciton will predict if this is positive or negative review.

tensor([[ 0.2117, -0.0614]], device='cuda:0', grad_fn=<AddmmBackward>)


'---->> Negative Review'