<a href="https://colab.research.google.com/github/AnudeepReddy-Katta/nlp-journey/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import torch
from torchtext.legacy import data 
# The torchtext package consists of data processing utilities and popular datasets for natural language.

In [35]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic  = True 
# To have reproducability internally

TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm') # Ability to define a preprocessing pipeline

LABEL = data.LabelField(dtype=torch.float)

In [36]:
from torchtext.legacy import datasets
# The raw text iterators for common NLP datasets

In [37]:
train_data, test_data = datasets.IMDB.splits(TEXT,LABEL)

In [38]:
print(len(train_data))
print(len(test_data))

25000
25000


In [39]:
print(vars(train_data.examples[0]))

{'text': ['I', 'recently', 'got', 'the', 'chance', 'to', 'view', '"', 'The', 'Waterdance', '"', ',', 'and', 'quite', 'liked', 'it', '.', 'I', 'do', "n't", 'really', 'understand', 'why', 'its', 'called', 'that', 'as', 'there', 'is', "n't", 'really', 'any', 'dancing', 'going', 'on', 'there', ',', 'except', 'maybe', 'for', 'the', 'dancing', 'at', 'the', 'strip', 'club', 'near', 'the', 'end', '.', 'We', 'are', 'introduced', 'to', 'the', 'main', 'characters', 'throughout', 'the', 'movie', ',', 'invalids', 'in', 'a', 'hospital', '.', 'The', 'story', 'shows', 'a', 'love', 'affair', 'between', 'a', 'physically', 'sisabled', 'guy', 'and', 'a', 'healthy', 'woman', ',', 'which', 'is', 'a', 'very', 'sweet', 'story', '.', 'Unfortunately', ',', 'you', 'do', "n't", 'get', 'to', 'see', 'movies', 'like', 'that', 'today', '.', 'I', 'm', 'not', '"', 'stuck', 'in', 'a', 'time', 'warp', '"', ',', 'i', 'm', 'not', 'saying', 'that', 'everything', 'during', 'the', '80s', 'and', 'early', '90s', 'was', 'better'

In [40]:
# Train-valid data split

import random 

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [41]:
print(len(train_data))
print(len(valid_data))
print(len(test_data))

17500
7500
25000


In [42]:
# Building vocab

MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [43]:
print(len(TEXT.vocab))
print(len(LABEL.vocab))

25002
2


In [44]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 201981), (',', 192480), ('.', 165389), ('and', 109631), ('a', 109152), ('of', 100721), ('to', 93284), ('is', 76498), ('in', 61360), ('I', 54052), ('it', 53299), ('that', 49014), ('"', 43989), ("'s", 43285), ('this', 42329), ('-', 37051), ('/><br', 35577), ('was', 35050), ('as', 30305), ('movie', 29983)]


In [45]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [46]:
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [47]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    # iterator that batches examples of similar lengths together
    (train_data,valid_data,test_data),
    batch_size = BATCH_SIZE,
    device = device
)

In [48]:
import torch.nn as nn

In [94]:
# RNN class

class RNN(nn.Module):

  def __init__(self,num_embeddings,embedding_dim,hidden_dim,output_dim):
    
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings,embedding_dim) 
    self.rnn = nn.RNN(embedding_dim,hidden_dim)
    self.fc = nn.Linear(hidden_dim,output_dim)

  def forward(self,text):

    embedded = self.embedding(text)
    output, hidden = self.rnn(embedded)
    return self.fc(hidden.squeeze(0))

In [95]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [96]:
model

RNN(
  (embedding): Embedding(25002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [97]:
def count_parameters(model):

  return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [98]:
count_parameters(model)

2592105

In [99]:
import torch.optim as optim

In [100]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [101]:
criterion = nn.BCEWithLogitsLoss() # Sigmoid + BCE Loss

In [102]:
model = model.to(device)
criterion = criterion.to(device)

In [103]:
def binary_accuracy(preds,y):
  
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == y).float()
  acc = correct.sum() / len(correct)

  return acc

In [114]:
def train(model, iterator, optimizer, criterion):

  epoch_loss = 0
  epoch_acc = 0

  model.train()

  for batch in iterator:

    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)

    loss = criterion(predictions, batch.label)
    acc = binary_accuracy(predictions, batch.label)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [115]:
def evaluate(model, iterator, criterion):

  epoch_loss = 0
  epoch_acc = 0

  model.eval()

  with torch.no_grad():

    for batch in iterator:

      predictions = model(batch.text).squeeze(1)

      loss = criterion(predictions, batch.label)
      acc = binary_accuracy(predictions, batch.label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [116]:
import time

def epoch_time(start_time, end_time):

  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time/60)
  elapsed_secs = elapsed_time - elapsed_mins

  return elapsed_mins, elapsed_secs

In [117]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

  start_time = time.time()
    
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
      
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:

    best_valid_loss = valid_loss
    torch.save(model.state_dict(),'model.pt')

  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 15m 891.9264266490936s
	Train Loss: 0.696 | Train Acc: 49.61%
	 Val. Loss: 0.695 |  Val. Acc: 48.85%
Epoch: 02 | Epoch Time: 15m 893.2682528495789s
	Train Loss: 0.693 | Train Acc: 50.30%
	 Val. Loss: 0.695 |  Val. Acc: 50.49%
Epoch: 03 | Epoch Time: 15m 893.2306063175201s
	Train Loss: 0.694 | Train Acc: 49.71%
	 Val. Loss: 0.694 |  Val. Acc: 49.63%
Epoch: 04 | Epoch Time: 15m 894.9841191768646s
	Train Loss: 0.693 | Train Acc: 49.89%
	 Val. Loss: 0.694 |  Val. Acc: 49.37%
Epoch: 05 | Epoch Time: 15m 895.6147291660309s
	Train Loss: 0.694 | Train Acc: 49.95%
	 Val. Loss: 0.694 |  Val. Acc: 49.04%


In [118]:
model.load_state_dict(torch.load('model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.709 | Test Acc: 44.96%
