In [None]:
import torch,torchtext
from torchtext.legacy import data

# Single seed reused for PyTorch here and for the train/validation split later on.
SEED = 1234

# Seed PyTorch's RNG and ask cuDNN for deterministic kernels so runs are repeatable.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT: review text tokenized with spaCy's small English model.
# include_lengths=True makes batch.text a (tokens, lengths) pair; the lengths are
# what the model later uses to pack the padded sequences.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)
# LABEL: sentiment label stored as a float tensor, matching the float-valued
# loss (BCEWithLogitsLoss) used further down.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
from torchtext.legacy import datasets

# Load the IMDB train and test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Sanity check: report how many examples each split contains.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [None]:
import random

# Carve a validation set out of the training data with a reproducible shuffle.
# random.seed() returns None, so passing its return value as `random_state`
# only worked through the seeding side effect. Seed first, then hand the split
# the explicit RNG state tuple it is documented to take.
# NOTE: this cell rebinds `train_data`; re-running it without re-running the
# loading cell splits the already-reduced training set again.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [None]:
# Report the sizes of all three splits after carving out the validation set.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [None]:
# Upper bound on the number of distinct tokens kept in the TEXT vocabulary.
# The special <unk> and <pad> entries come on top of this (25002 total below).
MAX_VOCAB_SIZE = 25_000

# Build both vocabularies from the training split only, so nothing from the
# validation or test data leaks into the token/label mappings.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [None]:
# Report the resulting vocabulary sizes for the text and label fields.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [None]:
# Show the 20 most frequent training tokens together with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 201216), (',', 191672), ('.', 164256), ('a', 108789), ('and', 108724), ('of', 99798), ('to', 92939), ('is', 75799), ('in', 61052), ('I', 54050), ('it', 53258), ('that', 48729), ('"', 44100), ("'s", 42873), ('this', 42355), ('-', 36652), ('/><br', 35379), ('was', 35008), ('as', 30305), ('with', 29788)]


In [None]:
# Show the first ten entries of the index-to-string table (specials come first).
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [None]:
# Show how the label strings map to the numeric targets used for training.
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [None]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, with batches placed directly on `device`.
# sort_within_batch=True matters here: the model calls pack_padded_sequence
# with its default settings, which expect each batch ordered by length.
# NOTE(review): the ordering relies on the dataset's own sort key — confirm.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch=True,
    device = device)

In [None]:
import torch.nn as nn

class LSTM(nn.Module):
    """Sentiment classifier: embedding -> single LSTM -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output units of the final linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in this order so parameter initialisation under a
        # fixed seed stays the same.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_length):
        """Compute a flat vector of logits for a batch of token sequences.

        Args:
            text: token indices shaped [sentence len, batch size].
            text_length: per-example sequence lengths; moved to the CPU
                because that is where pack_padded_sequence needs them.

        Returns:
            The linear head's output flattened to one dimension.
        """
        lengths_on_cpu = text_length.to('cpu')

        # [sentence len, batch size] -> [sentence len, batch size, embedding size]
        token_vectors = self.embedding(text)

        # Packing lets the LSTM skip the padded positions of shorter sequences.
        packed_input = nn.utils.rnn.pack_padded_sequence(token_vectors, lengths_on_cpu)

        # Only the final hidden state is used: [1, batch size, hidden size].
        _, (final_hidden, _) = self.lstm(packed_input)

        logits = self.fc(final_hidden.squeeze(0))
        return logits.view(-1)

In [None]:

# Model hyperparameters. INPUT_DIM follows the built vocabulary so every token
# index has an embedding row; OUTPUT_DIM = 1 gives a single logit per review.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
# Bare expression so the notebook displays the module summary.
model

LSTM(
  (embedding): Embedding(25002, 128)
  (lstm): LSTM(128, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Parameters with requires_grad set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter total, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,595,777 trainable parameters


In [None]:
import torch.optim as optim

# Adam over all model parameters with a fixed learning rate of 1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Binary cross-entropy that takes raw logits; the model's forward pass returns
# the linear layer's output without applying a sigmoid.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss module to the device the iterators place batches on.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calculate_metrics(preds, y):
    """Compute batch accuracy and confusion-matrix counts from raw logits.

    Args:
        preds: tensor of logits, one per example.
        y: tensor of 0/1 targets with the same shape as `preds`.

    Returns:
        Tuple (Acc, TP, TN, FP, FN) of tensors: the fraction of correct
        predictions followed by true-positive, true-negative, false-positive
        and false-negative counts.
    """
    # Turn logits into probabilities, then into hard 0/1 decisions.
    decisions = torch.round(torch.sigmoid(preds))

    # Float conversion so the hit count can be divided into a fraction.
    hits = (decisions == y).float()
    Acc = hits.sum() / len(hits)

    predicted_pos = decisions == 1
    predicted_neg = decisions == 0
    actual_pos = y == 1
    actual_neg = y == 0

    TP = torch.logical_and(predicted_pos, actual_pos).sum()
    TN = torch.logical_and(predicted_neg, actual_neg).sum()
    FP = torch.logical_and(predicted_pos, actual_neg).sum()
    FN = torch.logical_and(predicted_neg, actual_pos).sum()
    return Acc, TP, TN, FP, FN

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one training pass over `iterator` and return the epoch's metrics.

  For every batch this performs a forward pass, computes the loss and the
  confusion counts, then takes one optimizer step.

  Args:
    model: called as model(text, text_length); switched to training mode here.
    iterator: sized iterable of batches exposing `.text` (a (tokens, lengths)
      pair) and `.label`.
    optimizer: optimizer whose gradients are reset and stepped once per batch.
    criterion: loss callable taking (predictions, labels).

  Returns:
    Tuple (loss, accuracy, precision, recall, f1, [TP, TN, FP, FN]).
    Loss and accuracy are unweighted means over batches. Precision, recall
    and f1 come from the accumulated counts and are reported as 0.0 when
    their denominator is zero (e.g. the model predicted no positives)
    instead of raising ZeroDivisionError.
  """
  epoch_loss = 0
  epoch_acc = 0
  TP = 0
  TN = 0
  FP = 0
  FN = 0
  model.train()

  for batch in iterator:
    text, text_length = batch.text
    pred = model(text, text_length)

    loss = criterion(pred, batch.label)

    acc, tp, tn, fp, fn = calculate_metrics(pred, batch.label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() converts the tensors to Python numbers, so the running totals
    # do not keep the autograd graph alive.
    epoch_loss += loss.item()
    epoch_acc += acc.item()
    TP += tp.item()
    TN += tn.item()
    FP += fp.item()
    FN += fn.item()

  # Guard every ratio: early in training the model can predict a single class
  # for the whole epoch, which makes TP + FP (or precision + recall) zero.
  precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
  recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
  if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
  else:
    f1 = 0.0

  return epoch_loss/len(iterator), epoch_acc/len(iterator),precision,recall,f1,[TP,TN,FP,FN]

In [None]:
def evaluate(model, iterator, criterion):
  """Score `model` on `iterator` without updating any weights.

  Args:
    model: called as model(text, text_length); switched to eval mode here.
    iterator: sized iterable of batches exposing `.text` (a (tokens, lengths)
      pair) and `.label`.
    criterion: loss callable taking (predictions, labels).

  Returns:
    Tuple (loss, accuracy, precision, recall, f1, [TP, TN, FP, FN]).
    Loss and accuracy are unweighted means over batches. Precision, recall
    and f1 come from the accumulated counts and are reported as 0.0 when
    their denominator is zero (e.g. the model predicted no positives)
    instead of raising ZeroDivisionError.
  """
  epoch_loss = 0
  epoch_acc = 0
  TP = 0
  TN = 0
  FP = 0
  FN = 0

  model.eval()

  # No gradients are needed for scoring; skipping them saves memory and time.
  with torch.no_grad():
    for batch in iterator:
      text, text_length = batch.text
      predictions = model(text, text_length)

      loss = criterion(predictions, batch.label)

      acc, tp, tn, fp, fn = calculate_metrics(predictions, batch.label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()
      TP += tp.item()
      TN += tn.item()
      FP += fp.item()
      FN += fn.item()

  # Guard every ratio: a model that predicts a single class for the whole
  # split makes TP + FP (or precision + recall) zero.
  precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
  recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
  if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
  else:
    f1 = 0.0

  return epoch_loss / len(iterator), epoch_acc / len(iterator),precision,recall,f1,[TP,TN,FP,FN]

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: timestamp in seconds at the start of the interval.
        end_time: timestamp in seconds at the end of the interval.

    Returns:
        Tuple (minutes, seconds) of ints, each truncated toward zero.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    # What is left once the whole minutes are taken out, fraction dropped.
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
from prettytable import PrettyTable

# Number of full passes over the training data.
N_EPOCHS = 25

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one train pass plus one validation pass.
    start_time = time.time()
    train_loss, train_acc, train_prec, train_recall, train_f1, train_list = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_prec, valid_recall, valid_f1, valid_list = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights that score best on the validation set.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    print(f'\n\nEpoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s\n')

    # One confusion matrix per split; the count lists are ordered [TP, TN, FP, FN].
    for split_name, split_counts in (('Train', train_list), ('Valid', valid_list)):
        con = PrettyTable([split_name, 'Actual Positive', 'Actual Negative'])
        con.add_row(['Predicted Positive', split_counts[0], split_counts[2]])
        con.add_row(['Predicted Negative', split_counts[3], split_counts[1]])
        print(con)

    # Side-by-side summary of the scalar metrics for both splits.
    table = PrettyTable(['Metrics', 'Train', 'Valid'])
    metric_rows = [
        ['Loss', f'{train_loss:.3f}',f'{valid_loss:.3f}'],
        ['Accuracy', f'{train_acc*100:.2f}%',f'{valid_acc*100:.2f}%'],
        ['Precision', f'{train_prec*1:.2f}',f'{valid_prec*1:.2f}'],
        ['Recall', f'{train_recall*1:.2f}',f'{valid_recall*1:.2f}'],
        ['F1 score', f'{train_f1*1:.2f}',f'{valid_f1*1:.2f}'],
    ]
    for metric_row in metric_rows:
        table.add_row(metric_row)
    print(table)

    



Epoch: 01 | Epoch Time: 0m 7s

+--------------------+-----------------+-----------------+
|       Train        | Actual Positive | Actual Negative |
+--------------------+-----------------+-----------------+
| Predicted Positive |       3326      |       2660      |
| Predicted Negative |       5364      |       6150      |
+--------------------+-----------------+-----------------+
+--------------------+-----------------+-----------------+
|       Valid        | Actual Positive | Actual Negative |
+--------------------+-----------------+-----------------+
| Predicted Positive |       2744      |       2139      |
| Predicted Negative |       1066      |       1551      |
+--------------------+-----------------+-----------------+
+-----------+--------+--------+
|  Metrics  | Train  | Valid  |
+-----------+--------+--------+
|    Loss   | 0.688  | 0.684  |
|  Accuracy | 54.16% | 57.26% |
| Precision |  0.56  |  0.56  |
|   Recall  |  0.38  |  0.72  |
|  F1 score |  0.45  |  0.63  |
+--

In [None]:
# Restore the checkpoint with the lowest validation loss before scoring on test.
model.load_state_dict(torch.load('model.pt'))

test_loss, test_acc, test_prec, test_recall, test_f1, test_list = evaluate(model, test_iterator, criterion)

# Confusion matrix for the test split; test_list is ordered [TP, TN, FP, FN].
# This previously read valid_list, so the table labelled "Test" actually
# showed the last epoch's validation counts.
con = PrettyTable(['Test', 'Actual Positive', 'Actual Negative'])
con.add_row(['Predicted Positive',test_list[0],test_list[2]])
con.add_row(['Predicted Negative',test_list[3],test_list[1]])
print(con)

# Scalar test metrics.
table = PrettyTable(['Metrics', 'Test'])
table.add_row(['Loss', f'{test_loss:.3f}'])
table.add_row(['Accuracy', f'{test_acc*100:.2f}%'])    
table.add_row(['Precision', f'{test_prec*1:.2f}'])
table.add_row(['Recall', f'{test_recall*1:.2f}'])
table.add_row(['F1 score', f'{test_f1*1:.2f}'])
print(table)

+--------------------+-----------------+-----------------+
|        Test        | Actual Positive | Actual Negative |
+--------------------+-----------------+-----------------+
| Predicted Positive |       3357      |       723       |
| Predicted Negative |       453       |       2967      |
+--------------------+-----------------+-----------------+
+-----------+--------+
|  Metrics  |  Test  |
+-----------+--------+
|    Loss   | 0.374  |
|  Accuracy | 84.91% |
| Precision |  0.84  |
|   Recall  |  0.87  |
|  F1 score |  0.85  |
+-----------+--------+
