In [1]:
!pip install datasets tqdm torchtext torch -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

import functools
import sys

import datasets
import matplotlib.pyplot as plt
import numpy as np

torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'{device=}')

device=device(type='cuda')


In [3]:
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [5]:
def tokenize_data(example, tokenizer, max_length):
  tokens = tokenizer(example['text'])[:max_length]
  length = len(tokens)
  return {'tokens': tokens, 'length': length}

max_length = 256
train_token = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
test_token = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [6]:
train_token

Dataset({
    features: ['text', 'label', 'tokens', 'length'],
    num_rows: 25000
})

In [7]:
for text in train_token['text'][:5]:
  print(text)

I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, eve

In [8]:
for text in train_token['label'][:5]:
  print(text)

0
0
0
0
0


In [9]:
for text in train_token['tokens'][:5]:
  print(text)

['i', 'rented', 'i', 'am', 'curious-yellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was', 'seized', 'by', 'u', '.', 's', '.', 'customs', 'if', 'it', 'ever', 'tried', 'to', 'enter', 'this', 'country', ',', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', 'controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself', '.', 'the', 'plot', 'is', 'centered', 'around', 'a', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', 'can', 'about', 'life', '.', 'in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some', 'sort', 'of', 'documentary', 'on', 'what', 'the', 'average', 'swede', 'thought', 'about', 'certain', 'political', 'issues', 'such', 'as', 'the', 'vietnam', 'war', 'and', 'race', 'issues', 

In [10]:
train_valid_data = train_token.train_test_split(test_size = 0.2)

In [11]:
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],
                                                  specials=['<UNK>', '<PAD>'],
                                                  min_freq= 10)

In [12]:
# vocab['<UNK>']
vocab.set_default_index(0)

In [13]:
def convert_into_tokens(example, vocab):
    ids = [vocab[token] for token in example['tokens']]
    return {'ids': ids}

In [14]:
#this data will be used for training
train_data = train_data.map(convert_into_tokens, fn_kwargs={'vocab': vocab})
# this data will be used for evaluation
valid_data = valid_data.map(convert_into_tokens, fn_kwargs={'vocab': vocab})
# this the data that we use for generalization [New unseen data for testing]
test_data = test_token.map(convert_into_tokens, fn_kwargs={'vocab': vocab})

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [15]:
for text in train_data['ids'][:5]:
  print(text)

[756, 1241, 53, 7, 304, 21, 5, 23, 40, 172, 78, 93, 1773, 8, 2602, 8, 3861, 3, 22, 92, 9, 28, 112, 14, 18, 888, 8, 35, 2038, 3, 9892, 768, 317, 41, 5197, 180, 38, 5, 2806, 0, 1250, 7, 2, 2867, 341, 3, 54, 10, 294, 47, 2, 18, 4, 45, 37, 7, 768, 9, 16, 116, 4, 10, 15, 33, 167, 9, 28, 535, 20, 32, 113, 3, 33, 1970, 113, 4, 0, 105, 20, 128, 4, 712, 4, 6, 9888, 3, 3, 3, 6, 0, 105, 20, 5, 648, 15, 407, 1834, 243, 2751, 8, 343, 86, 20, 73, 117, 30, 3988, 3, 14, 23, 51, 215, 4, 22, 11, 10, 29, 21, 2, 215, 1476, 231, 2, 614, 10, 1727, 1222, 6, 243, 44, 2110, 3, 3, 3, 12, 3, 975, 3, 45, 2, 113, 4, 11, 10, 159, 3, 6, 19, 26, 69, 517, 47, 34, 7, 4225, 9, 16, 94, 2778, 1566, 4, 2, 614, 10, 2778, 3, 22, 19, 41, 4752, 7, 2, 3017, 447, 9888, 15, 1351, 0, 774, 2, 2867, 341, 4, 11, 10, 5, 947, 3, 17, 2, 341, 47, 7602, 6, 41, 0, 57, 768, 202, 187, 1715, 8, 13078, 15, 373, 97, 43, 992, 13, 7136, 45, 2, 1525, 6, 2, 2939, 3, 1061, 10, 47, 11973, 4, 166, 4, 19, 110, 13, 808, 20, 2, 2006, 3, 11, 10, 47, 2264,

In [16]:
train_data = train_data.with_format(type='torch', columns=['ids', 'label', 'length'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'label', 'length'])
test_data = test_data.with_format(type='torch', columns=['ids', 'label', 'length'])

In [17]:
train_data

Dataset({
    features: ['text', 'label', 'tokens', 'length', 'ids'],
    num_rows: 20000
})

In [18]:
class LSTMmodel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,dropout_rate, pad_index):
        super().__init__()
        # layer 1- Pass the ids to the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # layer 2- LSTM [If n_layers = 2, then layer 3 is also LSTM]
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate) # to avoid overfitting

    def forward(self, ids, batch_size):
        # token to embeddings
        embedded = self.dropout(self.embedding(ids))
        embdedded = nn.utils.rnn.pack_padded_sequence(embedded, batch_size, batch_first=True,enforce_sorted=False)
        # embedding sequence (batch_size,seq_length,emd_dim) to LSTM

        outputs, (hidden, cell) = self.lstm(embdedded)

        output, output_length = nn.utils.rnn.pad_packed_sequence(outputs)
        hidden = self.dropout(hidden[-1])

        prediction = self.fc(hidden)
        return prediction

In [19]:
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 64
output_dim = len(train_data.unique('label')) # either 0 or 1 = 2(length)
n_layers = 2
dropout_rate = 0.5

model = LSTMmodel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout_rate,vocab['<PAD>'])
model = model.to(device) #switch our modeling training in GPU

In [20]:
sum(p.numel() for p in model.parameters() if p.requires_grad) # total parameters

4504106

In [21]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

model.apply(initialize_weights)

LSTMmodel(
  (embedding): Embedding(14590, 300, padding_idx=1)
  (lstm): LSTM(300, 64, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [22]:
vectors = torchtext.vocab.GloVe() # extra pre-trained embebedding
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
model.embedding.weight.data = pretrained_embedding
model.to(device)

.vector_cache/glove.840B.300d.zip: 2.18GB [06:52, 5.28MB/s]                            
100%|█████████▉| 2196016/2196017 [07:44<00:00, 4723.15it/s]


LSTMmodel(
  (embedding): Embedding(14590, 300, padding_idx=1)
  (lstm): LSTM(300, 64, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [23]:
learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss().to(device)

def metrics(prediction, actual):
    # print('METRICS')
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(actual).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [24]:
def collate(batch, pad_index):
    # print('COLLATING')
    batch_ids = [i['ids'] for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_length = [i['length'] for i in batch]
    batch_length = torch.stack(batch_length)
    batch_label = [i['label'] for i in batch]
    batch_label = torch.stack(batch_label)
    batch = {'ids': batch_ids.to(device),
             'length': batch_length,
             'label': batch_label.to(device)}
    return batch

In [25]:
batch_size = 64
collate = functools.partial(collate, pad_index= vocab['<PAD>'])

train_dataloader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               collate_fn=collate,
                                               shuffle=True)

valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

In [26]:
def train(dataloader, model, loss_function, optimizer, device):
    # print('TRAINING')
    model.train()

    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids']

        #batch length - dataloader
        length = batch['length']
        label = batch['label']

        # y_hat = prediction from the model
        prediction = model(ids, length)
        # print(f'Prediction device:{prediction.device}')
        # loss function - > Actual value, predicted value
        # actual value - label
        # predicted value is prediction
        loss = loss_function(prediction, label) #loss
        accuracy = metrics(prediction, label) #
        optimizer.zero_grad() #adam -> gradient descent

        loss.backward()
        optimizer.step() #we will update the weights with learning rate

        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

In [27]:
def evaluate(dataloader, model, loss_function, device):
    # print('EVALUATING')
    model.eval()
    epoch_losses = []
    epoch_accs = []

    with torch.no_grad(): # no optimization -> no update in weightds
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids']
            length = batch['length']
            label = batch['label']
            prediction = model(ids, length)
            loss = loss_function(prediction, label)
            accuracy = metrics(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

In [28]:
n_epochs = 5
best_valid_loss = float('inf')

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

    train_loss, train_acc = train(train_dataloader, model, loss_function, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, loss_function, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    valid_losses.extend(valid_loss)
    valid_accs.extend(valid_acc)

    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)

    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'lstm.pt')

        print(f'Epoch: {epoch+1}/{n_epochs}')
        print(f'loss: {epoch_train_loss:.4f}, accuracy: {epoch_train_acc:.4f}')
        print(f'valid_loss: {epoch_valid_loss:.4f}, valid_accuracy: {epoch_valid_acc:.4f}')
        print("--"*25)

training...: 100%|██████████| 313/313 [00:12<00:00, 24.47it/s]
evaluating...: 100%|██████████| 79/79 [00:01<00:00, 75.21it/s]
Epoch: 1/5
loss: 0.6868, accuracy: 0.5511
valid_loss: 0.6714, valid_accuracy: 0.6262
--------------------------------------------------
training...: 100%|██████████| 313/313 [00:10<00:00, 28.84it/s]
evaluating...: 100%|██████████| 79/79 [00:00<00:00, 89.58it/s]
Epoch: 2/5
loss: 0.5651, accuracy: 0.7099
valid_loss: 0.4371, valid_accuracy: 0.8153
--------------------------------------------------
training...: 100%|██████████| 313/313 [00:09<00:00, 33.40it/s]
evaluating...: 100%|██████████| 79/79 [00:01<00:00, 60.08it/s]
Epoch: 3/5
loss: 0.4572, accuracy: 0.7982
valid_loss: 0.4183, valid_accuracy: 0.8184
--------------------------------------------------
training...: 100%|██████████| 313/313 [00:08<00:00, 34.92it/s]
evaluating...: 100%|██████████| 79/79 [00:00<00:00, 88.48it/s]
Epoch: 4/5
loss: 0.4117, accuracy: 0.8246
valid_loss: 0.4153, valid_accuracy: 0.8315
---

In [29]:
model.load_state_dict(torch.load('lstm.pt')) #save the models

test_loss, test_acc = evaluate(test_dataloader, model, loss_function, device)

epoch_test_loss = np.max(test_loss)
epoch_test_acc = np.max(test_acc)

print("Loss",epoch_test_loss)
print("Acc",epoch_test_acc)

evaluating...: 100%|██████████| 391/391 [00:05<00:00, 68.88it/s]
Loss 0.9728038311004639
Acc 1.0


In [30]:
def make_prediction(text, model, tokenizer, vocab):
    #find the token for the user input
    tokens = tokenizer(text)
    #convert token into numerical number (unique id)
    ids = [vocab[t] for t in tokens]

    #find the length and convert the ids into tensor to feed in LSTM model
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    #make prediction
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1) #check for the score - probability (softmax)

    predicted_class = prediction.argmax(dim=-1)
    predicted_probability = probability[predicted_class]
    return predicted_class, predicted_probability

In [31]:
def display(label,score):
    return f"Negative-Score:{score:.5f}" if not label else f"Positive-Score:{score:.5f}"
    # if label==0:
    #     print(f"Negative-Score:{score}")
    # else:
    #     print(f"Positive-Score:{score}")

In [32]:
text = "Amazing movie, loved it"
label,score = make_prediction(text, model, tokenizer, vocab)
print(display(label,score))

Positive-Score:0.7634541988372803
