In [1]:
import torch
import torch.nn as nn
torch.manual_seed(1)

rnn_layer = nn.RNN(input_size=5, hidden_size=2, num_layers=1, batch_first=True)

w_xh = rnn_layer.weight_ih_l0
w_hh = rnn_layer.weight_hh_l0
b_xh = rnn_layer.bias_ih_l0
b_hh = rnn_layer.bias_hh_l0
print('W_xh shape:', w_xh.shape)
print('W_hh shape: ', w_hh.shape)
print('b_xh shape: ', b_xh.shape)
print('b_hh shape: ', b_hh.shape)

W_xh shape: torch.Size([2, 5])
W_hh shape:  torch.Size([2, 2])
b_xh shape:  torch.Size([2])
b_hh shape:  torch.Size([2])


In [2]:
x_seq = torch.tensor([[1.]*5, [2.]*5, [3.]*5]).float()
x_seq

tensor([[1., 1., 1., 1., 1.],
        [2., 2., 2., 2., 2.],
        [3., 3., 3., 3., 3.]])

In [3]:
torch.reshape(x_seq, (1, 3, 5))

tensor([[[1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3.]]])

In [4]:
x_seq = torch.tensor([[1.]*5, [2.]*5, [3.]*5]).float()
## output of the simple RNN:
output, hn = rnn_layer(torch.reshape(x_seq, (1, 3, 5)))
out_man = []

for t in range(3):
    xt = torch.reshape(x_seq[t], (1, 5))
    print(f'Time step {t} =>')
    print('   Input           :', xt.numpy())

    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_xh
    print('    Hidden        :', ht.detach().numpy())
          
    if t > 0:
        prev_h = out_man[t-1]
    else:
        prev_h = torch.zeros((ht.shape))
    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh
    ot = torch.tanh(ot)
    out_man.append(ot)
    print('   Ouput (manual) :', ot.detach().numpy())
    print('   RNN output     :', output[:, t].detach().numpy())
    print()
    

Time step 0 =>
   Input           : [[1. 1. 1. 1. 1.]]
    Hidden        : [[-0.4701929  0.5863904]]
   Ouput (manual) : [[-0.3519801   0.52525216]]
   RNN output     : [[-0.3519801   0.52525216]]

Time step 1 =>
   Input           : [[2. 2. 2. 2. 2.]]
    Hidden        : [[-0.88883156  1.2364397 ]]
   Ouput (manual) : [[-0.68424344  0.76074266]]
   RNN output     : [[-0.68424344  0.76074266]]

Time step 2 =>
   Input           : [[3. 3. 3. 3. 3.]]
    Hidden        : [[-1.3074701  1.886489 ]]
   Ouput (manual) : [[-0.8649416   0.90466356]]
   RNN output     : [[-0.8649416   0.90466356]]



In [5]:
# from torchtext.datasets import IMDB
# train_dataset = IMDB(split='train')
# test_dataset = IMDB(split='test')


In [6]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('imdb')
train_data = dataset['train']
test_data = dataset['test']

print(train_data[0])

  from .autonotebook import tqdm as notebook_tqdm


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [7]:
len(train_data)

25000

In [8]:
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    def __init__(self, imdb_dataset):
        self.imdb_dataset = imdb_dataset

    def __len__(self):
        return len(self.imdb_dataset)

    def __getitem__(self, idx):
        text = self.imdb_dataset[idx]['text']
        label = self.imdb_dataset[idx]['label']
        return label, text



In [9]:
train_dataset = IMDBDataset(train_data)
test_dataset = IMDBDataset(test_data)

In [10]:
from torch.utils.data.dataset import random_split
torch.manual_seed(1)

train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [11]:
import re

from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
print('Vocab-size', len(token_counts))

Vocab-size 69006


In [12]:
# token_counts

In [13]:
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)



In [14]:
vocab

Vocab()

In [15]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 457]


In [16]:
# for label, line in valid_dataset:
#     if label == 0:
#         print(label)
    

In [17]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

In [18]:
## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list, label_list, lengths


# Take a small batch
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [19]:
text_batch, label_batch, lengths = next(iter(dataloader))
# print('text_batch', text_batch)
# print('label_batch', label_batch)
# print('lengths', lengths),
print(lengths)

tensor([165,  86, 218, 145])


In [20]:
print(text_batch.shape)

torch.Size([4, 218])


In [21]:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [22]:
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
em = embedding(text_encoded_input)
em.shape

torch.Size([2, 4, 3])

In [23]:
text_encoded_input.shape

torch.Size([2, 4])

In [24]:
print(em)

tensor([[[ 0.7039, -0.8321, -0.4651],
         [-0.3203,  2.2408,  0.5566],
         [-0.4643,  0.3046,  0.7046],
         [-0.7106, -0.2959,  0.8356]],

        [[-0.4643,  0.3046,  0.7046],
         [ 0.0946, -0.3531,  0.9124],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [25]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        # self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]  # we use the final hidden state from the last hidden layer as the input to the fully connected layer

        out = self.fc(out)
        return out

In [26]:
model = RNN(64, 32)
print(model)
model(torch.randn(5, 3, 64))

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[ 0.3183],
        [ 0.1230],
        [ 0.1772],
        [-0.1052],
        [-0.1259]], grad_fn=<AddmmBackward0>)

In [27]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [28]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model

RNN(
  (embedding): Embedding(69008, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [29]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch.float())
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [30]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            label_batch = label_batch.float()
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()* label_batch.size(0)
        return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [31]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [32]:
# num_epochs = 10
# torch.manual_seed(1)
# for epoch in range(num_epochs):
#     acc_train, loss_train = train(train_dl)
#     acc_valid, loss_valid = evaluate(valid_dl)
#     print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

In [33]:
torch.save(model, 'sentiment_analysis_model.pth')

In [34]:
model = torch.load('sentiment_analysis_model.pth')

In [35]:
acc_test, _ = evaluate(test_dl)

In [37]:
print('acc_test', acc_test)

acc_test 0.685
