In [51]:
import numpy as np
import torch
from datasets import load_dataset
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

In [52]:
dataset = load_dataset("imdb")

In [72]:
X_train = dataset["train"]["text"]
y_train = dataset["train"]["label"] 
X_test = dataset["test"]["text"]
y_test = dataset["test"]["label"]

In [73]:
X_train[0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [74]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [75]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_seq, maxlen=300, padding="pre")
X_train_tensor = torch.tensor(X_train_padded)


X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq, maxlen=300, padding="pre")
X_test_tensor = torch.tensor(X_test_padded)

In [76]:
print(X_train_tensor[0][:20])
print(X_train_tensor.shape)

tensor([   0,    0,    0,    0,    0,   10, 1605,   10,  241, 1992, 4174,   36,
          58,  371, 1127,   85,    4,   29,    1, 7093], dtype=torch.int32)
torch.Size([25000, 300])


In [77]:
reversed_word_index = tokenizer.index_word

In [78]:
sample_review = X_train_tensor[0]
sample_label = y_train[0]

sample_review[:10]

tensor([   0,    0,    0,    0,    0,   10, 1605,   10,  241, 1992],
       dtype=torch.int32)

In [79]:
decoded_text = " ".join([reversed_word_index.get(int(i),'?') for i in sample_review])
decoded_text

# for i in sample_review:
#     print(reversed_word_index.get(int(i),0))


# reversed_word_index[12]

"? ? ? ? ? i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by u s customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myself br br the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married men br br what kills me about i am curious yellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far between even then it's not

In [80]:
# input_ids = X_train_tokens["input_ids"]
labels = torch.tensor(y_train)
dataset = TensorDataset(X_train_tensor, labels)

batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

labels = torch.tensor(y_test)
dataset = TensorDataset(X_test_tensor, labels)

test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [81]:
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding index

In [94]:
#RNN
class RNN(nn.Module):

    def __init__(self,vocab_size,embed_dim=128,hidden_dim=256,num_classes=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embed_dim)
        self.rnn = nn.RNN(embed_dim,hidden_dim,batch_first=True)
        self.linear = nn.Linear(hidden_dim,num_classes)
    
    def forward(self,x):

        x = self.embedding(x)
        out,_ = self.rnn(x)
        out = out[:,-1,:]
        out = self.linear(out)
        return out


In [95]:
rnn = RNN(vocab_size)

for batch in train_loader:
    X,y = batch
    break

print((torch.sigmoid(rnn(X))>0.5).float()[0:10])

tensor([[0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.]])


In [96]:
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params = rnn.parameters(),lr = 0.005)

In [97]:
from sklearn.metrics import accuracy_score
epochs = 8

for epoch in range(epochs):
    
    train_loss = 0
    train_acc = 0

    for X, y in train_loader:
        optimizer.zero_grad()
        rnn.train()

        y = y.float().unsqueeze(1)  

        y_train_logits = rnn(X)
        loss = loss_fn(y_train_logits, y)

        y_train_pred = (torch.sigmoid(y_train_logits) > 0.5).float() 
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += accuracy_score(y, y_train_pred)
    
    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    print(f"Epoch {epoch}")
    print("------------------")
    print(f"Train :: Loss = {train_loss:.4f}, Accuracy = {train_acc*100:.4f}%")

    rnn.eval()
    test_loss = 0
    test_acc = 0
    with torch.inference_mode():

        for X, y in test_loader:

            y = y.float().unsqueeze(1) 

            y_test_logits = rnn(X)
            test_loss += loss_fn(y_test_logits, y).item()

            y_test_pred = (torch.sigmoid(y_test_logits) > 0.5).float()
            test_acc += accuracy_score(y, y_test_pred)
        
        test_loss /= len(test_loader)
        test_acc /= len(test_loader)

        y_train_pred = (torch.sigmoid(y_train_logits) > 0.5).float() 
    
    print(f"Test :: Loss = {test_loss:.4f}, Accuracy = {test_acc*100:.4f}%\n")
    

Epoch 0
------------------
Train :: Loss = 0.7057, Accuracy = 52.6255%
Test :: Loss = 0.7673, Accuracy = 55.4068%

Epoch 1
------------------
Train :: Loss = 0.6255, Accuracy = 65.0615%
Test :: Loss = 0.6109, Accuracy = 65.6250%

Epoch 2
------------------
Train :: Loss = 0.5277, Accuracy = 73.8851%
Test :: Loss = 0.5975, Accuracy = 68.2305%

Epoch 3
------------------
Train :: Loss = 0.5115, Accuracy = 75.4476%
Test :: Loss = 0.7319, Accuracy = 60.9455%

Epoch 4
------------------
Train :: Loss = 0.4519, Accuracy = 79.6036%
Test :: Loss = 0.7096, Accuracy = 65.5930%

Epoch 5
------------------
Train :: Loss = 0.3940, Accuracy = 82.8605%
Test :: Loss = 0.6522, Accuracy = 70.1926%

Epoch 6
------------------
Train :: Loss = 0.3977, Accuracy = 82.3969%
Test :: Loss = 0.6476, Accuracy = 71.9270%

Epoch 7
------------------
Train :: Loss = 0.3783, Accuracy = 83.6597%
Test :: Loss = 0.6567, Accuracy = 69.3614%



In [100]:
import pickle
torch.save(rnn.state_dict(),f="rnn.pth")
# pickle.HIGHEST_PROTOCOL ensures that the most efficient and latest protocol is used.
with open("tokenizer.pickle","wb") as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)