In [None]:
## Sequence classification
## IMDB dataset: classify a movie review as positive or negative

In [None]:
import torch
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device(type='cuda' if torch.cuda.is_available() else 'cpu', index=0)
print(device)

cuda:0


In [None]:
import os

import kagglehub
import pandas as pd

# Download the latest version of the dataset (kagglehub reuses its cache when it can).
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

# Read the CSV from the directory kagglehub actually returned instead of a
# hard-coded Kaggle/Colab mount point, so the cell also runs on other machines.
data = pd.read_csv(os.path.join(path, "IMDB Dataset.csv"), header='infer')

# Keep only the first 5000 reviews to keep training fast. The explicit copy makes
# `data` an independent frame, so the column assignments done later do not
# operate on a slice of the full table (avoids pandas' SettingWithCopyWarning).
data = data.iloc[:5000].copy()

# look at the data
data.head()

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Preprocessing
import re

def shorten(s, max_words=100):
    """Normalise a raw review and keep only its first ``max_words`` tokens.

    Steps: replace HTML tags with a space, lowercase, replace every character
    outside ``a-z 0-9 whitespace . , ? ! ' -`` with a space, split on
    whitespace, truncate, and re-join with single spaces.

    Args:
        s: Raw review text.
        max_words: Maximum number of whitespace-separated tokens to keep.

    Returns:
        The cleaned review as a single-space-separated string (possibly empty).
    """
    # Substitute a space (not the empty string) for each tag: reviews commonly
    # contain "sentence.<br /><br />Next", and deleting the tag outright would
    # fuse the neighbouring words into one bogus token ("sentence.next").
    no_html = re.sub(r'<.*?>', ' ', s)
    lowered = no_html.lower()
    sanitized = re.sub(r"[^a-z0-9\s.,?!'-]", ' ', lowered)
    # split() with no argument collapses runs of whitespace, so the extra
    # spaces introduced above never produce empty tokens.
    words = sanitized.split()
    return " ".join(words[:max_words])

def tonum(s):
    """Encode a sentiment label as an integer: 1 for 'positive', 0 for anything else."""
    return 1 if s == 'positive' else 0
# shorten() gives the same result when applied to its own output, so re-running
# this cell leaves the review text unchanged.
data['review'] = data['review'].apply(shorten)

# tonum() maps every value other than 'positive' to 0, so applying it to a
# column that was already encoded would silently relabel every review as
# negative. Only encode while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(data['sentiment']):
    data['sentiment'] = data['sentiment'].apply(tonum)

#re-check
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,petter mattei's love in the time of money is a...,1


In [None]:
class Vocab:
    """Word <-> index mapping with per-word occurrence counts.

    Index 0 is reserved for the 'PAD' token and index 1 for the 'EOS' token;
    every other word receives the next free index in order of first appearance.
    """

    def __init__(self):
        self.word2index = {'PAD': 0, 'EOS': 1}
        self.index2word = {0: 'PAD', 1: 'EOS'}
        # Occurrence counts of tokens seen by buildVocab (reserved tokens start absent).
        self.word2count = {}
        # Number of entries in word2index, including PAD and EOS.
        self.nwords = 2

    def buildVocab(self, s):
        """Register every token of ``s`` and update its occurrence count.

        Args:
            s: Text whose tokens are separated by single spaces.
        """
        # Tokenise on single spaces, the same way CustomDataset.get_input_ids
        # does, so every token looked up there has been registered here.
        for word in s.split(" "):
            if word not in self.word2index:
                self.word2index[word] = self.nwords
                self.index2word[self.nwords] = word
                self.nwords += 1
            # Count through .get(): the reserved tokens are present in
            # word2index from the start but have no word2count entry, so a
            # plain `+= 1` raised KeyError when the text contained 'PAD' or 'EOS'.
            self.word2count[word] = self.word2count.get(word, 0) + 1
# Collect the preprocessed reviews and register every token in a fresh vocabulary.
reviews = data['review'].to_list()
myVocab = Vocab()
for review in reviews:
    myVocab.buildVocab(review)

print("Vocab Length:", myVocab.nwords)

Vocab Length: 45279


In [None]:
from torch.utils.data import Dataset,random_split,DataLoader
class CustomDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, sentiment)`` pairs.

    Each review is converted to a fixed-length int64 tensor of
    ``max_length + 1`` vocabulary indices: at most ``max_length`` word ids,
    followed by the 'EOS' id, right-padded with zeros.

    Args:
        reviews: Sequence of space-separated review strings.
        review_sentiments: Labels aligned positionally with ``reviews``
            (a list, or a pandas Series/any object exposing ``.iloc``).
        vocab: Object exposing a ``word2index`` dict containing 'EOS'.
        max_length: Maximum number of word tokens kept per review.
    """

    def __init__(self, reviews, review_sentiments, vocab, max_length):
        super().__init__()
        self.reviews = reviews
        self.review_sentiments = review_sentiments
        self.max_length = max_length
        self.vocab = vocab

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def get_input_ids(self, review, vocab=None):
        """Encode ``review`` as a zero-padded tensor of length ``max_length + 1``.

        Args:
            review: Space-separated review text.
            vocab: Vocabulary to use; defaults to the one given at construction.

        Returns:
            1-D ``torch.int64`` tensor: word ids, then the 'EOS' id, then zeros.
        """
        if vocab is None:
            vocab = self.vocab
        word2index = vocab.word2index
        # Same out-of-vocabulary policy as SentiNN.predict: use '<UNK>' when the
        # vocabulary defines it, otherwise index 0 (the value used for padding).
        fallback_id = word2index.get('<UNK>', 0)

        # Truncate before appending EOS: a review longer than max_length used to
        # overflow the fixed-size tensor and fail with a shape-mismatch error.
        tokens = review.split(" ")[:self.max_length]
        input_ids = [word2index.get(token, fallback_id) for token in tokens]
        input_ids.append(word2index['EOS'])

        input_ids_tensor = torch.zeros(self.max_length + 1, dtype=torch.int64)
        input_ids_tensor[:len(input_ids)] = torch.tensor(input_ids, dtype=torch.int64)
        return input_ids_tensor

    def __getitem__(self, idx):
        """Return ``(input_ids_tensor, sentiment)`` for the review at position ``idx``."""
        review = self.reviews[idx]
        sentiments = self.review_sentiments
        # Reviews are indexed by position, so index labels by position as well:
        # plain [] on a pandas Series is label-based and only coincides with the
        # position when the index happens to be 0..n-1.
        if hasattr(sentiments, 'iloc'):
            review_sentiment = sentiments.iloc[idx]
        else:
            review_sentiment = sentiments[idx]

        return self.get_input_ids(review, self.vocab), review_sentiment
review_sentiments = data['sentiment']
dataset = CustomDataset(reviews, review_sentiments, myVocab, 100)

# Seed the split so the same reviews land in the train and test subsets on
# every run; with an unseeded split, results from different runs are measured
# on different held-out reviews and cannot be compared.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(dataset, [0.9, 0.1], generator=split_generator)
batch_size = 64

#create train and eval data loader
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
import torch.nn as nn
import torch.optim as optim
import re
import torch
import torch.nn as nn

class SentiNN(nn.Module):
    """GRU-based binary sentiment classifier.

    Pipeline: embedding -> dropout -> single-layer GRU -> linear layer applied
    to the GRU's final hidden state, producing two logits per sequence.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embed_size: Embedding dimension.
        hidden_size: GRU hidden-state dimension.
    """

    def __init__(self, input_size, embed_size, hidden_size):
        super().__init__()
        self.e = nn.Embedding(input_size, embed_size)
        self.dropout = nn.Dropout(0.2)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(in_features=hidden_size, out_features=2)

    def forward(self, x):
        """Return logits of shape (batch, 2) for a (batch, seq_len) tensor of token ids."""
        embedded = self.dropout(self.e(x))
        _, hidden = self.rnn(embedded)
        # h_n has a leading layer dimension; take the last layer's state by
        # indexing instead of squeezing the GRU output in place.
        return self.out(hidden[-1])

    def predict(self, text, vocab, device, max_length=100):
        """Classify a single raw review.

        Args:
            text: Raw review text.
            vocab: Object exposing a ``word2index`` dict containing 'EOS'.
            device: Device the input tensor is moved to before the forward pass.
            max_length: Maximum number of word tokens fed to the model.

        Returns:
            "Positive" when the arg-max logit is class 1, otherwise "Negative".
        """
        # Replace tags with a space so the words on either side stay separate.
        no_html = re.sub(r'<.*?>', ' ', text)
        lowered = no_html.lower()
        sanitized = re.sub(r"[^a-z0-9\s.,?!'-]", ' ', lowered)

        # Unknown words use '<UNK>' when the vocabulary defines it, else index 0.
        unk_token_id = vocab.word2index.get('<UNK>', 0)

        # split() (not split(" ")) so runs of spaces left by the substitutions
        # above do not yield empty-string tokens, which would be encoded as
        # index 0 in the middle of the sequence and count toward max_length.
        tokens = sanitized.split()[:max_length]
        input_ids = [vocab.word2index.get(word, unk_token_id) for word in tokens]
        final_ids = input_ids + [vocab.word2index['EOS']]

        padded_tensor = torch.zeros(1, max_length + 1, dtype=torch.int64)
        padded_tensor[0, :len(final_ids)] = torch.tensor(final_ids, dtype=torch.int64)
        input_tensor = padded_tensor.to(device)

        # Run in eval mode (dropout disabled) but restore the caller's mode
        # afterwards, so predicting in the middle of training has no side effect.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits = self(input_tensor)
        finally:
            self.train(was_training)

        prediction = torch.argmax(logits, dim=1).item()
        return "Positive" if prediction == 1 else "Negative"
# Model hyper-parameters
embed_size, hidden_size = 128, 256

# Build the classifier with one embedding row per vocabulary entry and move it
# to the selected device.
sentinn = SentiNN(myVocab.nwords, embed_size, hidden_size).to(device)

# Loss function, learning rate, and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
lr = 0.001
opt = optim.Adam(sentinn.parameters(), lr=lr)

In [None]:
def train_one_epoch(model=None, dataloader=None, criterion=None, optimizer=None, target_device=None):
    """Run one optimisation pass over the training data.

    Every argument is optional; when omitted, the notebook-level objects
    ``sentinn``, ``train_dataloader``, ``loss_fn``, ``opt`` and ``device`` are
    used, so the original zero-argument call keeps working.

    Args:
        model: Module mapping a batch of token ids to class logits.
        dataloader: Iterable of ``(reviews_ids, sentiments)`` batches.
        criterion: Loss taking ``(logits, sentiments)``.
        optimizer: Optimizer updating ``model``'s parameters.
        target_device: Device each batch is moved to.

    Returns:
        ``(epoch_loss, epoch_acc)``: per-sample average loss and accuracy in
        percent, both rounded to 4 decimals.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model = sentinn if model is None else model
    dataloader = train_dataloader if dataloader is None else dataloader
    criterion = loss_fn if criterion is None else criterion
    optimizer = opt if optimizer is None else optimizer
    target_device = device if target_device is None else target_device

    model.train()
    total_loss = 0.0
    num_correct = 0
    num_samples = 0

    for reviews_ids, sentiments in dataloader:
        reviews_ids = reviews_ids.to(target_device)
        sentiments = sentiments.to(target_device)

        logits = model(reviews_ids)
        loss = criterion(logits, sentiments)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Count the samples actually seen instead of assuming a fixed batch
        # size, so the metrics stay correct if the loader's batch size changes
        # or the last batch is smaller.
        n_in_batch = sentiments.shape[0]
        # Weighting by batch size yields a per-sample average; this assumes the
        # criterion returns the batch mean (the CrossEntropyLoss default).
        total_loss += loss.item() * n_in_batch
        num_correct += (torch.argmax(logits, dim=1) == sentiments).sum().item()
        num_samples += n_in_batch

    if num_samples == 0:
        raise ValueError("train_one_epoch: the dataloader yielded no samples")

    epoch_loss = round(total_loss / num_samples, 4)
    epoch_acc = round(num_correct / num_samples * 100, 4)
    return epoch_loss, epoch_acc
def eval_one_epoch(model=None, dataloader=None, criterion=None, target_device=None):
    """Evaluate the model over the held-out data without updating it.

    Every argument is optional; when omitted, the notebook-level objects
    ``sentinn``, ``test_dataloader``, ``loss_fn`` and ``device`` are used, so
    the original zero-argument call keeps working.

    Args:
        model: Module mapping a batch of token ids to class logits.
        dataloader: Iterable of ``(reviews_ids, sentiments)`` batches.
        criterion: Loss taking ``(logits, sentiments)``.
        target_device: Device each batch is moved to.

    Returns:
        ``(epoch_loss, epoch_acc)``: per-sample average loss and accuracy in
        percent, both rounded to 4 decimals.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model = sentinn if model is None else model
    dataloader = test_dataloader if dataloader is None else dataloader
    criterion = loss_fn if criterion is None else criterion
    target_device = device if target_device is None else target_device

    model.eval()
    total_loss = 0.0
    num_correct = 0
    num_samples = 0

    # No gradients are needed for evaluation; disabling them avoids building
    # an autograd graph for every batch.
    with torch.no_grad():
        for reviews_ids, sentiments in dataloader:
            reviews_ids = reviews_ids.to(target_device)
            sentiments = sentiments.to(target_device)

            logits = model(reviews_ids)
            loss = criterion(logits, sentiments)

            # Count the samples actually seen instead of assuming a fixed
            # batch size, so the metrics stay correct for any loader.
            n_in_batch = sentiments.shape[0]
            # Weighting by batch size yields a per-sample average; this assumes
            # the criterion returns the batch mean (the CrossEntropyLoss default).
            total_loss += loss.item() * n_in_batch
            num_correct += (torch.argmax(logits, dim=1) == sentiments).sum().item()
            num_samples += n_in_batch

    if num_samples == 0:
        raise ValueError("eval_one_epoch: the dataloader yielded no samples")

    epoch_loss = round(total_loss / num_samples, 4)
    epoch_acc = round(num_correct / num_samples * 100, 4)
    return epoch_loss, epoch_acc

n_epochs = 30

# Alternate one training pass and one evaluation pass per epoch, logging both.
for epoch_number in range(1, n_epochs + 1):
    print("Epoch=", epoch_number, sep="", end=", ")

    train_loss, train_acc = train_one_epoch()
    print("Train Loss=", train_loss, "Train Acc", train_acc)

    eval_loss, eval_acc = eval_one_epoch()
    print("Eval Loss=", eval_loss, "Eval Acc", eval_acc)

Epoch=1, Train Loss= 0.7085 Train Acc 50.9556
Eval Loss= 0.7088 Eval Acc 53.8
Epoch=2, Train Loss= 0.6912 Train Acc 56.1333
Eval Loss= 0.701 Eval Acc 56.4
Epoch=3, Train Loss= 0.6555 Train Acc 61.5333
Eval Loss= 0.7153 Eval Acc 52.2
Epoch=4, Train Loss= 0.5939 Train Acc 68.0
Eval Loss= 0.7059 Eval Acc 60.6
Epoch=5, Train Loss= 0.4789 Train Acc 76.9333
Eval Loss= 0.753 Eval Acc 64.0
Epoch=6, Train Loss= 0.3695 Train Acc 83.9556
Eval Loss= 0.8346 Eval Acc 69.0
Epoch=7, Train Loss= 0.2728 Train Acc 88.1556
Eval Loss= 0.8471 Eval Acc 64.6
Epoch=8, Train Loss= 0.1733 Train Acc 93.6222
Eval Loss= 0.9873 Eval Acc 68.4
Epoch=9, Train Loss= 0.1135 Train Acc 95.6667
Eval Loss= 1.0486 Eval Acc 68.6
Epoch=10, Train Loss= 0.0824 Train Acc 97.1333
Eval Loss= 1.0749 Eval Acc 74.2
Epoch=11, Train Loss= 0.0662 Train Acc 97.5556
Eval Loss= 1.098 Eval Acc 72.4
Epoch=12, Train Loss= 0.0579 Train Acc 98.0667
Eval Loss= 1.3001 Eval Acc 73.4
Epoch=13, Train Loss= 0.0482 Train Acc 98.5333
Eval Loss= 1.3046 Ev

In [None]:
# Sanity check: classify one hand-written review with the trained model.
sample_review = "This was the best movie I have ever seen!"
sentinn.predict(sample_review, myVocab, device)

'Positive'