In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [4]:
import torch
# Record the installed PyTorch build so the run can be reproduced later.
torch_version = torch.__version__
print(torch_version)

2.6.0+cu124


In [14]:
import pandas as pd
# Load the IMDB reviews CSV (columns: review, sentiment) and show a preview.
reviews_csv_path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(reviews_csv_path)
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [15]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

# Count how often each token occurs across every review in the corpus.
# Iterating the 'review' column directly avoids DataFrame.iterrows(), which
# builds a full Series object for every row just to read one field.
token_counts = Counter()
for review in df['review']:
    token_counts.update(tokenizer(review))

print('Vocab-size:', len(token_counts))


Vocab-size: 104182


In [16]:
# Rank tokens from most to least frequent; ids 0 and 1 are reserved for the
# padding and unknown-token markers, so real tokens start at id 2.
vocab = sorted(token_counts, key=lambda tok: token_counts.get(tok), reverse=True)
int2word = {position: tok for position, tok in enumerate(vocab, start=2)}
int2word.update({0: '<pad>', 1: '<unk>'})
# Invert the mapping to look up an integer id from a token string.
word2int = {tok: position for position, tok in int2word.items()}

for probe in ('<pad>', '<unk>', 'the', 'and'):
    print(word2int[probe])

0
1
2
3


In [17]:
# Sanity check: encode a few common words with the vocabulary mapping.
example_tokens = ['this', 'is', 'an', 'example']
print([word2int[tok] for tok in example_tokens])

[11, 7, 35, 472]


In [18]:
from torch.utils.data import Dataset
class IMDBDatasetFromDataFrame(Dataset):
    """Map-style dataset that encodes reviews from a DataFrame on access.

    Each item is read positionally: column 0 is the review text and column 1
    is the sentiment string. The review is tokenized with the module-level
    ``tokenizer`` and encoded with the module-level ``word2int`` mapping.

    Args:
        dataframe: DataFrame whose first column holds the review text and
            whose second column holds the sentiment label.
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        """Return the number of rows (reviews) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the review at position ``idx``.

        Returns:
            token_ids: 1-D ``torch.int64`` tensor of vocabulary ids.
            label: ``1.0`` when the sentiment is ``'positive'``, else ``0.0``.
        """
        review = self.df.iloc[idx, 0]
        sentiment = self.df.iloc[idx, 1]

        review_tokens = tokenizer(review)
        # Tokens missing from the vocabulary map to the reserved '<unk>' id
        # instead of raising KeyError, so text not seen when the vocabulary
        # was built can still be encoded.
        unk_id = word2int['<unk>']
        token_ids = [word2int.get(token, unk_id) for token in review_tokens]
        processed_text = torch.tensor(token_ids, dtype=torch.int64)

        label = 1. if sentiment == 'positive' else 0.

        return processed_text, label

# Wrap the full review DataFrame; items are tokenized and encoded on access.
dataset = IMDBDatasetFromDataFrame(dataframe=df)
dataset_size = len(dataset)
print(dataset_size)

50000


In [19]:
from torch.utils.data.dataset import random_split
# Seed the global RNG so the random splits below are reproducible.
torch.manual_seed(1)
# list(dataset) materialises every (token_ids, label) pair once up front,
# so reviews are not re-tokenized each time a sample is drawn.
encoded_samples = list(dataset)
# First carve off 20,000 test samples, then divide the remaining 30,000
# into 25,000 training and 5,000 validation samples.
temp_dataset, test_dataset = random_split(encoded_samples, [30000, 20000])
train_dataset, valid_dataset = random_split(list(temp_dataset), [25000, 5000])

for split in (train_dataset, valid_dataset, test_dataset):
    print(len(split))

25000
5000
20000


In [27]:
import torch.nn as nn

def collate_batch(batch):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Args:
        batch: Iterable of ``(text_tensor, label)`` pairs, where each
            ``text_tensor`` is a 1-D tensor of token ids.

    Returns:
        A tuple ``(padded_text_list, label_list, lengths)``:
          * ``padded_text_list`` - sequences stacked batch-first and padded
            with ``pad_sequence``'s default value (0) to the longest length;
          * ``label_list`` - tensor built from the labels;
          * ``lengths`` - tensor holding each sequence's unpadded length.
    """
    samples = list(batch)
    sequences = [seq for seq, _ in samples]
    targets = torch.tensor([target for _, target in samples])
    seq_lengths = torch.tensor([seq.size(0) for seq in sequences])
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return padded, targets, seq_lengths

In [30]:
from torch.utils.data import DataLoader
batch_size = 32

# Only the training loader shuffles; validation and test keep a fixed order.
# Every loader pads its batches through collate_batch.
train_dl = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_dl = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)
test_dl = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)


In [45]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [46]:
class RNN(nn.Module):
    """LSTM-based binary classifier over padded token-id sequences.

    Pipeline: embedding -> LSTM -> linear -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        rnn_hidden_size: Hidden-state size of the LSTM.
        fc_hidden_size: Width of the hidden fully connected layer.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        # Index 0 is the padding id; its embedding receives no gradient updates.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.rnn = nn.LSTM(
            input_size=embed_dim, hidden_size=rnn_hidden_size, batch_first=True
        )
        self.fc1 = nn.Linear(in_features=rnn_hidden_size, out_features=fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=fc_hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Return one sigmoid output per sequence, shaped ``(batch, 1)``.

        Args:
            text: Batch-first tensor of padded token ids.
            lengths: Tensor with the unpadded length of every sequence.
        """
        embedded = self.embedding(text)
        # Packing lets the LSTM skip the padded positions; the lengths are
        # moved to the CPU before being handed to pack_padded_sequence.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu().numpy(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (hidden, _) = self.rnn(packed)
        # Take the final hidden state of the last LSTM layer.
        final_state = hidden[-1]
        features = self.relu(self.fc1(final_state))
        return self.sigmoid(self.fc2(features))

# Model hyperparameters; the embedding table gets one row per vocabulary id.
vocab_size = len(word2int)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

# Seed before construction so the initial weights are reproducible.
torch.manual_seed(1)
model = RNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_hidden_size=rnn_hidden_size,
    fc_hidden_size=fc_hidden_size,
)
model.to(device)
print(model)

RNN(
  (embedding): Embedding(104184, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [47]:
def train(dataloader):
    """Run one optimisation pass over ``dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``.

    Args:
        dataloader: Yields ``(text_batch, label_batch, lengths)`` tuples and
            exposes a ``dataset`` attribute supporting ``len``.

    Returns:
        ``(accuracy, mean_loss)``, both averaged over ``len(dataloader.dataset)``.
    """
    model.train()
    correct_count = 0
    loss_sum = 0
    for tokens, targets, seq_lengths in dataloader:
        tokens = tokens.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        probs = model(tokens, seq_lengths)[:, 0]
        batch_loss = loss_fn(probs, targets)
        batch_loss.backward()
        optimizer.step()

        # Threshold at 0.5 and count matches against the labels.
        hits = (probs >= 0.5).float() == targets
        correct_count += hits.float().sum().item()
        # Weight the batch loss by batch size so the final division by the
        # dataset size gives a per-sample average.
        loss_sum += batch_loss.item() * targets.size(0)

    n_samples = len(dataloader.dataset)
    return correct_count / n_samples, loss_sum / n_samples

In [48]:
def evaluate(dataloader):
    """Score the module-level ``model`` on ``dataloader`` without training.

    Uses the module-level ``model``, ``loss_fn`` and ``device``; the model is
    put in eval mode and gradients are disabled.

    Args:
        dataloader: Yields ``(text_batch, label_batch, lengths)`` tuples and
            exposes a ``dataset`` attribute supporting ``len``.

    Returns:
        ``(accuracy, mean_loss)``, both averaged over ``len(dataloader.dataset)``.
    """
    model.eval()
    correct_count = 0
    loss_sum = 0
    with torch.no_grad():
        for tokens, targets, seq_lengths in dataloader:
            tokens = tokens.to(device)
            targets = targets.to(device)

            probs = model(tokens, seq_lengths)[:, 0]
            batch_loss = loss_fn(probs, targets)

            # Threshold at 0.5 and count matches against the labels.
            hits = (probs >= 0.5).float() == targets
            correct_count += hits.float().sum().item()
            loss_sum += batch_loss.item() * targets.size(0)

    n_samples = len(dataloader.dataset)
    return correct_count / n_samples, loss_sum / n_samples

In [49]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
# Re-seed so the shuffling order of the training loader is reproducible.
torch.manual_seed(1)
for epoch in range(num_epochs):
    # One pass of training followed by a validation pass.
    train_metrics = train(train_dl)
    valid_metrics = evaluate(valid_dl)
    acc_train, loss_train = train_metrics
    acc_valid, loss_valid = valid_metrics
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


Epoch 0 accuracy: 0.6296 val_accuracy: 0.7054
Epoch 1 accuracy: 0.7136 val_accuracy: 0.6852
Epoch 2 accuracy: 0.8094 val_accuracy: 0.8284
Epoch 3 accuracy: 0.8585 val_accuracy: 0.6652
Epoch 4 accuracy: 0.8298 val_accuracy: 0.8454
Epoch 5 accuracy: 0.8998 val_accuracy: 0.8396
Epoch 6 accuracy: 0.9176 val_accuracy: 0.8588
Epoch 7 accuracy: 0.9304 val_accuracy: 0.8776
Epoch 8 accuracy: 0.9390 val_accuracy: 0.8838
Epoch 9 accuracy: 0.9539 val_accuracy: 0.8836


In [50]:
# Final check on the held-out test split; only the accuracy is reported.
test_metrics = evaluate(test_dl)
acc_test, _ = test_metrics
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8728
