In [None]:
# -------------------------------
# 1️⃣ Uninstall conflicting versions
# -------------------------------
!pip uninstall -y torch torchtext torchvision torchaudio numpy

# -------------------------------
# 2️⃣ Install compatible versions
# For vanilla RNN + IMDB
# torch 2.3.0, torchtext 0.18.0, torchvision/torchaudio matching
# numpy 1.26.4 (avoids PyTorch errors)
# -------------------------------
!pip install torch==2.3.0 torchtext==0.18.0 torchvision==0.18.0 torchaudio==2.3.0 numpy==1.26.4 --quiet

# -------------------------------
# 3️⃣ Restart runtime (required to load new versions)
# -------------------------------
import os
os.kill(os.getpid(), 9)  # This forces Colab to restart


Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
[0mFound existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[2K   

In [9]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [5]:
df = pd.read_csv('imdb.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
from sklearn.model_selection import train_test_split

trainset, testset = train_test_split(df, test_size = 0.2, random_state = 42)

In [7]:
tokenizer = get_tokenizer('basic_english')

def yeild_tokens(texts):
  for text in texts:
    yield tokenizer(text)

In [8]:
vocab = build_vocab_from_iterator(yeild_tokens(df['review']), specials = ['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])
pad_idx = vocab['<pad>']

label_map = {'negative':0, 'positive':1}

In [10]:
class ReviewDataset(Dataset):
  def __init__(self, df, vocab, tokenizer, label_map):
    self.texts = df['review'].tolist()
    self.labels = df['sentiment'].map(label_map).tolist()
    self.vocab = vocab
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = torch.tensor(self.vocab(self.tokenizer(self.texts[idx])), dtype=torch.long)
    label = torch.tensor(self.labels[idx], dtype = torch.long)
    return text, label

In [12]:
train = ReviewDataset(trainset, vocab, tokenizer, label_map)
test = ReviewDataset(testset, vocab, tokenizer, label_map)

In [13]:
def collate_batch(batch):
  texts, labels = zip(*batch)
  texts = pad_sequence(texts, batch_first=True, padding_value=pad_idx)
  labels = torch.stack(labels)

  return texts, labels

train_loader = DataLoader(train, batch_size =4, shuffle = True, collate_fn=collate_batch)
test_loader = DataLoader(test, batch_size=4, shuffle = False, collate_fn = collate_batch)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [33]:
import torch.nn as nn

class LSTMModel(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim = 2):
    super(LSTMModel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
    self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first = True)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    embedded = self.embedding(x)
    output, (hidden, cell) = self.lstm(embedded)
    return self.fc(hidden.squeeze(0))

In [34]:
model = LSTMModel(len(vocab), embed_dim=64, hidden_dim=128).to(device)

In [35]:
from sklearn.metrics import accuracy_score
import torch.optim as optim

In [36]:
loss_fn = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr = 0.001)

In [37]:
for epochs in range(10):
  total_loss = 0
  train_pred, labels_pred = [], []
  for texts, labels in train_loader:
    texts, labels = texts.to(device), labels.to(device)

    opt.zero_grad()
    pred = model(texts)

    loss = loss_fn(pred, labels)
    loss.backward()
    opt.step()

    total_loss = loss.item()

    preds = torch.argmax(pred, dim = 1)
    train_pred.extend(preds.cpu().numpy())
    labels_pred.extend(labels.cpu().numpy())

  print(f'Epoch {epochs+1}, Loass : {total_loss/len(train_loader):.4f}')
  acc = accuracy_score(labels_pred, train_pred)
  print('Accuracy :', acc)

Epoch 1, Loass : 0.0001
Accuracy : 0.583725
Epoch 2, Loass : 0.0001
Accuracy : 0.74445
Epoch 3, Loass : 0.0000
Accuracy : 0.80885
Epoch 4, Loass : 0.0000
Accuracy : 0.865525
Epoch 5, Loass : 0.0001
Accuracy : 0.91075
Epoch 6, Loass : 0.0000
Accuracy : 0.94005
Epoch 7, Loass : 0.0000
Accuracy : 0.9609
Epoch 8, Loass : 0.0000
Accuracy : 0.97515
Epoch 9, Loass : 0.0000
Accuracy : 0.984425
Epoch 10, Loass : 0.0000
Accuracy : 0.98935


In [38]:
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
  for texts, labels in test_loader:
    texts, labels = texts.to(device), labels.to(device)
    output = model(texts)
    preds = torch.argmax(output, dim =1)
    all_preds.extend(preds.cpu().numpy())
    all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
print('Accuracy :', acc)

Accuracy : 0.98935


In [41]:
text = 'i hate this movie'

tokens = tokenizer(text)
num = vocab(tokens)

input = torch.tensor(num, dtype = torch.long).unsqueeze(0)

input = input.to(device)

model.eval()
with torch.no_grad():
  output = model(input)
  pred = torch.argmax(output, dim=1).item()


label_map_rev = {v: k for k,v in label_map.items()}
print(f'Text : ', text)
print(f'Sentiment : {label_map_rev[pred]}')

Text :  i hate this movie
Sentiment : negative
