In [None]:
!pip install torch==2.2.2 torchtext==0.17.2 torchvision torchaudio
!pip install torchdata==0.7.1 --quiet
!pip install 'portalocker>=2.0.0'
!pip install spacy nltk gensim

In [None]:
from torchtext import _extension
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.datasets import IMDB
from sklearn.model_selection import train_test_split
import spacy
import re
from nltk.corpus import stopwords as nltk_stopwords
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords
from gensim.models import Word2Vec
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# 1. Initial Setup and Preprocessing

# Set seeds for reproducibility.
# NumPy is seeded as well as torch because the batch iterator shuffles the
# data with np.random.permutation; seeding torch alone leaves the batch
# order different on every run.
SEED = 2222
torch.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True

# Load spacy model
# You may need to run: python -m spacy download en_core_web_sm
# The parser and NER components are disabled: only lemmas are used downstream.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Combine stopwords from NLTK and SpaCy into a single lookup set
stop_words = set(nltk_stopwords.words('english')).union(spacy_stopwords)

# Patterns are compiled once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_TOKEN_RE = re.compile(r"\w+'?\w+|\w+")


def preprocess_text(text):
    """Clean, tokenize, and lemmatize one raw review.

    Steps: strip HTML tags, lowercase, extract word tokens (keeping a single
    internal apostrophe, e.g. "don't"), run the tokens through the spaCy
    pipeline ``nlp`` and keep the lemma of every token whose text is not in
    ``stop_words``.

    Args:
        text: Raw review string.

    Returns:
        A list of lemma strings (possibly empty).
    """
    # Replace each tag with a space rather than deleting it: removing a tag
    # such as "<br />" outright fuses the words on either side of it
    # ("great<br />film" would become the single token "greatfilm").
    text = _HTML_TAG_RE.sub(' ', text)
    tokens = _TOKEN_RE.findall(text.lower())
    # Stop-word filtering is done on the surface form, the output uses the lemma.
    return [
        token.lemma_ for token in nlp(" ".join(tokens)) if token.text not in stop_words
    ]

In [None]:
# 2. Load and Process the Dataset

print("Loading and processing IMDb dataset...")
train_iter, test_iter = IMDB(split=('train', 'test'))

# Gather the raw (label, text) pairs of both splits, train first, then test.
all_samples = list(train_iter)
all_samples.extend(test_iter)

# Convert labels from [1, 2] to [0 (negative), 1 (positive)]
labels = [raw_label - 1 for raw_label, _ in all_samples]

# Preprocess every review (this will take a while)
reviews = [preprocess_text(raw_text) for _, raw_text in all_samples]

print(f"Processed {len(reviews)} reviews.")
print("-" * 20)

Loading and processing IMDb dataset...
Processed 50000 reviews.
--------------------


In [None]:
# 3. Train Word Embeddings using Word2Vec

print("Training Word2Vec model...")
embedding_dimension = 100
word2vec_model = Word2Vec(
    sentences=reviews,
    vector_size=embedding_dimension,
    window=5,
    min_count=5,  # Ignore words that appear fewer than 5 times
    workers=4,
    # Cap the *final* vocabulary at the 15,000 most frequent words.
    # `max_vocab_size` is not a vocabulary cap: it is a RAM limit that prunes
    # rare words (and discards their counts) while the corpus is still being
    # scanned, so words that do reach min_count can be lost.
    max_final_vocab=15000,
    # Seeds the initial vectors; runs with workers > 1 can still differ
    # slightly because of thread scheduling.
    seed=SEED,
)
# Extract keyed vectors and free up memory
word_vectors = word2vec_model.wv
del word2vec_model
print(f"Word2Vec model trained. Vocabulary size: {len(word_vectors.key_to_index)}")
print("-" * 20)

Training Word2Vec model...
Word2Vec model trained. Vocabulary size: 7524
--------------------


In [None]:
# 4. Prepare Data for PyTorch

def word2idx(embedding_model, review):
    """Convert a tokenized review to a tensor of vocabulary indices.

    Tokens that are not in the vocabulary are dropped, so the result may be
    shorter than ``review`` (or empty).

    Args:
        embedding_model: Object exposing a ``key_to_index`` mapping from
            token string to integer index.
        review: Iterable of token strings.

    Returns:
        A 1-D ``torch.long`` tensor of indices, in the order the tokens appear.
    """
    vocab = embedding_model.key_to_index
    index_review = [vocab[word] for word in review if word in vocab]
    # dtype is explicit: torch.tensor([]) would otherwise be float32, which is
    # not a valid index type for an embedding lookup.
    return torch.tensor(index_review, dtype=torch.long)

# Reserve a dedicated all-zero <pad> row in the embedding matrix.
# The guard keeps this cell idempotent when it is re-run in the same kernel.
if '<pad>' not in word_vectors.key_to_index:
    word_vectors.add_vector('<pad>', np.zeros(embedding_dimension))  # Add a padding vector

# Use the index that <pad> actually received. Hard-coding 0 would pad with
# (and mark as padding_idx) whichever real word happens to sit at index 0.
padding_value = word_vectors.key_to_index['<pad>']

# Convert all reviews to their index representations
index_reviews = [word2idx(word_vectors, review) for review in reviews]

# Extract weights from the trained Word2Vec model (now including the <pad> row)
embedding_weights = torch.FloatTensor(word_vectors.vectors)



In [None]:
# 5. Split Data and Define Batch Iterator

# First carve off 20% of the data as a hold-out set ...
X_train, X_temp, y_train, y_temp = train_test_split(
    index_reviews,
    labels,
    random_state=SEED,
    test_size=0.2,
)
# ... then cut that hold-out in half: one half for validation, one for testing.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=SEED,
    test_size=0.5,
)

# Report the size of each split.
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Testing samples: {len(X_test)}")
print("-" * 20)

def iterator_func(X, y, batch_size=64, pad_value=None):
    """Shuffle the data and build a list of padded, length-sorted batches.

    Args:
        X: Sequence of 1-D index tensors, one per review.
        y: Sequence of labels aligned with ``X``.
        batch_size: Maximum number of reviews per batch; the last batch may
            be smaller.
        pad_value: Index used to pad shorter reviews. Defaults to the
            module-level ``padding_value`` when omitted, which keeps existing
            three-argument calls working unchanged.

    Returns:
        A list of dicts, one per batch, with keys:
            "text":   tensor of shape (max_len, batch), padded with ``pad_value``
            "label":  float tensor of shape (batch,)
            "length": int tensor of shape (batch,) holding the unpadded lengths
        Within a batch, rows are ordered by decreasing length.
    """
    if pad_value is None:
        pad_value = padding_value

    size = len(X)
    permutation = np.random.permutation(size)
    iterator = []
    for start in range(0, size, batch_size):
        indices = permutation[start:start + batch_size]

        # Sort by length in descending order: pack_padded_sequence expects
        # sorted lengths by default. Labels travel with their review.
        pairs = sorted(
            ((X[j], y[j]) for j in indices),
            key=lambda pair: len(pair[0]),
            reverse=True,
        )
        batch_texts, batch_labels = zip(*pairs)

        batch = {
            # batch_first=False gives the (seq_len, batch) layout the LSTM expects
            "text": nn.utils.rnn.pad_sequence(
                list(batch_texts), padding_value=pad_value, batch_first=False
            ),
            "label": torch.FloatTensor(list(batch_labels)),
            "length": torch.IntTensor([len(review) for review in batch_texts]),
        }
        iterator.append(batch)
    return iterator

Training samples: 40000
Validation samples: 5000
Testing samples: 5000
--------------------


In [None]:
# 6. Define the RNN Model

class RNN(nn.Module):
    """LSTM sentiment classifier on top of pretrained word embeddings.

    Args:
        input_dim: Vocabulary size. Unused (the embedding takes its size from
            ``embedding_weights``); kept for interface compatibility.
        embedding_dim: Width of each embedding vector; must match the second
            dimension of ``embedding_weights``.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output logits.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability, applied between LSTM layers and to the
            final hidden state.
        embedding_weights: Float tensor of shape (vocab, embedding_dim) loaded
            via ``nn.Embedding.from_pretrained`` (frozen, since ``freeze`` is
            left at its default).
        padding_idx: Index of the padding token. Defaults to the module-level
            ``padding_value`` when omitted.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, embedding_weights, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = padding_value
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, padding_idx=padding_idx)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout is meaningless (and triggers a warning) for a
            # single-layer LSTM.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits of shape (batch, output_dim).

        Args:
            text: Long tensor of token indices, shape (seq_len, batch).
            text_lengths: Unpadded length of each column of ``text``, sorted
                in decreasing order (pack_padded_sequence's default requirement).
        """
        embedded = self.embedding(text)
        # Lengths must live on the CPU for packing.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # Last layer: forward direction is hidden[-2], backward is hidden[-1].
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

In [None]:
# 7. Training Setup

# --- Hyperparameters -------------------------------------------------------
INPUT_DIM = len(word_vectors.key_to_index)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
N_EPOCHS = 5
BATCH_SIZE = 64

# --- Model, loss and optimizer --------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    embedding_weights=embedding_weights,
).to(device)

# Binary classification on raw logits; the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss().to(device)

# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())

def binary_accuracy(preds, y):
    """Compute the fraction of correct predictions in one batch.

    ``preds`` are raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the labels ``y``.
    Returns a scalar tensor.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: Module called as ``model(text, lengths)`` and returning logits
            of shape (batch, 1).
        iterator: Iterable of batch dicts with "text", "length" and "label".
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking (logits, labels); assumed to return the mean
            over the batch, as ``nn.BCEWithLogitsLoss`` does by default.

    Returns:
        Per-sample average loss and accuracy over the whole epoch.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0
    model.train()
    for batch in iterator:
        text, text_lengths = batch["text"].to(device), batch["length"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size: averaging the per-batch means would
        # give a short final batch the same influence as a full one.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        acc_sum += acc.item() * batch_size
        n_samples += batch_size
    return loss_sum / n_samples, acc_sum / n_samples

def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating it.

    Args:
        model: Module called as ``model(text, lengths)`` and returning logits
            of shape (batch, 1).
        iterator: Iterable of batch dicts with "text", "length" and "label".
        criterion: Loss taking (logits, labels); assumed to return the mean
            over the batch, as ``nn.BCEWithLogitsLoss`` does by default.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged per sample over all batches.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0
    model.eval()  # disables dropout
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch["text"].to(device), batch["length"].to(device)
            labels = batch["label"].to(device)
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)

            # Weight each batch by its size: averaging the per-batch means
            # would over-weight a short final batch.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            acc_sum += acc.item() * batch_size
            n_samples += batch_size
    return loss_sum / n_samples, acc_sum / n_samples


In [None]:
# 8. Model Training Loop

# Validation and test batches never need reshuffling, so build them once.
valid_iterator = iterator_func(X_val, y_val, BATCH_SIZE)
test_iterator = iterator_func(X_test, y_test, BATCH_SIZE)

print("\nStarting model training...")
for epoch in range(N_EPOCHS):
    # Rebuild the training batches every epoch. iterator_func shuffles when it
    # is called, so building them once would replay the exact same batches in
    # the exact same order in every epoch.
    train_iterator = iterator_func(X_train, y_train, BATCH_SIZE)
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

print("\nTraining finished.")
print("-" * 20)


Starting model training...
Epoch: 01 | Train Loss: 0.481 | Train Acc: 76.83% | Val. Loss: 0.407 | Val. Acc: 81.69%
Epoch: 02 | Train Loss: 0.334 | Train Acc: 85.67% | Val. Loss: 0.327 | Val. Acc: 85.66%
Epoch: 03 | Train Loss: 0.293 | Train Acc: 87.67% | Val. Loss: 0.304 | Val. Acc: 86.00%
Epoch: 04 | Train Loss: 0.268 | Train Acc: 88.94% | Val. Loss: 0.290 | Val. Acc: 87.58%
Epoch: 05 | Train Loss: 0.247 | Train Acc: 89.96% | Val. Loss: 0.284 | Val. Acc: 88.35%

Training finished.
--------------------


In [None]:
# 9. Final Evaluation and Prediction

# Score the held-out test split once, now that training is complete.
test_metrics = evaluate(model, test_iterator, criterion)
test_loss, test_acc = test_metrics
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

def predict_sentiment(sentence, model, word_vectors, device):
    """Return the model's positive-sentiment probability for one sentence.

    Args:
        sentence: Raw text to classify.
        model: Trained classifier called as ``model(indices, lengths)``.
        word_vectors: Object exposing a ``key_to_index`` token-to-index mapping.
        device: Device the model lives on.

    Returns:
        A float in [0, 1]; values above 0.5 indicate positive sentiment.
        Returns 0.5 when no token of the sentence is in the vocabulary.

    Note:
        Switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    vocab = word_vectors.key_to_index
    # Out-of-vocabulary tokens are dropped.
    indexed = [vocab[token] for token in preprocess_text(sentence) if token in vocab]
    if not indexed:
        return 0.5  # Neutral if no words are in vocabulary

    length = torch.LongTensor([len(indexed)])
    # Shape (seq_len, 1): a batch containing a single sequence.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length))
    return prediction.item()

Test Loss: 0.288 | Test Acc: 88.13%


In [None]:
# Example predictions: one clearly positive and one clearly negative review
def _show_prediction(review):
    """Score ``review`` with the trained model, print the verdict, return the score."""
    score = predict_sentiment(review, model, word_vectors, device)
    print(f"Review: '{review}'\nSentiment Score: {score:.4f} ({'Positive' if score > 0.5 else 'Negative'})")
    return score


print("-" * 20)
review1 = "This movie was absolutely fantastic, I loved every minute of it!"
score1 = _show_prediction(review1)

review2 = "It was a complete waste of time, the plot was boring and the acting was terrible."
score2 = _show_prediction(review2)

--------------------
Review: 'This movie was absolutely fantastic, I loved every minute of it!'
Sentiment Score: 0.9794 (Positive)
Review: 'It was a complete waste of time, the plot was boring and the acting was terrible.'
Sentiment Score: 0.0010 (Negative)


In [None]:
# 10. Save Artifacts for Deployment

# Persist the two artifacts needed at deployment time:
# the model weights (state dict only) ...
model_weights = model.state_dict()
torch.save(model_weights, 'sentiment_model.pth')

# ... and the gensim keyed vectors that map tokens to embedding indices.
word_vectors.save('word_vectors.kv')

for message in (
    "Model and word vectors have been saved successfully!",
    "You can now find 'sentiment_model.pth' and 'word_vectors.kv' in the file browser on the left.",
):
    print(message)

Model and word vectors have been saved successfully!
You can now find 'sentiment_model.pth' and 'word_vectors.kv' in the file browser on the left.
