In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the IMDB reviews CSV; the relative path is resolved against the current working directory.
data = pd.read_csv("IMDB_Dataset.csv")

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
data.shape

(50000, 2)

In [9]:
type(data) # pd.read_csv returns a DataFrame regardless of how many columns the file has

pandas.core.frame.DataFrame

In [11]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [13]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [15]:
# This is supervised learning (Classification) as there is a labeled dataset

In [17]:
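# One-hot encoding represents each category (or word) as a vector in which
# only one element is set to one and all others are zero,
# based on its position in the vocabulary.
# Label encoding instead maps each category to a single integer;
# the positive -> 1 / negative -> 0 mapping used below is label encoding.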

In [19]:
# Positive -> 1, Negative -> 0
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run:
# labels that are already encoded are left unchanged instead of becoming NaN.
sentiment_codes = {"positive": 1, "negative": 0, 1: 1, 0: 0}
# map + astype gives an explicit int64 column. DataFrame.replace relied on pandas'
# silent object -> int downcasting, which is deprecated and raises a FutureWarning.
# Any unexpected label becomes NaN, so astype fails loudly rather than letting
# stray strings reach the model.
data["sentiment"] = data["sentiment"].map(sentiment_codes).astype("int64")
# Only the in-memory DataFrame ('data') is modified; the CSV file that was loaded is untouched.

  data.replace({"sentiment": {"positive" : 1, "negative" : 0}}, inplace=True)


In [21]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [23]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


In [None]:
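# LSTM -> Long Short-Term Memory (a gated, improved variant of the RNN that retains information over long sequences)
# RNN -> Recurrent Neural Network (simpler; can be sufficient for short sequences and smaller datasets, but tends to forget early tokens in long texts)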

In [25]:
!pip install tensorflow



In [27]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Hold out 20% of the reviews for the final evaluation; the remaining 80% is used for training
# (an 80/20 split is a commonly accepted default).
#   train_data -> rows the model learns from
#   test_data  -> rows the model never sees during training, used only to evaluate it
# random_state controls the shuffle performed before splitting. Fixing it makes the split
# reproducible: you (and others) get the same train/test rows on every run. Without it,
# each run could produce a different split.
# The value 42 is arbitrary - a nod to The Hitchhiker's Guide to the Galaxy - any integer works.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [29]:
train_data.shape

(40000, 2)

In [33]:
test_data.shape

(10000, 2)

In [35]:
## Tokenizer Working Example

# NOTE: the tokenizer expects a list of texts, not a bare string
sample_texts = ["I need a job"]

# Build a throwaway tokenizer and learn the vocabulary from the sample
test_tokenizer = Tokenizer()
test_tokenizer.fit_on_texts(sample_texts)

# Show the learned word -> index mapping
print("Word Index:", test_tokenizer.word_index)

# Encode the same text as a sequence of word indexes
seq = test_tokenizer.texts_to_sequences(sample_texts)
print("Sequence:", seq)

Word Index: {'i': 1, 'need': 2, 'a': 3, 'job': 4}
Sequence: [[1, 2, 3, 4]]


In [37]:
# Cap the vocabulary used for encoding at the most frequent words (limit set by num_words = 5000).
tokenizer = Tokenizer(num_words = 5000)
# The dataset has 50,000 reviews and may contain tens of thousands of unique words.
# Limiting the vocabulary to the most frequent ones:
# reduces vocabulary size, reduces memory usage, and makes training faster and more stable.
# Fewer words = simpler, faster model, but possibly less expressive.
# More words = more detail, but heavier computation.
# The vocabulary is learned from the training reviews only, so nothing from the test set leaks into it.
tokenizer.fit_on_texts(train_data["review"])

In [38]:
# Step 1: encode every review as a list of integers (word indexes) with the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Step 2: reviews differ in length (some short, some very long), but the network needs
# fixed-length input, so every sequence is padded or truncated to exactly 200 tokens.
# Short reviews are filled with zeros (visible as leading zeros when the arrays are displayed).
# A larger maxlen keeps more information but costs training time and memory;
# a smaller one is faster but risks losing context.
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [39]:
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]])

In [40]:
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]])

In [42]:
# Targets: the 0/1 sentiment label of each review, taken from the same split as the inputs.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [43]:
Y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [44]:
# BUILDING THE LSTM MODEL

In [45]:
# A Sequential model is a plain stack of layers: data flows through them one after another,
# in the order they are listed.
model = Sequential([
    # Embedding: instead of treating words as unrelated IDs, each word index is mapped to a
    # trainable 128-dimensional vector, so similar words can end up close together
    # (e.g. "great" and "awesome" near each other, "terrible" far away).
    #   input_dim=5000 -> only the top 5k words are included.
    #   output_dim=128 -> each word is represented in a 128-dim space.
    # Output per review: (200, 128) - 200 tokens, each mapped to 128 numbers.
    Embedding(input_dim=5000, output_dim=128),
    # LSTM (Long Short-Term Memory): a recurrent layer that reads the sequence word by word,
    # keeping important information and forgetting the rest.
    #   128                   -> number of units, i.e. the size of the hidden state ("memory vector").
    #   dropout=0.2           -> 20% of the inputs are randomly dropped (helps prevent overfitting).
    #   recurrent_dropout=0.2 -> 20% dropout on the recurrent connections (the memory links).
    # Output per review: (128,) - a single vector summarizing the whole review.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Dense(1): one fully connected neuron, because only one thing is predicted - the sentiment.
    # It takes the 128 LSTM features and outputs a single number; the sigmoid squashes that
    # number into [0, 1] so it can be read as a probability for binary classification.
    # Output ~ 0.95 -> confident it is positive. Output ~ 0.08 -> confident it is negative.
    Dense(1, activation="sigmoid"),
])

In [46]:
# Build the model for inputs of shape (batch, 200) before calling summary(),
# then print the layer-by-layer overview.
model.build(input_shape=(None, 200))  # None = batch size, 200 = sequence length
model.summary()

In [55]:
# Compiling
# optimizer="adam"           -> the algorithm that updates the weights.
# loss="binary_crossentropy" -> the error measure for a two-class problem with a single sigmoid output.
# metrics=["accuracy"]       -> reported alongside the loss during training and evaluation.
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics=["accuracy"])

In [57]:
# Fitting
model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split = 0.2)
# An epoch = 1 full pass through the training data.
# X_train holds 40,000 reviews; validation_split=0.2 sets 8,000 of them aside for validation,
# so each epoch trains on 32,000 reviews = 500 batches of 64 (the "500/500" in the log below).
# 5 epochs = the model sees those 32,000 reviews five times (it usually, but not always, improves each time).
# Each batch = 64 reviews. Updating weights after every single review would be too noisy and too slow.
# So, for each batch, the model:
# looks at 64 reviews, predicts, calculates the average error (loss) over those 64, and updates the weights once.

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 293ms/step - accuracy: 0.7874 - loss: 0.4608 - val_accuracy: 0.7876 - val_loss: 0.4657
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 293ms/step - accuracy: 0.8372 - loss: 0.3782 - val_accuracy: 0.8422 - val_loss: 0.3832
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 288ms/step - accuracy: 0.8742 - loss: 0.3075 - val_accuracy: 0.8701 - val_loss: 0.3226
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 261ms/step - accuracy: 0.8898 - loss: 0.2768 - val_accuracy: 0.8566 - val_loss: 0.3430
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 268ms/step - accuracy: 0.9073 - loss: 0.2330 - val_accuracy: 0.8691 - val_loss: 0.3268


<keras.src.callbacks.history.History at 0x255292be720>

In [58]:
# Compile = Set up training rules → optimizer, loss, metrics.
# Fit = Train the model → feed data, adjust weights, improve predictions.

In [None]:
# Save Model
# Persist the trained model to disk so it can be reloaded later without retraining.
model.save("my_model.keras")

In [None]:
# Saving the tokenizer
# The fitted tokenizer must be saved together with the model: new reviews have to be
# encoded with the same word -> index mapping the model was trained on.
# NOTE(review): joblib serializes with pickle, so only load tokenizer.pkl from a trusted source.
# NOTE(review): consider moving this import up to the notebook's main import cell.
import joblib
joblib.dump(tokenizer, "tokenizer.pkl")

In [None]:
# Score the trained model on the held-out test set. The two returned values are the
# compiled loss (binary cross-entropy) and the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test, Y_test)
# model.evaluate() tests the trained model on new data not used for training.

In [None]:
print(loss)

In [None]:
print(accuracy)

In [None]:
## Building a Predictive System

In [None]:
def predictive_system(review):
    """Classify a single movie review as "positive" or "negative".

    The review is encoded with the notebook-level ``tokenizer``, padded or
    truncated to 200 tokens, and scored by the notebook-level trained ``model``.

    Args:
        review: The review text to classify (a single string).

    Returns:
        "positive" if the model's output is greater than 0.5, otherwise "negative".
    """
    # The tokenizer works on a list of texts, so the single review is wrapped in a list;
    # each known word is converted to its integer ID.
    encoded = tokenizer.texts_to_sequences([review])
    # The network expects fixed-length input, so pad (or truncate) to 200 tokens.
    model_input = pad_sequences(encoded, maxlen=200)
    # predict returns one row per input; row 0, column 0 is the sigmoid output,
    # a probability between 0 and 1.
    score = model.predict(model_input)[0][0]
    if score > 0.5:
        return "positive"
    return "negative"

In [None]:
predictive_system("This movie was great")