In [None]:
!pip install kaggle



In [None]:
# Put the Kaggle API token where the kaggle CLI is expected to look for it (~/.kaggle).
# kaggle.json must already be present in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the "IMDB Dataset of 50K Movie Reviews" archive with the Kaggle CLI.
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s]
100% 25.7M/25.7M [00:00<00:00, 1.39GB/s]


In [None]:
# Extract every member of the downloaded archive into the current working directory.
from zipfile import ZipFile

archive_path = "/content/imdb-dataset-of-50k-movie-reviews.zip"
with ZipFile(archive_path, mode="r") as archive:
    archive.extractall()

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Read the extracted reviews CSV into a DataFrame.
csv_path = '/content/IMDB Dataset.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Hand-picked stopword list removed from every review before tokenization.
# NOTE(review): it contains the negations "no", "nor" and "not"; removing them
# turns "not good" into "good" — confirm this is intended for sentiment analysis.
manual_stopwords = {
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "has", "have", "had",
    "do", "does", "did", "but", "if", "or", "because", "as", "what", "which", "this",
    "that", "these", "those", "then", "there", "here", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "some", "such", "no", "nor", "too",
    "very", "can", "will", "just", "not", "in", "on", "at", "by", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above",
    "below", "to", "from", "up", "down", "out", "off", "over", "under", "again",
    "further", "once", "of", "for", "and",
}


def clean_text_manual(text):
    """Normalize one raw review into a space-separated string of lowercase words.

    Steps: lowercase, replace HTML ``<br>`` tags with a space, drop apostrophes
    (so contractions stay one token: "doesn't" -> "doesnt"), turn every other
    non-letter character into a space, then remove ``manual_stopwords``.

    Args:
        text: the raw review as a ``str``.

    Returns:
        The cleaned text; an empty string if nothing survives the filtering.
    """
    text = text.lower()
    text = re.sub(r"<br\s*/?>", " ", text)
    # Apostrophes are deleted (not spaced) so "there's" stays "theres".
    text = re.sub(r"['’]", "", text)
    # Every other non-letter becomes a SPACE. Deleting it outright fused words
    # that were separated only by punctuation ("romance...oz" -> "romanceoz").
    text = re.sub(r"[^a-z\s]", " ", text)
    # split() collapses any run of whitespace, so no separate squeeze/strip is needed.
    return " ".join(word for word in text.split() if word not in manual_stopwords)

# Add a cleaned copy of every review, then show raw and cleaned text side by side.
df['clean_review'] = df['review'].map(clean_text_manual)
df.loc[:, ['review', 'clean_review']].head()

Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one other reviewers mentioned watching oz epis...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,i thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


In [None]:
# Print the first ten cleaned reviews, numbered from 1.
preview = df['clean_review'].iloc[:10]
for i, review in enumerate(preview, start=1):
    print(f"{i}. {review}")

1. one other reviewers mentioned watching oz episode youll hooked they right exactly happened me first thing struck me oz its brutality unflinching scenes violence set right word go trust me show faint hearted timid show pulls punches regards drugs sex violence its hardcore classic use word it called oz nickname given oswald maximum security state penitentary it focuses mainly emerald city experimental section prison cells glass fronts face inwards so privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far away i would say main appeal show due fact it goes other shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode i ever saw struck me so nasty it surreal i couldnt say i ready it i watched i developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold n

In [None]:
# Tokenization, padding and train / validation / test split
# (Tokenizer, pad_sequences and train_test_split are imported in the imports cell.)

max_words = 10000  # vocabulary size kept by the tokenizer
max_len = 100      # every review is padded / truncated to this many token ids

# Binary target: 1 for 'positive', 0 for anything else.
y = (df['sentiment'] == 'positive').astype(int).values

# Split row indices first so the tokenizer can be fitted on training reviews only.
# The same sizes and random_state as before yield the same row partition.
all_idx = np.arange(len(df))
train_idx, test_idx = train_test_split(all_idx, test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Fit the vocabulary on the training reviews ONLY: fitting on the full corpus
# lets validation/test text influence the word index (data leakage).
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_review'].iloc[train_idx])

# Encode every review with the train-fitted vocabulary; unseen words map to <OOV>.
sequences = tokenizer.texts_to_sequences(df['clean_review'])
X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (36000, 100) (36000,)
Validation shape: (4000, 100) (4000,)
Test shape: (10000, 100) (10000,)


## 📊 SimpleRNN model: the simplest type of recurrent network. Best used as a starting baseline or for short texts.

In [None]:
# Sequential, Embedding, Dense, Dropout and EarlyStopping come from the imports cell;
# only the names that cell does not provide are imported here.
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.regularizers import l2

# Build SimpleRNN model.
# input_dim is tied to the tokenizer's max_words so the two cannot drift apart.
# The deprecated `input_length` argument is omitted; the sequence length is
# inferred from the data on the first fit call.
rnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64),
    SimpleRNN(64, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile
rnn_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# Early stopping: stop after 2 epochs without val_loss improvement, keep best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train.
# Validate on the dedicated (X_val, y_val) split instead of validation_split,
# which would remove a further 20% of X_train and leave X_val unused.
history_rnn = rnn_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/6




[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 44ms/step - accuracy: 0.4970 - loss: 0.8573 - val_accuracy: 0.4970 - val_loss: 0.7598
Epoch 2/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 43ms/step - accuracy: 0.4956 - loss: 0.7668 - val_accuracy: 0.4936 - val_loss: 0.7583
Epoch 3/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 42ms/step - accuracy: 0.5059 - loss: 0.7591 - val_accuracy: 0.4964 - val_loss: 0.7584
Epoch 4/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 41ms/step - accuracy: 0.5031 - loss: 0.7592 - val_accuracy: 0.5085 - val_loss: 0.7582
Epoch 5/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 41ms/step - accuracy: 0.5034 - loss: 0.7591 - val_accuracy: 0.5054 - val_loss: 0.7589
Epoch 6/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 42ms/step - accuracy: 0.4949 - loss: 0.7602 - val_accuracy: 0.4958 - val_loss: 0.7598


<keras.src.callbacks.history.History at 0x78a95a23e610>

In [None]:
# Score the trained SimpleRNN on the held-out test split and report its accuracy.
rnn_test_metrics = rnn_model.evaluate(X_test, y_test)
loss, accuracy = rnn_test_metrics
print(f"SimpleRNN Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5074 - loss: 0.7583
SimpleRNN Test Accuracy: 0.5080


## 📊 LSTM model: a more robust model that mitigates the forgetting problem of SimpleRNN. Suitable for long sequences.

In [None]:
# Step 1: Build the LSTM Model

from tensorflow.keras.regularizers import l2

# Define the LSTM model.
# input_dim is tied to the tokenizer's max_words so the two cannot drift apart.
# The deprecated `input_length` argument is omitted; the sequence length is
# inferred from the data on the first fit call.
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=32),
    Dropout(0.5),
    LSTM(32, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.001)),
    Dense(1, activation='sigmoid')
])

# Compile the model
lstm_model.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

# Early stopping: stop after 3 epochs without val_loss improvement, keep best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model, validating on the dedicated validation split.
history_lstm = lstm_model.fit(X_train, y_train,
                              epochs=10,
                              batch_size=64,
                              validation_data=(X_val, y_val),
                              callbacks=[early_stop],
                              verbose=1)

# Evaluate on test data
test_loss_lstm, test_acc_lstm = lstm_model.evaluate(X_test, y_test, verbose=0)

test_acc_lstm



Epoch 1/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 321ms/step - accuracy: 0.5333 - loss: 0.7059 - val_accuracy: 0.8000 - val_loss: 0.5429
Epoch 2/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 317ms/step - accuracy: 0.6968 - loss: 0.6154 - val_accuracy: 0.7527 - val_loss: 0.5574
Epoch 3/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 323ms/step - accuracy: 0.7262 - loss: 0.5897 - val_accuracy: 0.8090 - val_loss: 0.4896
Epoch 4/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 326ms/step - accuracy: 0.7596 - loss: 0.5460 - val_accuracy: 0.8040 - val_loss: 0.4942
Epoch 5/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 318ms/step - accuracy: 0.7682 - loss: 0.5349 - val_accuracy: 0.8058 - val_loss: 0.4688
Epoch 6/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 316ms/step - accuracy: 0.7940 - loss: 0.4884 - val_accuracy: 0.8310 - val_loss: 0.4161
Epoc

0.8361999988555908

In [None]:
# Report the LSTM test accuracy computed in the training cell, to four decimals.
print(f"LSTM Test Accuracy: {test_acc_lstm:.4f}")

LSTM Test Accuracy: 0.8362


## 📊 GRU model: lighter and faster than LSTM, while achieving comparable accuracy in many cases.

In [None]:
# Sequential, Embedding, GRU, Dropout, Dense and EarlyStopping come from the imports cell.
# l2 is imported here so this cell does not depend on an earlier model cell having run.
from tensorflow.keras.regularizers import l2

# Build GRU model.
# input_dim is tied to the tokenizer's max_words so the two cannot drift apart.
# The deprecated `input_length` argument is omitted; the sequence length is
# inferred from the data on the first fit call.
gru_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64),
    GRU(64, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile
gru_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# Early stopping: stop after 2 epochs without val_loss improvement, keep best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train.
# Validate on the dedicated (X_val, y_val) split instead of validation_split,
# which would remove a further 20% of X_train and leave X_val unused.
history_gru = gru_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 129ms/step - accuracy: 0.5131 - loss: 0.7244 - val_accuracy: 0.7474 - val_loss: 0.5847
Epoch 2/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 128ms/step - accuracy: 0.7021 - loss: 0.6135 - val_accuracy: 0.7560 - val_loss: 0.5541
Epoch 3/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 130ms/step - accuracy: 0.7695 - loss: 0.5504 - val_accuracy: 0.8114 - val_loss: 0.4816
Epoch 4/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 125ms/step - accuracy: 0.7980 - loss: 0.5096 - val_accuracy: 0.7865 - val_loss: 0.4993
Epoch 5/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 127ms/step - accuracy: 0.8030 - loss: 0.4861 - val_accuracy: 0.7879 - val_loss: 0.4815
Epoch 6/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 125ms/step - accuracy: 0.8174 - loss: 0.4646 - val_accuracy: 0.8191 - val_loss: 0.4569


<keras.src.callbacks.history.History at 0x78a958bc02d0>

In [None]:
# Score the trained GRU on the held-out test split and report its accuracy.
gru_test_metrics = gru_model.evaluate(X_test, y_test)
loss, accuracy = gru_test_metrics
print(f"GRU Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.8233 - loss: 0.4462
GRU Test Accuracy: 0.8228


## 📊 Bidirectional LSTM model: processes the sequence in both the forward and backward directions, helping it capture the full context of the sentence.

In [None]:
# l2 is imported here so this cell does not depend on an earlier model cell having run.
from tensorflow.keras.regularizers import l2

# Build Bidirectional LSTM model.
# input_dim is tied to the tokenizer's max_words so the two cannot drift apart.
# The deprecated `input_length` argument is omitted; the sequence length is
# inferred from the data on the first fit call.
bilstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64),
    Bidirectional(LSTM(64, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.001))),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile
bilstm_model.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

# Early stopping: stop after 2 epochs without val_loss improvement, keep best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train.
# Validate on the dedicated (X_val, y_val) split instead of validation_split,
# which would remove a further 20% of X_train and leave X_val unused.
history_bilstm = bilstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 272ms/step - accuracy: 0.6303 - loss: 0.6919 - val_accuracy: 0.8241 - val_loss: 0.4246
Epoch 2/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 270ms/step - accuracy: 0.8254 - loss: 0.4322 - val_accuracy: 0.8478 - val_loss: 0.3968
Epoch 3/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 269ms/step - accuracy: 0.8541 - loss: 0.3823 - val_accuracy: 0.8263 - val_loss: 0.4444
Epoch 4/6
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 267ms/step - accuracy: 0.8630 - loss: 0.3645 - val_accuracy: 0.8335 - val_loss: 0.4131


<keras.src.callbacks.history.History at 0x78a958884f10>

In [None]:
# Score the trained Bidirectional LSTM on the held-out test split and report its accuracy.
bilstm_test_metrics = bilstm_model.evaluate(X_test, y_test)
loss, accuracy = bilstm_test_metrics
print(f"Bidirectional LSTM Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 63ms/step - accuracy: 0.8593 - loss: 0.3759
Bidirectional LSTM Test Accuracy: 0.8549


## 📊 CNN + LSTM model: the CNN captures local patterns and the LSTM models the sequence. It combines the strengths of both and delivers strong performance.

In [None]:
# l2 is imported here so this cell does not depend on an earlier model cell having run.
from tensorflow.keras.regularizers import l2

# Build CNN + LSTM model: Conv1D + max-pooling in front of an LSTM.
# input_dim is tied to the tokenizer's max_words so the two cannot drift apart.
# The deprecated `input_length` argument is omitted; the sequence length is
# inferred from the data on the first fit call.
cnn_lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile
cnn_lstm_model.compile(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])

# Early stopping: stop after 2 epochs without val_loss improvement, keep best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train.
# Validate on the dedicated (X_val, y_val) split instead of validation_split,
# which would remove a further 20% of X_train and leave X_val unused.
history_cnn_lstm = cnn_lstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/6
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 107ms/step - accuracy: 0.5987 - loss: 0.6735 - val_accuracy: 0.8246 - val_loss: 0.4306
Epoch 2/6
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 109ms/step - accuracy: 0.8525 - loss: 0.3961 - val_accuracy: 0.8482 - val_loss: 0.3824
Epoch 3/6
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 101ms/step - accuracy: 0.8840 - loss: 0.3190 - val_accuracy: 0.8465 - val_loss: 0.4108
Epoch 4/6
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 106ms/step - accuracy: 0.9095 - loss: 0.2675 - val_accuracy: 0.8471 - val_loss: 0.4283


<keras.src.callbacks.history.History at 0x7eb8d4bbed10>

In [None]:
# Score the trained CNN + LSTM on the held-out test split and report its accuracy.
cnn_lstm_test_metrics = cnn_lstm_model.evaluate(X_test, y_test)
loss, accuracy = cnn_lstm_test_metrics
print(f"CNN + LSTM Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - accuracy: 0.8521 - loss: 0.3703
CNN + LSTM Test Accuracy: 0.8518


In [None]:
# Persist the CNN + LSTM model to disk in the native .keras format.
# NOTE(review): in the recorded outputs the Bidirectional LSTM scored slightly
# higher on the test set (0.8549 vs 0.8518) — confirm which model should be kept.
best_model_path = "best_cnn_lstm_model.keras"
cnn_lstm_model.save(best_model_path)

Preprocessing function for new input sentences.

In [None]:
def preprocess_input_sentence(sentence):
    """Convert one raw sentence into the padded id sequence the models were trained on.

    The sentence goes through the same steps as the training reviews:
    ``clean_text_manual`` (lowercasing, tag/punctuation and stopword removal),
    the fitted ``tokenizer``, then post-padding/truncation to ``max_len``.
    Skipping the cleaning step would feed the model tokens (e.g. stopwords)
    it never saw at those positions during training.

    Args:
        sentence: raw input text as a ``str``.

    Returns:
        Integer array of shape ``(1, max_len)``.
    """
    cleaned = clean_text_manual(sentence)
    sequence = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

Load the saved model and use it to classify a sentence.

In [None]:
# Imported from tensorflow.keras to match the rest of the notebook.
from tensorflow.keras.models import load_model

# Cache of models already read from disk, keyed by file path.
# Re-running this cell resets the cache (do so after re-saving a model file).
_loaded_models = {}


def classify_sentence(input_sentence, model_path="best_cnn_lstm_model.keras"):
    """Classify one raw sentence as "Positive" or "Negative".

    The saved model is loaded from ``model_path`` on first use and then reused,
    instead of being deserialized from disk on every call.

    Args:
        input_sentence: raw text to classify.
        model_path: path of the saved Keras model to use.

    Returns:
        "Positive" if the predicted probability is above 0.5, else "Negative".
    """
    if model_path not in _loaded_models:
        _loaded_models[model_path] = load_model(model_path)
    model = _loaded_models[model_path]

    preprocessed_sentence = preprocess_input_sentence(input_sentence)
    # verbose=0 keeps a progress bar per call out of the cell output.
    prediction = model.predict(preprocessed_sentence, verbose=0)[0][0]
    return "Positive" if prediction > 0.5 else "Negative"

Test the model on new sentences.

In [None]:
# A few hand-written sentences to sanity-check the saved classifier.
new_sentences = [
    "The food was fantastic",
    "The movie was terrible",
    "I love this movie it is great one",
    "I hated every second of it",
]

# Classify everything first, then print each sentence next to its label.
predictions = list(map(classify_sentence, new_sentences))

for sentence, prediction in zip(new_sentences, predictions):
    print(f"Sentence: {sentence} → Sentiment: {prediction}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 440ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 361ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 376ms/step
Sentence: The food was fantastic → Sentiment: Positive
Sentence: The movie was terrible → Sentiment: Negative
Sentence: I love this movie it is great one → Sentiment: Positive
Sentence: I hated every second of it → Sentiment: Positive
