In [2]:
import numpy as np
import pandas as pd
import kagglehub
import os

In [3]:
# Fetch the IMDB 50k-review dataset through kagglehub; the call returns the
# local directory that holds the downloaded files.
dataset_path = kagglehub.dataset_download(
    'lakshmi25npathi/imdb-dataset-of-50k-movie-reviews'
)

# The reviews live in a single CSV inside that directory.
csv_file = os.path.join(dataset_path, 'IMDB Dataset.csv')
df = pd.read_csv(csv_file)

# Peek at the first rows to confirm the load worked.
df.head()



Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import json
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Display the frame; the rendered output elides the middle rows with "...".
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
# Confirm the dataset dimensions as (rows, columns).
df.shape

(50000, 2)

In [7]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
# Summary statistics; for these object columns the output reports
# count / unique / top / freq. A `unique` review count below `count`
# indicates duplicated review texts.
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [9]:
# Check the class balance of the target column.
df.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [10]:
# Exploratory only: shows a one-hot view of the labels. The result is displayed
# but not assigned, so `df` is left unchanged by this cell.
pd.get_dummies(df['sentiment'])

Unnamed: 0,negative,positive
0,False,True
1,False,True
2,False,True
3,True,False
4,False,True
...,...,...
49995,False,True
49996,True,False
49997,True,False
49998,True,False


In [11]:
# Encode the labels as integers (positive -> 1, negative -> 0).
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: re-running it
# after the column has already been encoded would otherwise turn every value
# into NaN, because Series.map yields NaN for values missing from the mapping.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0, 1: 1, 0: 0})

# Fail loudly on any label outside the expected set instead of training on NaN.
assert df['sentiment'].notna().all(), "unexpected sentiment label encountered"

In [12]:
# Verify that the sentiment column now holds the integer labels.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [13]:
# Re-check the class balance after encoding to confirm no rows were lost.
df.sentiment.value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# repeatable between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
# Report the (rows, columns) of each partition, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(40000, 2)
(10000, 2)


In [16]:
# Vocabulary cap and fixed sequence length. The Embedding layer's input_dim and
# the maxlen used at prediction time must agree with these values.
NUM_WORDS = 5000
MAX_LEN = 200

# Fit the vocabulary on the training reviews only, so nothing from the test
# split influences preprocessing.
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(train_df.review)

# Backward-compatible alias: other cells refer to the tokenizer by this
# (misspelled) name.
tokernizer = tokenizer

# Convert each review to a list of word ids, then pad/truncate to MAX_LEN.
x_train = pad_sequences(tokenizer.texts_to_sequences(train_df.review), maxlen=MAX_LEN)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_df.review), maxlen=MAX_LEN)

In [17]:
# Make sure the output directory is present before writing into it.
os.makedirs("assets", exist_ok=True)

# NOTE(review): Tokenizer.to_json() is documented to return a JSON string, so
# encoding it a second time stores a JSON *string literal* in the file. A
# consumer would need json.load() first and then tokenizer_from_json() on the
# result — confirm against whatever loads this file.
tokenizer_json = tokernizer.to_json()
with open('assets/tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

print("Tokenizer saved to assets/tokenizer.json")

Tokenizer saved to assets/tokenizer.json


In [18]:
# Inspect the padded id matrix; rows for short reviews start with zeros.
x_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]])

In [19]:
# Targets are the integer-encoded sentiment column of each split.
y_train, y_test = train_df['sentiment'], test_df['sentiment']

In [20]:
# Spot-check the training targets; the index carries the row labels from `df`,
# which are no longer in order after the split.
y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [22]:
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# dropout masks are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Two stacked LSTMs over learned word embeddings, ending in a single sigmoid
# unit whose output is compared against the 0/1 sentiment label.
model = Sequential([
    # 200 token ids per review, matching the maxlen used with pad_sequences.
    tf.keras.layers.Input(shape=(200,)),
    # input_dim=5000 matches Tokenizer(num_words=5000). The sequence length is
    # already fixed by the Input layer, so the deprecated `input_length`
    # argument is not passed.
    Embedding(input_dim=5000, output_dim=128),
    Dropout(0.3),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(128, return_sequences=True),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])



In [23]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [24]:
# Binary-classification set-up: Adam optimiser, cross-entropy on the sigmoid
# output, and accuracy reported during training and evaluation.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
# Sanity-check the feature matrix before training: (samples, sequence length).
x_train.shape

(40000, 200)

In [26]:
# Train for 5 epochs, with 20% of the training data held out for validation.
# The returned History object is kept so the per-epoch loss/accuracy values
# remain available after the cell finishes, instead of being discarded and
# echoed as an opaque repr.
history = model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m795s[0m 2s/step - accuracy: 0.7391 - loss: 0.4961 - val_accuracy: 0.8658 - val_loss: 0.3242
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 908ms/step - accuracy: 0.8936 - loss: 0.2700 - val_accuracy: 0.8781 - val_loss: 0.2994
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 925ms/step - accuracy: 0.9165 - loss: 0.2193 - val_accuracy: 0.8712 - val_loss: 0.3362
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 951ms/step - accuracy: 0.9272 - loss: 0.1931 - val_accuracy: 0.8832 - val_loss: 0.2982
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 1s/step - accuracy: 0.9436 - loss: 0.1573 - val_accuracy: 0.8605 - val_loss: 0.3356


<keras.src.callbacks.history.History at 0x1bb3cbe18a0>

In [27]:
# Persist the trained model to a .keras file. This relies on the "assets"
# directory already existing (it is created in the tokenizer-saving cell).
model.save("assets/sentiment_lstm.keras")
print("Model saved to assets/sentiment_lstm.keras")

Model saved to assets/sentiment_lstm.keras


In [28]:
# Score the model on the held-out test split; the result unpacks into the loss
# followed by the accuracy metric configured at compile time.
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test loss: {loss}", f"Test accuracy: {accuracy}", sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 112ms/step - accuracy: 0.8597 - loss: 0.3270
Test loss: 0.32563573122024536
Test accuracy: 0.8654999732971191


In [29]:
def predict_sentiment(review, threshold=0.5):
    """Classify a single review text as "positive" or "negative".

    The text is tokenised with the notebook-level ``tokernizer``, padded to 200
    tokens (the same length used for training) and scored by the notebook-level
    ``model``.

    Args:
        review: Raw review text (a single string).
        threshold: Score cut-off; predictions strictly above it are labelled
            "positive". Defaults to 0.5, the previously hard-coded value.

    Returns:
        The string "positive" or "negative".
    """
    sequence = tokernizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    # verbose=0 suppresses the per-call progress bar, which is just noise for a
    # single-sample prediction.
    score = model.predict(padded_sequence, verbose=0)[0][0]
    return "positive" if score > threshold else "negative"

In [30]:
# Smoke-test the classifier on a short, clearly positive review.
exp = "This movie was fantastic. I love it."
sentiment = predict_sentiment(review=exp)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 688ms/step
The sentiment of the review is: positive


In [31]:
# Second smoke test: a longer, critical review. The literal is split across
# lines for readability; adjacent string literals concatenate to the same text.
exp = (
    "War 2 tries hard to impress but falls short of the hype. "
    "Despite a star-studded cast with Hrithik Roshan and NTR Jr., the film "
    "suffers from a weak script and predictable twists. "
    "The action, though flashy, feels overdone and lacks emotional depth. "
    "Character development is minimal, and the pacing drags in parts. "
    "It leans too heavily on style over substance, leaving little impact "
    "once the credits roll."
)
sentiment = predict_sentiment(exp)
# Message wording aligned with the first example ("of the review").
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
The sentiment of review is: negative
