In [20]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split


In [22]:
df = pd.read_csv(r'/content/IMDB Dataset.csv')

In [23]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [24]:
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join([w for w in text.split() if w not in stop_words])
    return text

df['cleaned_review'] = df['review'].apply(clean_text)

# Encode labels
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

df[['cleaned_review', 'label']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,cleaned_review,label
0,one reviewers mentioned watching oz episode yo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [25]:
vocab_size = 10000  # top 10k words
max_len = 200       # pad/truncate reviews to 200 words

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_review'])

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])

# Pad sequences
X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Labels
y = np.array(df['label'], dtype=np.int32)

print(X.shape, y.shape)



(50000, 200) (50000,)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)


(40000, 200) (10000, 200)


In [27]:
rnn_model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    SimpleRNN(128),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
rnn_model.summary()




In [28]:
lstm_model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    LSTM(128),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
lstm_model.summary()


In [29]:
gru_model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    GRU(128),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

gru_model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
gru_model.summary()


## rnn

In [30]:
history = rnn_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=32
)


Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 88ms/step - accuracy: 0.5056 - loss: 0.7248 - val_accuracy: 0.4900 - val_loss: 0.6957
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 85ms/step - accuracy: 0.4968 - loss: 0.7048 - val_accuracy: 0.4905 - val_loss: 0.6937
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 92ms/step - accuracy: 0.5055 - loss: 0.6967 - val_accuracy: 0.4988 - val_loss: 0.6957
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 88ms/step - accuracy: 0.5034 - loss: 0.6957 - val_accuracy: 0.5033 - val_loss: 0.6932
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 84ms/step - accuracy: 0.5147 - loss: 0.6946 - val_accuracy: 0.5045 - val_loss: 0.6933


In [31]:
loss, acc = rnn_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc*100:.2f}%")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.5033 - loss: 0.6930
Test Accuracy: 49.62%


# LSTM

In [32]:
history = lstm_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=32
)


Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 333ms/step - accuracy: 0.5069 - loss: 0.6937 - val_accuracy: 0.6185 - val_loss: 0.6650
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 331ms/step - accuracy: 0.5950 - loss: 0.6666 - val_accuracy: 0.5307 - val_loss: 0.6832
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 331ms/step - accuracy: 0.5870 - loss: 0.6295 - val_accuracy: 0.6885 - val_loss: 0.5843
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 328ms/step - accuracy: 0.8139 - loss: 0.4224 - val_accuracy: 0.8752 - val_loss: 0.3027
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 335ms/step - accuracy: 0.9169 - loss: 0.2262 - val_accuracy: 0.8777 - val_loss: 0.3067


In [33]:
loss, acc = lstm_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc*100:.2f}%")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 104ms/step - accuracy: 0.8773 - loss: 0.3180
Test Accuracy: 87.42%


## GRU

In [34]:
history = gru_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=32
)


Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 257ms/step - accuracy: 0.5003 - loss: 0.6938 - val_accuracy: 0.5178 - val_loss: 0.6963
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 246ms/step - accuracy: 0.5652 - loss: 0.6637 - val_accuracy: 0.8720 - val_loss: 0.2983
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 245ms/step - accuracy: 0.8976 - loss: 0.2591 - val_accuracy: 0.8905 - val_loss: 0.2602
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 245ms/step - accuracy: 0.9521 - loss: 0.1417 - val_accuracy: 0.8823 - val_loss: 0.2932
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m273s[0m 243ms/step - accuracy: 0.9781 - loss: 0.0746 - val_accuracy: 0.8763 - val_loss: 0.3800


In [35]:
loss, acc = gru_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc*100:.2f}%")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 55ms/step - accuracy: 0.8787 - loss: 0.3962
Test Accuracy: 87.72%
