In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [19]:
df = pd.read_csv("/kaggle/input/imdb-movie-ratings-sentiment-analysis/movie.csv")

In [20]:
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [21]:
df['text'] = df['text'].str.lower()

In [22]:
df.head()

Unnamed: 0,text,label
0,i grew up (b. 1965) watching and loving the th...,0
1,"when i put this movie in my dvd player, and sa...",0
2,why do people who do not know what a particula...,0
3,even though i have great interest in biblical ...,0
4,im a die hard dads army fan and nothing will e...,1


In [23]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [25]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
stop_words = set(stopwords.words('english'))
sentiment_stops = {'not', 'no', 'never', 'neither', 'nor', 'but', 'however', 'although', 'though', 'despite'}
stop_words = stop_words - sentiment_stops

In [27]:
def clean_text(text):
    text = text.lower() # to lower the sentences
    text = re.sub(r'[^\w\s]', '', text) # removing punctuation marks
    text = re.sub(r'\d+', '', text) # removing numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    tokens = word_tokenize(text) # tokenization
    tokens = [word for word in tokens if word not in stop_words] # extract the  stopping words
    text = ' '.join(tokens) # join clear text
    return text

In [28]:
df['Cleaned_Review'] = df['text'].apply(clean_text)

In [29]:
df.head()

Unnamed: 0,text,label,Cleaned_Review
0,i grew up (b. 1965) watching and loving the th...,0,grew b watching loving thunderbirds mates scho...
1,"when i put this movie in my dvd player, and sa...",0,put movie dvd player sat coke chips expectatio...
2,why do people who do not know what a particula...,0,people not know particular time past like feel...
3,even though i have great interest in biblical ...,0,even though great interest biblical movies bor...
4,im a die hard dads army fan and nothing will e...,1,im die hard dads army fan nothing ever change ...


In [30]:
X = df['Cleaned_Review']
y = df['label']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [60]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Dropout, Bidirectional, Input
)

In [61]:
max_features = 25000


tokenizer = Tokenizer(num_words=max_features,oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

max_len = max(len(tokens) for tokens in sequences_train)

X_train = pad_sequences(sequences_train, maxlen=max_len)
X_test = pad_sequences(sequences_test, maxlen=max_len)


print(f"Text sequence shape: {X_text.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")


Text sequence shape: (8000, 1484)
Vocabulary size: 137365


In [62]:
vocab_size = len(tokenizer.word_index) + 1

In [63]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [64]:


# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    verbose=1
)
with tf.device('/device:GPU:0'):

  history = model.fit(
      X_train, y_train,
      epochs=20,
      batch_size=32,
      validation_data=(X_test, y_test),
      callbacks=[early_stopping, reduce_lr]
  )

# Evaluate model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

Epoch 1/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 247ms/step - accuracy: 0.7030 - loss: 0.5509 - val_accuracy: 0.8723 - val_loss: 0.3170 - learning_rate: 0.0010
Epoch 2/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 247ms/step - accuracy: 0.9208 - loss: 0.2186 - val_accuracy: 0.8889 - val_loss: 0.2932 - learning_rate: 0.0010
Epoch 3/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 247ms/step - accuracy: 0.9564 - loss: 0.1295 - val_accuracy: 0.8811 - val_loss: 0.2994 - learning_rate: 0.0010
Epoch 4/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 246ms/step - accuracy: 0.9751 - loss: 0.0845 - val_accuracy: 0.8783 - val_loss: 0.3698 - learning_rate: 0.0010
Epoch 5/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step - accuracy: 0.9875 - loss: 0.0479
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
[1m1000/1000[0m [32m━━━━━━━━

In [65]:
model.summary()