In [2]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (11

In [3]:
import pandas as pd
import numpy as np
import re
import html
import contractions
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import accuracy_score, classification_report

In [6]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [4]:
def clean_text(text):
    
    text = text.lower()
    text = html.unescape(text)  # Convert HTML entities
    text = contractions.fix(text)  # Expand contractions
    text = re.sub(r"[^\w\s.,!?'-]", '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [7]:
df['review'] = df['review'].apply(clean_text)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [8]:
train_text, test_text, train_labels, test_labels = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [9]:
num_words = 10000  
oov_token = "<OOV>"
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(train_text)

In [10]:
sequences = tokenizer.texts_to_sequences(train_text)
max_length = 300
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [11]:
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, train_labels, test_size=0.2, random_state=42)

In [12]:
model = Sequential([
    
    Embedding(input_dim=num_words, output_dim=300),
    Bidirectional(LSTM(units=128, return_sequences=True)),
    Dropout(0.3),
    LSTM(units=64, return_sequences=False),
    BatchNormalization(),
    Dense(units=64, activation='relu'),
    Dropout(0.3),
    Dense(units=1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', 
              optimizer=RMSprop(learning_rate=0.001), 
              metrics=['accuracy'])

In [13]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 61ms/step - accuracy: 0.5267 - loss: 0.7065 - val_accuracy: 0.4991 - val_loss: 0.7304
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 61ms/step - accuracy: 0.5558 - loss: 0.6673 - val_accuracy: 0.5691 - val_loss: 0.6930
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 61ms/step - accuracy: 0.6200 - loss: 0.6205 - val_accuracy: 0.6776 - val_loss: 0.6902
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 61ms/step - accuracy: 0.7531 - loss: 0.5118 - val_accuracy: 0.8612 - val_loss: 0.3528
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 61ms/step - accuracy: 0.8751 - loss: 0.3048 - val_accuracy: 0.8792 - val_loss: 0.3127
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 62ms/step - accuracy: 0.9175 - loss: 0.2176 - val_accuracy: 0.8814 - val_loss: 0.2816
Epoc

In [14]:
test_tokens = tokenizer.texts_to_sequences(test_text)
test_sequence = pad_sequences(test_tokens, maxlen=max_length, padding='post')

In [15]:
predictions = model.predict(test_sequence)
y_pred = (predictions > 0.5).astype(int)  

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step


In [16]:
accuracy = accuracy_score(test_labels, y_pred)
report = classification_report(test_labels, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(report)

Test Accuracy: 0.8993
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      4961
           1       0.91      0.88      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

