In [None]:
# =================================================
# Exploratory Analysis & LSTM vs GRU Comparison
# =================================================

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# Keep the English stop words in a set: clean_text checks membership once per
# token, so constant-time lookups matter.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
# -----------------------
# 2. Load Dataset
# -----------------------
# Read the raw review dump; the path is resolved against the kernel's working
# directory.
raw_csv_path = '../data/raw/imdb_reviews.csv'
df = pd.read_csv(raw_csv_path)

# NOTE(review): later cells read df['review'] and compare df['sentiment'] to
# 0/1 -- confirm the CSV really has those columns and numeric labels.
df.head()


In [None]:
# -----------------------
# 3. Class Distribution
# -----------------------
# Bar chart of how many reviews carry each sentiment label.
fig, ax = plt.subplots()
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title("Sentiment Class Distribution")
plt.show()


In [None]:
# -----------------------
# 4. Text Cleaning
# -----------------------
def clean_text(text):
    """Normalize one raw review into a space-separated string of lowercase words.

    Steps, in order: lowercase the text, replace HTML tags with a space,
    replace every non-letter character with a space, then drop any token that
    appears in the module-level ``stop_words`` set.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or the float
            ``NaN`` that pandas uses for missing cells) are treated as empty.

    Returns:
        str: The cleaned review, or an empty string if nothing survives.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash on .lower().
        return ''
    text = text.lower()
    # Substitute a space, not '': "great<br />movie" must stay two words
    # instead of fusing into "greatmovie". [^>]* (unlike .) also matches a
    # tag that is broken across lines.
    text = re.sub(r'<[^>]*>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every review once and store the result next to the raw text.
df['clean_review'] = df['review'].map(clean_text)

In [None]:
# -----------------------
# 5. WordClouds
# -----------------------
# Concatenate the cleaned reviews of each class into one long string per class.
# NOTE(review): assumes sentiment is encoded as 1 (positive) / 0 (negative) --
# TODO confirm against the CSV.
is_positive = df['sentiment'] == 1
is_negative = df['sentiment'] == 0
positive_text = ' '.join(df.loc[is_positive, 'clean_review'])
negative_text = ' '.join(df.loc[is_negative, 'clean_review'])

# Draw both clouds side by side with identical settings so they are comparable.
plt.figure(figsize=(12,6))
panels = [
    (1, positive_text, 'Positive Reviews WordCloud'),
    (2, negative_text, 'Negative Reviews WordCloud'),
]
for position, class_text, panel_title in panels:
    plt.subplot(1, 2, position)
    wc = WordCloud(width=500, height=400, background_color='white').generate(class_text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(panel_title)
plt.show()

In [None]:
# -----------------------
# 6. Sequence Length Distribution
# -----------------------
# Word count of every cleaned review (tokens are whitespace-separated).
seq_lengths = df['clean_review'].map(lambda review: len(review.split()))

# Histogram of those counts.
fig, ax = plt.subplots(figsize=(8,5))
sns.histplot(seq_lengths, bins=50, ax=ax)
ax.set_title("Review Length Distribution")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# -----------------------
# 7. Tokenization & Padding
# -----------------------
max_words = 10000  # vocabulary cap passed to the Tokenizer as num_words
max_len = 200      # every review is padded / truncated to this many tokens

labels = df['sentiment'].values

# Split row positions BEFORE fitting the tokenizer so the vocabulary is learned
# from training reviews only; fitting on the full corpus would leak test-set
# text into the features. The split arguments are unchanged, so the rows that
# land in train and test are the same as when the padded matrix itself was
# split.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_review'].iloc[train_idx])

# Encode every review with the train-fitted vocabulary; words the tokenizer
# did not keep are mapped to the '<OOV>' token.
sequences = tokenizer.texts_to_sequences(df['clean_review'])
# 'post' padding and truncation: zeros are appended after short reviews and
# long reviews keep their first max_len tokens.
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X_train, X_test = padded[train_idx], padded[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]



In [None]:
# -----------------------
# 8. Build LSTM Model
# -----------------------
# Embedding -> LSTM -> single sigmoid unit: a binary sentiment classifier.
lstm_model = Sequential([
    # mask_zero=True: index 0 is the value pad_sequences appends after short
    # reviews (padding='post'). Without a mask the LSTM keeps updating its
    # state over those padding steps, so the final state fed to Dense is
    # computed from padding rather than from the last real word.
    Embedding(max_words, 100, input_length=max_len, mask_zero=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 10% of the training rows are held out for per-epoch validation metrics.
history_lstm = lstm_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=64,
    verbose=1,
)



In [None]:
# -----------------------
# 9. Build GRU Model
# -----------------------
# Embedding -> GRU -> single sigmoid unit: a binary sentiment classifier.
gru_model = Sequential([
    # mask_zero=True: index 0 is the value pad_sequences appends after short
    # reviews (padding='post'). Without a mask the GRU keeps updating its
    # state over those padding steps, so the final state fed to Dense is
    # computed from padding rather than from the last real word.
    Embedding(max_words, 100, input_length=max_len, mask_zero=True),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 10% of the training rows are held out for per-epoch validation metrics.
history_gru = gru_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=64,
    verbose=1,
)



In [None]:
# -----------------------
# 10. Plot Validation Accuracy Comparison
# -----------------------
# Overlay the per-epoch validation accuracy of both recurrent models.
fig, ax = plt.subplots(figsize=(10,5))
for run_history, curve_label in (
    (history_lstm, 'LSTM Val Accuracy'),
    (history_gru, 'GRU Val Accuracy'),
):
    ax.plot(run_history.history['val_accuracy'], label=curve_label)
ax.set_title("Validation Accuracy: LSTM vs GRU")
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

In [None]:
# -----------------------
# 11. Evaluate LSTM
# -----------------------
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
lstm_scores = lstm_model.predict(X_test)
y_pred_lstm = (lstm_scores > 0.5).astype(int)

# Report held-out metrics for the LSTM.
lstm_accuracy = accuracy_score(y_test, y_pred_lstm)
lstm_report = classification_report(y_test, y_pred_lstm)
lstm_confusion = confusion_matrix(y_test, y_pred_lstm)
print("LSTM Test Accuracy:", lstm_accuracy)
print("LSTM Classification Report:\n", lstm_report)
print("LSTM Confusion Matrix:\n", lstm_confusion)


In [None]:
# -----------------------
# 12. Evaluate GRU
# -----------------------
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
gru_scores = gru_model.predict(X_test)
y_pred_gru = (gru_scores > 0.5).astype(int)

# Report held-out metrics for the GRU.
gru_accuracy = accuracy_score(y_test, y_pred_gru)
gru_report = classification_report(y_test, y_pred_gru)
gru_confusion = confusion_matrix(y_test, y_pred_gru)
print("GRU Test Accuracy:", gru_accuracy)
print("GRU Classification Report:\n", gru_report)
print("GRU Confusion Matrix:\n", gru_confusion)