In [13]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import time
import requests
from io import StringIO
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

nltk.download('stopwords', quiet=True)

DATA_URL = 'https://raw.githubusercontent.com/AnshulBhusari/IMDB_Sentiment_Analysis_Project/main/IMDB%20Dataset.csv'

try:
    response = requests.get(DATA_URL)
    response.raise_for_status()
    df = pd.read_csv(StringIO(response.text))
except requests.exceptions.RequestException as e:
    raise Exception(f"Critical: Could not load the IMDb dataset. Error: {e}")

df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

X_train_val, X_test, y_train_val, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.1, random_state=42, stratify=df['sentiment']
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.111, random_state=42, stratify=y_train_val
)

STOP_WORDS = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in STOP_WORDS)
    return text

X_train_cleaned = X_train.apply(clean_text)
X_val_cleaned = X_val.apply(clean_text)
X_test_cleaned = X_test.apply(clean_text)

# --- MODEL A: LOGISTIC REGRESSION ---
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train_cleaned)
X_test_tfidf = vectorizer.transform(X_test_cleaned)

lr = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
start_time_lr = time.time()
lr.fit(X_train_tfidf, y_train)
end_time_lr = time.time()

y_pred_lr = lr.predict(X_test_tfidf)
y_proba_lr = lr.predict_proba(X_test_tfidf)[:, 1]

results_lr = {
    'Accuracy': accuracy_score(y_test, y_pred_lr),
    'F1-Score': f1_score(y_test, y_pred_lr),
    'AUC-ROC': roc_auc_score(y_test, y_proba_lr),
    'Time': end_time_lr - start_time_lr
}
print("\n--- Model A: Logistic Regression Final Results ---")
print(pd.Series(results_lr).round(4))
print("\nClassification Report (LR):")
print(classification_report(y_test, y_pred_lr))

# --- MODEL B: LSTM ---
MAX_WORDS = 10000
MAX_LEN = 256
EMBEDDING_DIM = 128
LSTM_UNITS = 128
EPOCHS = 10

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_cleaned.astype(str))

train_sequences = tokenizer.texts_to_sequences(X_train_cleaned)
val_sequences = tokenizer.texts_to_sequences(X_val_cleaned)
test_sequences = tokenizer.texts_to_sequences(X_test_cleaned)

X_train_seq = pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
X_val_seq = pad_sequences(val_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_seq = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

lstm_model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN, name='embedding_layer'),
    Dropout(0.3, name='dropout_1'),
    LSTM(LSTM_UNITS, name='lstm_layer'),
    Dropout(0.3, name='dropout_2'),
    Dense(1, activation='sigmoid', name='output_layer')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True),
    ModelCheckpoint('best_lstm_model.keras', monitor='val_loss', save_best_only=True, verbose=0)
]

start_time_lstm = time.time()
history = lstm_model.fit(
    X_train_seq, y_train,
    epochs=EPOCHS,
    batch_size=64,
    validation_data=(X_val_seq, y_val),
    callbacks=callbacks,
    verbose=1
)
end_time_lstm = time.time()
training_time_lstm = end_time_lstm - start_time_lstm

lstm_model.load_weights('best_lstm_model.keras')
y_pred_proba_lstm = lstm_model.predict(X_test_seq, verbose=0)
y_pred_lstm = (y_pred_proba_lstm > 0.5).astype(int)

results_lstm = {
    'Accuracy': accuracy_score(y_test, y_pred_lstm),
    'F1-Score': f1_score(y_test, y_pred_lstm),
    'AUC-ROC': roc_auc_score(y_test, y_pred_proba_lstm),
    'Time': training_time_lstm
}

print("\n--- Model B: LSTM Final Results ---")
print(pd.Series(results_lstm).round(4))
print("\nClassification Report (LSTM):")
print(classification_report(y_test, y_pred_lstm))

  df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})



--- Model A: Logistic Regression Final Results ---
Accuracy    0.8879
F1-Score    0.8863
AUC-ROC     0.9528
Time        0.2069
dtype: float64

Classification Report (LR):
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       868
           1       0.89      0.89      0.89       845

    accuracy                           0.89      1713
   macro avg       0.89      0.89      0.89      1713
weighted avg       0.89      0.89      0.89      1713

Epoch 1/10




[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 554ms/step - accuracy: 0.5106 - loss: 0.6935 - val_accuracy: 0.4944 - val_loss: 0.6931
Epoch 2/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 586ms/step - accuracy: 0.5125 - loss: 0.6920 - val_accuracy: 0.5155 - val_loss: 0.6914
Epoch 3/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 562ms/step - accuracy: 0.5423 - loss: 0.6739 - val_accuracy: 0.5015 - val_loss: 0.6931
Epoch 4/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 574ms/step - accuracy: 0.5473 - loss: 0.6440 - val_accuracy: 0.5155 - val_loss: 0.7115
Epoch 5/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 571ms/step - accuracy: 0.5458 - loss: 0.6368 - val_accuracy: 0.5056 - val_loss: 0.7372
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.

--- Model B: LSTM Final Results ---
Accuracy      0.5108
F1-Score      0.1396
AUC-ROC       0.503