In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load dataset
df = pd.read_csv('imdb_dataset.csv')

# Data Cleaning
def clean_data(df):
    print("Missing values before cleaning:")
    print(df.isnull().sum())
    df = df.dropna(subset=['review', 'sentiment'])
    print(f"Number of duplicates: {df.duplicated().sum()}")
    df = df.drop_duplicates()
    valid_labels = {'positive', 'negative'}
    invalid_labels = df[~df['sentiment'].isin(valid_labels)]
    if not invalid_labels.empty:
        print(f"Found {len(invalid_labels)} invalid sentiment labels. Removing them.")
        df = df[df['sentiment'].isin(valid_labels)]
    print(f"Dataset shape after cleaning: {df.shape}")
    return df

# Text Preprocessing
def preprocess_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply data cleaning and preprocessing
df = clean_data(df)
df['review'] = df['review'].apply(preprocess_text)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Plot class distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='sentiment', data=df)
plt.title('Class Distribution of Sentiment')
plt.xlabel('Sentiment (0 = Negative, 1 = Positive)')
plt.ylabel('Count')
plt.savefig('class_distribution.png')
plt.close()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Logistic Regression Model
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Evaluate Logistic Regression
y_pred_lr = lr_model.predict(X_test_tfidf)
lr_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_lr),
    'Precision': precision_score(y_test, y_pred_lr),
    'Recall': recall_score(y_test, y_pred_lr),
    'F1-Score': f1_score(y_test, y_pred_lr)
}

# Plot confusion matrix for Logistic Regression
cm_lr = confusion_matrix(y_test, y_pred_lr)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix - Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix_lr.png')
plt.close()

# LSTM Model with Cross-Validation
max_words = 5000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

def create_lstm_model():
    model = Sequential()
    model.add(Embedding(max_words, 100, input_length=max_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# K-Fold Cross-Validation for LSTM
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
lstm_metrics_cv = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': []}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_pad, y_train)):
    print(f"Training fold {fold + 1}...")
    X_fold_train, X_fold_val = X_train_pad[train_idx], X_train_pad[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    lstm_model = create_lstm_model()
    lstm_model.fit(X_fold_train, y_fold_train, epochs=5, batch_size=64, validation_data=(X_fold_val, y_fold_val), verbose=1)

    y_pred_fold = (lstm_model.predict(X_fold_val) > 0.5).astype("int32")
    lstm_metrics_cv['Accuracy'].append(accuracy_score(y_fold_val, y_pred_fold))
    lstm_metrics_cv['Precision'].append(precision_score(y_fold_val, y_pred_fold))
    lstm_metrics_cv['Recall'].append(recall_score(y_fold_val, y_pred_fold))
    lstm_metrics_cv['F1-Score'].append(f1_score(y_fold_val, y_pred_fold))

# Average LSTM metrics from cross-validation
lstm_metrics = {metric: np.mean(values) for metric, values in lstm_metrics_cv.items()}

# Train final LSTM model on full training data
lstm_model = create_lstm_model()
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate LSTM on test set
y_pred_lstm = (lstm_model.predict(X_test_pad) > 0.5).astype("int32")

# Plot confusion matrix for LSTM
cm_lstm = confusion_matrix(y_test, y_pred_lstm)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lstm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix - LSTM')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix_lstm.png')
plt.close()

# Plot comparison of Accuracy and F1-Score
metrics_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'LSTM'],
    'Accuracy': [lr_metrics['Accuracy'], lstm_metrics['Accuracy']],
    'F1-Score': [lr_metrics['F1-Score'], lstm_metrics['F1-Score']]
})

plt.figure(figsize=(10, 6))
metrics_df.plot(x='Model', y=['Accuracy', 'F1-Score'], kind='bar')
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.savefig('model_comparison.png')
plt.close()

# Save models and tokenizer
pickle.dump(vectorizer, open('tfidf_vectorizer.pkl', 'wb'))
pickle.dump(lr_model, open('lr_model.pkl', 'wb'))
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
lstm_model.save('lstm_model.h5')

# Print metrics
print("Logistic Regression Metrics:")
for metric, value in lr_metrics.items():
    print(f"{metric}: {value:.4f}")
print("\nLSTM Metrics (Cross-Validation Average):")
for metric, value in lstm_metrics.items():
    print(f"{metric}: {value:.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Missing values before cleaning:
review       0
sentiment    0
dtype: int64
Number of duplicates: 418
Dataset shape after cleaning: (49582, 2)
Training fold 1...




Epoch 1/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 613ms/step - accuracy: 0.7589 - loss: 0.4851 - val_accuracy: 0.8661 - val_loss: 0.3177
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 627ms/step - accuracy: 0.8879 - loss: 0.2755 - val_accuracy: 0.8583 - val_loss: 0.3304
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 628ms/step - accuracy: 0.8996 - loss: 0.2593 - val_accuracy: 0.8734 - val_loss: 0.3106
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 634ms/step - accuracy: 0.9134 - loss: 0.2227 - val_accuracy: 0.8694 - val_loss: 0.3150
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 630ms/step - accuracy: 0.9222 - loss: 0.2018 - val_accuracy: 0.8726 - val_loss: 0.3239
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 106ms/step
Training fold 2...
Epoch 1/5




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 639ms/step - accuracy: 0.7382 - loss: 0.5120 - val_accuracy: 0.8574 - val_loss: 0.3348
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 636ms/step - accuracy: 0.8854 - loss: 0.2918 - val_accuracy: 0.8726 - val_loss: 0.3050
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 630ms/step - accuracy: 0.9042 - loss: 0.2464 - val_accuracy: 0.8756 - val_loss: 0.3141
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 630ms/step - accuracy: 0.9112 - loss: 0.2269 - val_accuracy: 0.8664 - val_loss: 0.3368
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 630ms/step - accuracy: 0.9205 - loss: 0.2055 - val_accuracy: 0.8698 - val_loss: 0.3398
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 109ms/step
Training fold 3...
Epoch 1/5




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 622ms/step - accuracy: 0.7487 - loss: 0.4914 - val_accuracy: 0.8739 - val_loss: 0.3028
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 635ms/step - accuracy: 0.8880 - loss: 0.2905 - val_accuracy: 0.8766 - val_loss: 0.3015
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 638ms/step - accuracy: 0.9006 - loss: 0.2530 - val_accuracy: 0.8766 - val_loss: 0.3006
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 634ms/step - accuracy: 0.9122 - loss: 0.2269 - val_accuracy: 0.8697 - val_loss: 0.3165
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 631ms/step - accuracy: 0.9213 - loss: 0.2072 - val_accuracy: 0.8732 - val_loss: 0.3279
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 105ms/step
Training fold 4...
Epoch 1/5




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 621ms/step - accuracy: 0.7506 - loss: 0.4953 - val_accuracy: 0.8669 - val_loss: 0.3194
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 610ms/step - accuracy: 0.8833 - loss: 0.2917 - val_accuracy: 0.8784 - val_loss: 0.2914
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 620ms/step - accuracy: 0.9002 - loss: 0.2558 - val_accuracy: 0.8775 - val_loss: 0.2900
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 606ms/step - accuracy: 0.9139 - loss: 0.2278 - val_accuracy: 0.8722 - val_loss: 0.3109
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 622ms/step - accuracy: 0.9208 - loss: 0.2053 - val_accuracy: 0.8771 - val_loss: 0.3168
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 105ms/step
Training fold 5...
Epoch 1/5




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 639ms/step - accuracy: 0.7578 - loss: 0.4886 - val_accuracy: 0.8710 - val_loss: 0.3178
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 636ms/step - accuracy: 0.8837 - loss: 0.2931 - val_accuracy: 0.8674 - val_loss: 0.3154
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m314s[0m 619ms/step - accuracy: 0.9011 - loss: 0.2532 - val_accuracy: 0.8698 - val_loss: 0.3148
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 636ms/step - accuracy: 0.9168 - loss: 0.2190 - val_accuracy: 0.8739 - val_loss: 0.3201
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 639ms/step - accuracy: 0.9245 - loss: 0.1965 - val_accuracy: 0.8702 - val_loss: 0.3425
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 106ms/step
Epoch 1/5




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 633ms/step - accuracy: 0.7535 - loss: 0.4865 - val_accuracy: 0.8531 - val_loss: 0.3356
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 612ms/step - accuracy: 0.8869 - loss: 0.2880 - val_accuracy: 0.8688 - val_loss: 0.3286
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 635ms/step - accuracy: 0.8962 - loss: 0.2656 - val_accuracy: 0.8736 - val_loss: 0.3171
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m314s[0m 634ms/step - accuracy: 0.9111 - loss: 0.2302 - val_accuracy: 0.8692 - val_loss: 0.3268
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 638ms/step - accuracy: 0.9221 - loss: 0.2048 - val_accuracy: 0.8630 - val_loss: 0.3478
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 105ms/step




Logistic Regression Metrics:
Accuracy: 0.8831
Precision: 0.8733
Recall: 0.8973
F1-Score: 0.8852

LSTM Metrics (Cross-Validation Average):
Accuracy: 0.8726
Precision: 0.8709
Recall: 0.8768
F1-Score: 0.8734


<Figure size 1000x600 with 0 Axes>