# Sentiment Analysis on IMDB Dataset

In [None]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tkinter import messagebox
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping
import re
import nltk
import urllib.request
import zipfile
import os
import pandas as pd
import tkinter as tk

# Download NLTK Resources

In [None]:
# Download necessary NLTK resources
nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Data Loading

In [None]:
file_path = r"D:\Courses\GP-Cinemate\ML\machine-learning-dev\preprocessing\IMDB Dataset.csv"
df = pd.read_csv(file_path)
texts = df['review'].values
labels = df['sentiment'].values

# Text Preprocessing

In [None]:
def enhanced_preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Replace contractions
    contractions = {
        "ain't": "am not", "aren't": "are not", "can't": "cannot", 
        "can't've": "cannot have", "'cause": "because", "could've": "could have", 
        "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
        "doesn't": "does not", "don't": "do not", "hadn't": "had not",
        "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
        "he'll've": "he will have", "he's": "he is", "how'd": "how did",
        "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
        "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
        "i'll've": "i will have", "i'm": "i am", "i've": "i have",
        "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
        "it'll": "it will", "it'll've": "it will have", "it's": "it is",
        "let's": "let us", "ma'am": "madam", "mayn't": "may not",
        "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
        "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
        "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
        "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
        "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
        "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
        "she's": "she is", "should've": "should have", "shouldn't": "should not",
        "shouldn't've": "should not have", "so've": "so have", "so's": "so is",
        "that'd": "that would", "that'd've": "that would have", "that's": "that is",
        "there'd": "there would", "there'd've": "there would have", "there's": "there is",
        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
        "we're": "we are", "we've": "we have", "weren't": "were not",
        "what'll": "what will", "what'll've": "what will have", "what're": "what are",
        "what's": "what is", "what've": "what have", "when's": "when is",
        "when've": "when have", "where'd": "where did", "where's": "where is",
        "where've": "where have", "who'll": "who will", "who'll've": "who will have",
        "who's": "who is", "who've": "who have", "why's": "why is",
        "why've": "why have", "will've": "will have", "won't": "will not",
        "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
        "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
        "y'all'd've": "you all would have", "y'all're": "you all are",
        "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
        "you'll": "you will", "you'll've": "you will have", "you're": "you are",
        "you've": "you have"
    }
    
    # Replace contractions in text
    for contraction, replacement in contractions.items():
        text = text.replace(contraction, replacement)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join tokens back into text
    processed_text = ' '.join(tokens)
    
    return processed_text

# Process texts with enhanced preprocessing
processed_texts = [enhanced_preprocess_text(text) for text in texts]

In [5]:
def map_to_binary_classes(labels):
    return np.array([0 if label == 'negative' else 1 for label in labels])

y = map_to_binary_classes(labels)

# Text Tokenization and Sequence Preparation

In [None]:
vocab_size = 10000
max_length = 100
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(processed_texts)
sequences = tokenizer.texts_to_sequences(processed_texts)
X = pad_sequences(sequences, maxlen=max_length)
word_index = tokenizer.word_index

# Data Splitting

In [None]:
# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)  
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# GloVe Embeddings Loading

In [None]:
embedding_dim = 100
embeddings_index = {}
glove_file = 'glove.6B.100d.txt'

In [10]:
# Download GloVe embeddings if not present
if not os.path.exists(glove_file):
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    urllib.request.urlretrieve(url, 'glove.6B.zip')
    with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
        zip_ref.extractall('.')

In [11]:
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Model Definitions

In [None]:
def build_ann(embedding_dim=100, dense_units=10000):  # ANN Model
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def build_cnn(embedding_dim=100, num_filters=128, kernel_size=5, dense_units=128):  # CNN Model
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        Conv1D(num_filters, kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def build_lstm(embedding_dim=100, lstm_units=128, dense_units=128, dropout_rate=0.2):  # LSTM Model
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        LSTM(lstm_units),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Hyperparameter Tuning

In [None]:
def build_model(hp):
    model_type = hp.Choice('model_type', ['ann', 'cnn', 'lstm', 'bilstm', 'cnn-lstm'])
    model = Sequential()
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
    
    if model_type == 'ann':
        model.add(Flatten())
        model.add(Dense(units=hp.Int('dense_units', 500, 10000, step=500), activation='relu'))
    elif model_type == 'cnn':
        model.add(Conv1D(filters=hp.Int('num_filters', 64, 256, step=64), 
                         kernel_size=hp.Int('kernel_size', 3, 7, step=2), activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(units=hp.Int('dense_units', 64, 256, step=64), activation='relu'))
    elif model_type == 'lstm':
        model.add(LSTM(units=hp.Int('lstm_units', 64, 256, step=64)))
        model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
        model.add(Dense(units=hp.Int('dense_units', 64, 256, step=64), activation='relu'))
    elif model_type == 'bilstm':
        model.add(tf.keras.layers.Bidirectional(LSTM(units=hp.Int('lstm_units', 64, 256, step=64))))
        model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
        model.add(Dense(units=hp.Int('dense_units', 64, 256, step=64), activation='relu'))
    else:  # cnn-lstm
        model.add(Conv1D(filters=hp.Int('num_filters', 64, 256, step=64),
                         kernel_size=hp.Int('kernel_size', 3, 7, step=2), activation='relu'))
        model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
        model.add(LSTM(units=hp.Int('lstm_units', 64, 256, step=64)))
        model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
        model.add(Dense(units=hp.Int('dense_units', 64, 256, step=64), activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

Trial 13 Complete [00h 10m 23s]
val_accuracy: 0.7279999852180481

Best val_accuracy So Far: 0.8500000238418579
Total elapsed time: 02h 14m 24s

Search: Running Trial #14

Value             |Best Value So Far |Hyperparameter
lstm              |lstm              |model_type
9500              |4500              |dense_units
0.00023798        |0.0055938         |learning_rate
128               |192               |num_filters
5                 |3                 |kernel_size
256               |64                |lstm_units
0.3               |0.4               |dropout_rate
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
3                 |3                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 79ms/step - accuracy: 0.6982 - loss: 0.5709 - val_accuracy: 0.7841 - val_loss: 0.4664
Epoch 2/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━

KeyboardInterrupt: 

In [None]:
tuner = kt.Hyperband(build_model, objective='val_accuracy', max_epochs=60, directory='tuner_dir', project_name='sentiment')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Model Training

In [None]:
# Start hyperparameter search
tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=60, batch_size=32, callbacks=[early_stopping])
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.get_best_models(num_models=1)[0]

print(f"Best hyperparameters: {best_hps.values}")

# Model Evaluation

In [None]:
# Evaluate on test set
test_loss, test_accuracy = best_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy of best model: {test_accuracy:.4f}")



In [None]:
# Comprehensive evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Get predictions
y_pred_prob = best_model.predict(X_test, verbose=0)
y_pred = (y_pred_prob > 0.5).astype(int)

# Classification metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Display examples of correct and incorrect predictions
def display_example_predictions(X, y_true, y_pred, tokenizer, n=5):
    # Get indices where predictions were correct
    correct_indices = np.where(y_true == y_pred)[0]
    incorrect_indices = np.where(y_true != y_pred)[0]
    
    # Convert sequences back to words for visualization
    index_to_word = {v: k for k, v in tokenizer.word_index.items()}
    
    print("\nCORRECT PREDICTIONS EXAMPLES:")
    for i in np.random.choice(correct_indices, min(n, len(correct_indices)), replace=False):
        words = [index_to_word.get(idx, '') for idx in X[i] if idx > 0]
        text = ' '.join(words[:20]) + '...'  # First 20 words for brevity
        true_sentiment = 'positive' if y_true[i] == 1 else 'negative'
        print(f"Text: {text}")
        print(f"True sentiment: {true_sentiment}, Predicted probability: {y_pred_prob[i][0]:.4f}\n")
    
    print("\nINCORRECT PREDICTIONS EXAMPLES:")
    for i in np.random.choice(incorrect_indices, min(n, len(incorrect_indices)), replace=False):
        words = [index_to_word.get(idx, '') for idx in X[i] if idx > 0]
        text = ' '.join(words[:20]) + '...'  # First 20 words for brevity
        true_sentiment = 'positive' if y_true[i] == 1 else 'negative'
        pred_sentiment = 'positive' if y_pred[i] == 1 else 'negative'
        print(f"Text: {text}")
        print(f"True sentiment: {true_sentiment}, Predicted sentiment: {pred_sentiment}, Probability: {y_pred_prob[i][0]:.4f}\n")

display_example_predictions(X_test, y_test, y_pred, tokenizer)

# Save the Model

In [None]:
best_model.save('best_sentiment_model.h5')

# Sentiment Analysis GUI

In [None]:
def preprocess_for_prediction(text):
    # Preprocess input text same way as training data
    processed_text = enhanced_preprocess_text(text)
    # Convert to sequence
    sequence = tokenizer.texts_to_sequences([processed_text])
    # Pad sequence
    padded = pad_sequences(sequence, maxlen=max_length)
    return padded

In [None]:
def predict_sentiment():
    review = entry.get().strip()
    if not review:
        messagebox.showwarning("Input Error", "Please enter a review!")
        return
    padded = preprocess_for_prediction(review)
    prediction = best_model.predict(padded, verbose=0)[0][0]
    sentiment = "positive" if prediction > 0.5 else "negative"
    result_label.config(text=f"Predicted sentiment: {sentiment} (Probability: {prediction:.4f})")

In [None]:
# Set up GUI
root = tk.Tk()
root.title("Sentiment Analysis")
root.geometry("400x200")

tk.Label(root, text="Enter your review:").pack(pady=5)
entry = tk.Entry(root, width=50)
entry.pack(pady=5)

button = tk.Button(root, text="Predict Sentiment", command=predict_sentiment)
button.pack(pady=10)

result_label = tk.Label(root, text="", font=("Arial", 12))
result_label.pack(pady=10)

root.mainloop()