In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tkinter import messagebox
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping
import re
import nltk
import urllib.request
import zipfile
import os
import pandas as pd
import tkinter as tk

In [2]:
file_path = r"D:\Courses\GP-Cinemate\ML\machine-learning-dev\preprocessing\IMDB Dataset.csv"
df = pd.read_csv(file_path)

In [3]:
texts = df['review'].values
labels = df['sentiment'].values

In [4]:
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

texts = [preprocess_text(text) for text in df['review'].values]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ziad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def map_to_binary_classes(labels):
    return np.array([0 if label == 'negative' else 1 for label in labels])

y = map_to_binary_classes(labels)

In [6]:
vocab_size = 10000
max_length = 100
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_length)

In [7]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)  
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) 
word_index = tokenizer.word_index

# Split into train and validation sets (80% train, 20% validation)

In [8]:

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Load GloVe embeddings (100-dimensional)

In [9]:

embedding_dim = 100
embeddings_index = {}
glove_file = 'glove.6B.100d.txt'

# Download GloVe embeddings if not present

In [10]:
# Download GloVe embeddings if not present
if not os.path.exists(glove_file):
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    urllib.request.urlretrieve(url, 'glove.6B.zip')
    with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
        zip_ref.extractall('.')

In [11]:
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix

In [12]:
word_index = tokenizer.word_index  # Use the tokenizer’s word_index from Cell 6
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# --- Step 2: Define Deep Learning Models ---
# ANN Model (Second Paper)

In [13]:
def build_ann(embedding_dim=100, dense_units=10000):  # Matches Table 4 parameters
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary output
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# CNN Model (Hybrid of First and Second Paper)

In [14]:
def build_cnn(embedding_dim=100, num_filters=128, kernel_size=5, dense_units=128):  # Matches Table 5
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        Conv1D(num_filters, kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary output
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# LSTM Model (Second Paper)

In [15]:
def build_lstm(embedding_dim=100, lstm_units=128, dense_units=128, dropout_rate=0.2):  # Matches Table 6
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        LSTM(lstm_units),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary output
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [16]:
# New Cell after Cell 16
def build_model(hp):
    model_type = hp.Choice('model_type', ['ann', 'cnn', 'lstm'])
    model = Sequential()
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
    
    if model_type == 'ann':
        model.add(Flatten())
        model.add(Dense(units=hp.Int('dense_units', 500, 10000, step=500), activation='relu'))
    elif model_type == 'cnn':
        model.add(Conv1D(filters=hp.Int('num_filters', 64, 256, step=64), 
                         kernel_size=hp.Int('kernel_size', 3, 7, step=2), activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(units=hp.Int('dense_units', 64, 256, step=64), activation='relu'))
    else:  # lstm
        model.add(LSTM(units=hp.Int('lstm_units', 64, 256, step=64)))
        model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
        model.add(Dense(units=hp.Int('dense_units', 64, 256, step=64), activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

tuner = kt.Hyperband(build_model, objective='val_accuracy', max_epochs=60, directory='tuner_dir', project_name='sentiment')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=60, batch_size=32, callbacks=[early_stopping])
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.get_best_models(num_models=1)[0]

print(f"Best hyperparameters: {best_hps.values}")

Trial 13 Complete [00h 10m 23s]
val_accuracy: 0.7279999852180481

Best val_accuracy So Far: 0.8500000238418579
Total elapsed time: 02h 14m 24s

Search: Running Trial #14

Value             |Best Value So Far |Hyperparameter
lstm              |lstm              |model_type
9500              |4500              |dense_units
0.00023798        |0.0055938         |learning_rate
128               |192               |num_filters
5                 |3                 |kernel_size
256               |64                |lstm_units
0.3               |0.4               |dropout_rate
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
3                 |3                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 79ms/step - accuracy: 0.6982 - loss: 0.5709 - val_accuracy: 0.7841 - val_loss: 0.4664
Epoch 2/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━

KeyboardInterrupt: 

# --- Step 3: Hyperparameter Tuning and Training ---

# Hyperparameter configurations

In [None]:

ann_hyperparams = [
    {'embedding_dim': 100, 'dense_units': 500},
    {'embedding_dim': 100, 'dense_units': 1000},
]
cnn_hyperparams = [
    {'embedding_dim': 100, 'num_filters': 64, 'kernel_size': 5, 'dense_units': 128},
    {'embedding_dim': 100, 'num_filters': 128, 'kernel_size': 3, 'dense_units': 64},
]
lstm_hyperparams = [
    {'embedding_dim': 100, 'lstm_units': 64, 'dense_units': 128, 'dropout_rate': 0.2},
    {'embedding_dim': 100, 'lstm_units': 128, 'dense_units': 64, 'dropout_rate': 0.5},
]

# Training and evaluation

In [None]:
test_loss, test_accuracy = best_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy of best model: {test_accuracy:.4f}")

# Additional Metrics (New Cell)
from sklearn.metrics import precision_score, recall_score, f1_score
y_pred = (best_model.predict(X_test) > 0.5).astype(int)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

In [None]:
# Save the best model
best_model.save('best_sentiment_model.h5')



# --- Step 4: Create GUI ---

In [None]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    sequence = [word_index.get(word) for word in words if word_index.get(word) and word_index.get(word) < vocab_size]
    return pad_sequences([sequence], maxlen=max_length)

In [None]:
def predict_sentiment():
    review = entry.get().strip()
    if not review:
        messagebox.showwarning("Input Error", "Please enter a review!")
        return
    padded = preprocess_text(review)
    prediction = best_model.predict(padded, verbose=0)[0][0]
    sentiment = "positive" if prediction > 0.5 else "negative"
    result_label.config(text=f"Predicted sentiment: {sentiment} (Probability: {prediction:.4f})")

In [None]:
# Set up GUI
root = tk.Tk()
root.title("Sentiment Analysis")
root.geometry("400x200")

tk.Label(root, text="Enter your review:").pack(pady=5)
entry = tk.Entry(root, width=50)
entry.pack(pady=5)

button = tk.Button(root, text="Predict Sentiment", command=predict_sentiment)
button.pack(pady=10)

result_label = tk.Label(root, text="", font=("Arial", 12))
result_label.pack(pady=10)

root.mainloop()