In [None]:
import kagglehub

# Download latest version
# Fetch the IMDB 50k movie-review dataset through kagglehub and keep the value
# returned by `dataset_download` (printed below as the dataset files path).
# NOTE(review): `path` is not referenced by any other cell shown in this notebook.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

# Models/get_models.py

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
    GRU,
    Input,
    GlobalAveragePooling1D,
    MultiHeadAttention,
    LayerNormalization,
    Add,
)


def get_machine_learning_models():
    """
    Build the classical (scikit-learn) classifiers used in the comparison.

    Returns:
        dict: Display name -> freshly constructed, unfitted estimator, in the
        order Logistic Regression, Random Forest, Naive Bayes.
    """
    estimators = {}

    # max_iter is raised to 1000 for the logistic regression solver; the other
    # estimators are constructed with their library defaults.
    estimators["Logistic Regression"] = LogisticRegression(max_iter=1000)
    estimators["Random Forest"] = RandomForestClassifier()
    estimators["Naive Bayes"] = MultinomialNB()

    return estimators


def _build_recurrent_classifier(recurrent_layer_cls, vocab_size, max_len, embed_dim):
    """Embedding -> 64-unit recurrent layer -> Dropout(0.5) -> sigmoid output."""
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    x = recurrent_layer_cls(64)(x)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation="sigmoid")(x)
    return Model(inputs=inputs, outputs=outputs)


def get_deep_learning_models(vocab_size=20000, max_len=500, embed_dim=128, num_heads=4):
    """
    Returns a dictionary of deep learning models with their names as keys.

    The models are only constructed here; they are not compiled or trained.

    Args:
        vocab_size: Size of the token vocabulary (``input_dim`` of every
            Embedding layer).
        max_len: Length of each input sequence. Every model, including the
            feedforward one, takes inputs of shape ``(max_len,)``.
        embed_dim: Embedding width (``output_dim`` of every Embedding layer and
            ``key_dim`` of the attention layer).
        num_heads: Number of heads in the attention model.

    Returns:
        dict: Display name -> Keras model, in the order "Simple Feedforward",
        "LSTM", "GRU", "Transformer-Attention". Each ends in a single sigmoid
        unit.
    """
    models = {}

    # 1. Simple Feedforward
    # The input width follows max_len (it used to be hard-coded to 500, which
    # broke as soon as a different max_len was requested). The Dense stack is
    # applied directly to the input vector; there is no Embedding layer here.
    models["Simple Feedforward"] = Sequential(
        [
            Input(shape=(max_len,)),
            Dense(128, activation="relu"),
            Dropout(0.5),
            Dense(1, activation="sigmoid"),
        ]
    )

    # 2. LSTM Model
    models["LSTM"] = _build_recurrent_classifier(LSTM, vocab_size, max_len, embed_dim)

    # 3. GRU Model
    models["GRU"] = _build_recurrent_classifier(GRU, vocab_size, max_len, embed_dim)

    # 4. Transformer-like Model (simple attention block)
    trans_input = Input(shape=(max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(trans_input)
    # Self-attention: the embedded sequence is passed as both query and value.
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
    # Residual connection followed by layer normalization.
    x = Add()([x, attn_output])
    x = LayerNormalization()(x)
    # Average over the sequence axis to get one vector per example.
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation="sigmoid")(x)
    models["Transformer-Attention"] = Model(inputs=trans_input, outputs=output)

    return models

# src/preprocess_data.py

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def preprocess_data(df, max_features=20000, max_len=500):
    """
    Clean the review frame and build train/test TF-IDF and padded-sequence features.

    The cleaned text is split into train and test portions first, and the
    TF-IDF vectorizer and the tokenizer are fitted on the training portion
    only. Fitting them on the full corpus would let test-set vocabulary and
    document frequencies leak into the training features.

    Args:
        df: DataFrame with a "review" text column and a "sentiment" column
            containing "positive" / "negative".
        max_features: Vocabulary cap used for both the TF-IDF vectorizer and
            the tokenizer.
        max_len: Length every token sequence is padded/truncated to.

    Returns:
        tuple: (X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test,
        y_train, y_test). Both feature representations share the same
        train/test rows.
    """
    df = df.dropna().drop_duplicates()
    X = clean_text(df["review"])
    y = df["sentiment"].map({"positive": 1, "negative": 0}).astype("float32")

    # One split on the text keeps both representations and the labels aligned.
    X_text_train, X_text_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # TF-IDF: learn vocabulary and IDF weights from the training text only.
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_tfidf_train = vectorizer.fit_transform(X_text_train)
    X_tfidf_test = vectorizer.transform(X_text_test)
    print("TF-IDF embeddings generated successfully.")

    # Sequences: learn the word index from the training text only; words that
    # were not seen there map to the "<OOV>" token.
    tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_text_train)
    X_seq_train = pad_sequences(
        tokenizer.texts_to_sequences(X_text_train),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )
    X_seq_test = pad_sequences(
        tokenizer.texts_to_sequences(X_text_test),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )
    print("Sequence embeddings generated successfully.")

    print("Data split into training and testing sets successfully.")
    return X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test, y_train, y_test


def clean_text(X):
    """
    Normalize raw review text down to letters separated by single spaces.

    Steps, in order:
      1. HTML tags such as "<br />" are replaced by a space, so the tag name
         does not end up glued to neighbouring words.
      2. Apostrophes are removed, so contractions stay one token
         ("don't" -> "dont").
      3. Every other non-letter, non-whitespace character is replaced by a
         space rather than deleted, so words separated only by punctuation
         ("end...was") are not fused together.
      4. Whitespace runs are collapsed and the ends are stripped.

    Args:
        X: pandas Series of strings (the ``.str`` accessor is used).

    Returns:
        pandas Series with the same index holding the cleaned text.
    """
    X = X.str.replace(r"</?[a-zA-Z][^>]*>", " ", regex=True)
    X = X.str.replace("'", "", regex=False)
    X = X.str.replace(r"[^a-zA-Z\s]", " ", regex=True)
    X = X.str.replace(r"\s+", " ", regex=True).str.strip()
    print("Text data cleaned successfully.")
    return X


def get_embeddings(X, method="tfidf", max_features=20000, max_len=500):
    """
    Turn a collection of texts into a numeric representation.

    Note that the vectorizer / tokenizer is fitted on whatever ``X`` is passed
    in and is not returned, so this helper cannot be used to transform a
    separate held-out set with the same vocabulary.

    Args:
        X: Iterable of text documents.
        method: "tfidf" for a TF-IDF matrix, or "sequence" for padded integer
            token sequences.
        max_features: Vocabulary cap for the vectorizer / tokenizer.
        max_len: Sequence length used when ``method == "sequence"``; longer
            texts are truncated and shorter ones padded, both at the end.

    Returns:
        The output of ``TfidfVectorizer.fit_transform`` for "tfidf", or the
        output of ``pad_sequences`` for "sequence".

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    if method == "tfidf":
        vectorizer = TfidfVectorizer(max_features=max_features)
        X_vec = vectorizer.fit_transform(X)
        print("TF-IDF embeddings generated successfully.")
        return X_vec

    if method == "sequence":
        tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
        tokenizer.fit_on_texts(X)
        sequences = tokenizer.texts_to_sequences(X)
        padded = pad_sequences(
            sequences, maxlen=max_len, padding="post", truncating="post"
        )
        print("Sequence embeddings generated successfully.")
        return padded

    # Previously an unknown method fell through and returned None, which only
    # surfaced later as a confusing error in the caller.
    raise ValueError(
        f"Unsupported embedding method: {method!r} (expected 'tfidf' or 'sequence')"
    )

# src/train_models.py

In [None]:
def train_models(X_tfidf_train, y_train, X_seq_train, epochs=10, batch_size=32, verbose=0):
    """
    Trains and returns a dictionary of machine learning and deep learning models.

    Args:
        X_tfidf_train: Training features for the scikit-learn models.
        y_train: Training labels shared by all models.
        X_seq_train: Training sequences for the Keras models. It must expose
            ``.shape``; its second dimension is used as the models' input
            length (assumes a 2-D array such as ``pad_sequences`` output).
        epochs: Number of epochs for each Keras model.
        batch_size: Batch size for each Keras model.
        verbose: Keras ``fit`` verbosity (0 keeps the training loop silent).

    Returns:
        dict: Display name -> fitted model, scikit-learn models first, then
        the Keras models.
    """
    ml_models = get_machine_learning_models()
    # Build the networks for the actual sequence length instead of relying on
    # the builder's default matching the preprocessing settings.
    dl_models = get_deep_learning_models(max_len=X_seq_train.shape[1])

    # Train machine learning models on TF-IDF features
    for name, model in ml_models.items():
        print(f"Training {name} model...")
        model.fit(X_tfidf_train, y_train)
        print(f"{name} model trained successfully.")

    # Train deep learning models on sequence data
    for name, model in dl_models.items():
        print(f"Training {name} model...")
        model.compile(
            optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
        )
        model.fit(
            X_seq_train,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,
        )
        print(f"{name} model trained successfully.")

    print("All models trained successfully.")
    return {**ml_models, **dl_models}

# test/evaluate.py

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Model


def evaluate_model(model, test_data, test_labels):
    """
    Score a fitted model on held-out data and report its accuracy.

    Args:
        model: A trained scikit-learn or Keras model exposing ``predict``.
        test_data: Features to predict on.
        test_labels: Ground-truth labels for ``test_data``.

    Returns:
        float: Accuracy of the model's predictions against ``test_labels``.

    Raises:
        ValueError: If ``model`` has no ``predict`` attribute.
    """
    is_keras = isinstance(model, Model)
    label = model.name if is_keras else model.__class__.__name__
    print(f"Evaluating model: {label}")

    # Guard clause: anything without predict() cannot be evaluated.
    if not hasattr(model, "predict"):
        raise ValueError(f"Unsupported model type: {type(model)}")

    if is_keras:
        raw_output = model.predict(test_data, verbose=0)
        if raw_output.shape[-1] == 1:
            # Single sigmoid unit: threshold the probability at 0.5.
            predicted = (raw_output > 0.5).astype("int32").flatten()
        else:
            # Several output units: take the highest-scoring class.
            predicted = np.argmax(raw_output, axis=-1)
    else:
        # scikit-learn estimators return class labels directly.
        predicted = model.predict(test_data)

    score = accuracy_score(test_labels, predicted)
    print(f"{label} Accuracy: {score:.2f}")
    return score

# src/visualize_most_freq_words.py

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def visualize_most_frequent_words(df):
    """
    Show side-by-side word clouds for positive and negative reviews.

    Args:
        df: DataFrame with a "review" text column and a "sentiment" column
            holding "positive" / "negative".

    The positive cloud is drawn on a white background in the left panel; the
    negative cloud uses a black background with the "Reds" colormap in the
    right panel.
    """
    sentiment = df["sentiment"]
    positive_text = df.loc[sentiment == "positive", "review"].str.cat(sep=" ")
    negative_text = df.loc[sentiment == "negative", "review"].str.cat(sep=" ")

    positive_cloud = WordCloud(
        width=800, height=400, background_color="white"
    ).generate(positive_text)
    negative_cloud = WordCloud(
        width=800, height=400, background_color="black", colormap="Reds"
    ).generate(negative_text)

    panels = (
        (1, positive_cloud, "Most Frequent Positive Words"),
        (2, negative_cloud, "Most Frequent Negative Words"),
    )

    plt.figure(figsize=(14, 6))
    for position, cloud, heading in panels:
        plt.subplot(1, 2, position)
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(heading)

    plt.tight_layout()
    plt.show()


# src/main.py

In [None]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd


def main(data_path="/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"):
    """
    Run the full pipeline: load, preprocess, train, evaluate, visualize.

    Args:
        data_path: Location of the IMDB reviews CSV. Defaults to the Kaggle
            input path this notebook was written against; pass another path
            to run elsewhere.
    """
    print("*" * 20, "Main", "*" * 20)

    # Step 1: Load dataset
    df = pd.read_csv(data_path)
    print("Data loaded successfully.")

    # Step 2: Preprocess text → returns TF-IDF and Sequence embeddings
    X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test, y_train, y_test = (
        preprocess_data(df)
    )
    print("Data preprocessed successfully.")

    # Step 3: Train all models
    models = train_models(X_tfidf_train, y_train, X_seq_train)
    print("Models trained successfully.")

    # Step 4: Evaluate each model using appropriate test input
    accuracy_results = {}
    for name, model in models.items():
        # Pick the features by model type rather than by a hard-coded list of
        # names, so a newly added scikit-learn model is not fed sequences by
        # mistake: Keras models take padded sequences, the rest take TF-IDF.
        X_test = X_seq_test if isinstance(model, Model) else X_tfidf_test

        accuracy = evaluate_model(model, X_test, y_test)
        accuracy_results[name] = accuracy
        print(f"{name} Accuracy: {accuracy:.2f}")

    print("Model evaluation completed.")
    print("Accuracy Results:", accuracy_results)

    # Visualize the most frequent positive and negative words
    visualize_most_frequent_words(df)
    print("*" * 20, "Return", "*" * 20)


# Entry point: run the whole pipeline when this code is executed directly.
# NOTE(review): in a notebook kernel `__name__` is normally "__main__" as well,
# so executing this cell starts the full (long-running) training run — confirm
# that is intended.
if __name__ == "__main__":
    main()