In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [3]:
# Experiment 2
# Define activation functions to search
# Importing necessary libraries
import kagglehub
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.regularizers import l1, l2

#Required for the notebook's reproduciiility
np.random.seed(2)
tf.random.set_seed(2)

# Download latest version
path = kagglehub.dataset_download("muhammadtahir194/movies-dataset-tmdb-top-rated")

print("Path to dataset files:", path)
file_path=""
for file in os.listdir(path):
    if file.endswith(".csv"):
        file_path = os.path.join(path, file)
        break

df = pd.read_csv(file_path)
print(df.head())
df.dropna(inplace=True)
df['is_popular'] = df['popularity'] > 4.0
df['is_popular'] = df['is_popular'].astype(int) #form of one-hot encodinng

# Convert the release_date column to datetime format and extracting only the year
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year.astype(int)
df.drop(columns=['release_date'], inplace=True)

df.drop(columns=['title', 'overview'], inplace=True)
print(df.head())
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = df.drop(columns=['id','popularity', 'is_popular']), df['is_popular']
print("X shape:", X.shape)
print("y shape:", y.shape)

stratify_val = y if len(np.unique(y)) < 20 else None
print("Unique y values in the dataset: ", len(np.unique(y)))

# Splitting data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=2, stratify=stratify_val)
stratify_val_temp = y_temp if stratify_val is not None else None
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=2, stratify=stratify_val_temp)

print(f"Training Set: {X_train.shape}, Validation Set: {X_val.shape}, Test Set: {X_test.shape}")

# Scaling the numerical features
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])




Path to dataset files: /kaggle/input/movies-dataset-tmdb-top-rated
        id                     title  \
0      278  The Shawshank Redemption   
1      238             The Godfather   
2      240     The Godfather Part II   
3      424          Schindler's List   
4  1356039             Counterattack   

                                            overview release_date  popularity  \
0  Imprisoned in the 1940s for the double murder ...   1994-09-23       5.522   
1  Spanning the years 1945 to 1955, a chronicle o...   1972-03-14       5.317   
2  In the continuing saga of the Corleone crime f...   1974-12-20       4.747   
3  The true story of how businessman Oskar Schind...   1993-12-15       4.457   
4  When a hostage rescue mission creates a new en...   2025-02-27       9.430   

   vote_average  vote_count  
0         8.708       27883  
1         8.689       21151  
2         8.570       12771  
3         8.567       16219  
4         8.524         431  
        id  popularity  v

In [6]:
!pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [7]:
import keras_tuner as kt
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam, RMSprop, SGD

ACTIVATION_FUNCTIONS = ['relu', 'tanh', 'sigmoid', 'elu', 'leaky_relu']

def build_model(hp):
    neurons = hp.Int('neurons', min_value=8, max_value=512, step=8)
    layers = hp.Int('layers', min_value=2, max_value=6, step=1)
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[0.001, 0.0005, 0.0001])
    optimizer = hp.Choice('optimizer', values=['adam', 'rmsprop', 'sgd'])
    activation_list = [hp.Choice(f'activation_{i}', values=ACTIVATION_FUNCTIONS) for i in range(layers)]
    batch_norm = hp.Boolean('batch_norm')

    model = Sequential()

    # First layer
    if activation_list[0] == 'leaky_relu':
        model.add(Dense(neurons, input_shape=(X_train.shape[1],)))
        model.add(LeakyReLU(alpha=0.01))
    else:
        model.add(Dense(neurons, activation=activation_list[0], input_shape=(X_train.shape[1],)))

    if batch_norm:
        model.add(BatchNormalization())

    model.add(Dropout(dropout_rate))

    # Hidden layers
    for i in range(1, layers):
        model.add(Dense(neurons))
        if activation_list[i] == 'leaky_relu':
            model.add(LeakyReLU(alpha=0.01))
        else:
            model.add(Dense(neurons, activation=activation_list[i]))

        if batch_norm:
            model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    # Optimizer selection
    if optimizer == 'adam':
        optimizer = Adam(learning_rate)
    elif optimizer == 'rmsprop':
        optimizer = RMSprop(learning_rate)
    else:
        optimizer = SGD(learning_rate)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Custom tuner to prioritize test accuracy
class CustomTuner(kt.Hyperband):
    def __init__(self, *args, X_test, y_test, **kwargs):
        super().__init__(*args, **kwargs)
        self.X_test = X_test
        self.y_test = y_test
        self.test_scores = {}

    def on_trial_end(self, trial):
        """Override method to evaluate test accuracy."""
        model = self.load_model(trial)  # Use trial as an argument
        _, test_acc = model.evaluate(self.X_test, self.y_test, verbose=0)
        self.test_scores[trial.trial_id] = test_acc
        super().on_trial_end(trial)

    def get_best_trial(self):
        """Find the best trial based on test accuracy first, then validation accuracy."""
        trials_list = list(self.oracle.trials.values())  # Get trials from the oracle
        sorted_trials = sorted(
            trials_list,
            key=lambda trial: (-self.test_scores.get(trial.trial_id, 0), -trial.score),
        )
        return sorted_trials[0] if sorted_trials else None

    def get_best_model(self):
        """Return the best model based on the best trial."""
        best_trial = self.get_best_trial()
        return self.load_model(best_trial) if best_trial else None


# Initialize the custom tuner
tuner = CustomTuner(
    build_model,
    objective='val_accuracy',
    max_epochs=50,
    factor=3,
    directory='my_tuner_dir',
    project_name='movie_popularity',
    X_test=X_test,  # Pass test data
    y_test=y_test
)

def tuner_search():
    # Perform hyperparameter search
    tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val), batch_size=16)

    # Get the best hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print("Best Hyperparameters:", best_hps.values)

    # Get the best model based on test accuracy
    best_model = tuner.get_best_model()

    # Evaluate the best model
    test_loss, test_acc = best_model.evaluate(X_test, y_test)
    print(f"Best Model Test Accuracy: {test_acc:.4f}")

    # Return the best model
    return best_model, best_hps.values, test_acc

keras_model, keras_model_arams, keras_model_test_acc = tuner_search()


Trial 90 Complete [00h 01m 10s]
val_accuracy: 0.8940809965133667

Best val_accuracy So Far: 0.9034267663955688
Total elapsed time: 00h 45m 32s
Best Hyperparameters: {'neurons': 472, 'layers': 2, 'dropout_rate': 0.5, 'learning_rate': 0.001, 'optimizer': 'rmsprop', 'activation_0': 'leaky_relu', 'activation_1': 'relu', 'batch_norm': False, 'activation_2': 'leaky_relu', 'activation_3': 'elu', 'activation_4': 'tanh', 'activation_5': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8835 - loss: 0.2888
Best Model Test Accuracy: 0.8847
