## Model Description

The **Multilayer Perceptron (MLP)** is a type of artificial neural network that learns a mapping from inputs to outputs using a supervised learning algorithm. It consists of multiple layers of interconnected neurons, making it suitable for capturing complex patterns in data.

### Key Features of the MLP Model:
- **Hidden Layers**: Configurable architecture with support for varying sizes of hidden layers.
- **Activation Functions**: Flexible choice of activation functions like ReLU and Tanh.
- **Optimization Algorithms**: Supports solvers like Adam and SGD for weight optimization.
- **Regularization**: Includes an `alpha` parameter to control L2 regularization.
- **Adaptive Learning**: Utilizes an adaptive learning rate for efficient convergence.

## Training Process

The training pipeline includes the following steps:
   
2. **Cross-Validation and Hyperparameter Tuning**:
   - A `GridSearchCV` approach is used to perform exhaustive hyperparameter tuning with 5-fold stratified cross-validation, ensuring robust model evaluation.
   - The hyperparameters optimized include the number of hidden layers, activation functions, solvers, learning rates, and the number of training iterations.

In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from model_utils import run_classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import optuna
from sklearn.model_selection import cross_val_score
import joblib
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

# Fix the shape of y_train and y_test
y_train = y_train.values.ravel()  # Convert to 1D array
y_test = y_test.values.ravel()    # Convert to 1D array

### Function to Record Trained Models

In [3]:
import pandas as pd

# Initialize an empty DataFrame to store model results
model_records = pd.DataFrame(columns=["Model Name", "Hyperparameters", "Test Accuracy", "Test Precision", "Test Recall"])

def record_trained_model(model_name, params, mean_cv_accuracy, test_metrics):
    """
    Record a trained model's details in a pandas DataFrame.

    Args:
        model_name (str): Name of the model.
        params (dict): Hyperparameters used for training.
        mean_cv_accuracy (float): Mean cross-validation accuracy (optional).
        test_metrics (dict): Test metrics such as accuracy, precision, and recall.
    """
    global model_records  # Use the global DataFrame

    # Create a new record
    record = {
        "Model Name": model_name,
        "Hyperparameters": params,
        "Test Accuracy": f"{test_metrics['accuracy']:.2%}",
        "Test Precision": f"{test_metrics['precision']:.2%}",
        "Test Recall": f"{test_metrics['recall']:.2%}",
    }

    # Append the record to the DataFrame
    model_records = pd.concat([model_records, pd.DataFrame([record])], ignore_index=True)

    print(f"Model '{model_name}' recorded successfully!")

### Train with Default Parameters
- Objective: Train a baseline MLPClassifier without tuning.
- Purpose: Provides a baseline performance for comparison.

In [4]:
# Default MLPClassifier
mlp_default = MLPClassifier(random_state=123)

print("Train with Default Parameters")
best_model_phase1 = run_classifier(mlp_default, {}, X_train, y_train, X_test, y_test, "Default Neural Net")

Train with Default Parameters

--- RandomizedSearchCV (Default Neural Net) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={}, random_state=123,
                   return_train_score=True, scoring='accuracy')

--- Cross-Validation Results (Default Neural Net) ---
The best parameters are: {}
Mean cross-validation accuracy: 79.28%

--- Test Results ---
Accuracy: 88.46%
Precision: 88.56%
Recall: 88.46%


In [5]:
# Record default model performance
default_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase1.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase1.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase1.predict(X_test), average='weighted')
}
record_trained_model("Default Neural Net", {}, np.nan, default_test_metrics)

Model 'Default Neural Net' recorded successfully!


In [6]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%


### Simple Hyperparameter Tuning
- Objective: Experiment with a small grid of hyperparameters.
- Purpose: Improves performance with minimal computational cost.

In [7]:
# Simple Parameter Grid
param_grid_simple = {
    'hidden_layer_sizes': [(10,), (50,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001],
    'max_iter': [100]
}

print("\nPhase 2: Apply Simple Hyperparameter Tuning")
best_model_phase2 = run_classifier(MLPClassifier(random_state=123), param_grid_simple, X_train, y_train, X_test, y_test, "Tuned Neural Net (Simple Hyperparameter)")


Phase 2: Apply Simple Hyperparameter Tuning



--- RandomizedSearchCV (Tuned Neural Net (Simple Hyperparameter)) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={'activation': ['relu'],
                                        'alpha': [0.0001],
                                        'hidden_layer_sizes': [(10,), (50,)],
                                        'max_iter': [100], 'solver': ['adam']},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Tuned Neural Net (Simple Hyperparameter)) ---
The best parameters are: {'solver': 'adam', 'max_iter': 100, 'hidden_layer_sizes': (10,), 'alpha': 0.0001, 'activation': 'relu'}
Mean cross-validation accuracy: 77.53%

--- Test Results ---
Accuracy: 83.08%
Precision: 82.27%
Recall: 83.08%


In [8]:
# Record simple hyperparameter tuning model performance
simple_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase2.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase2.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase2.predict(X_test), average='weighted')
}
record_trained_model("Tuned Neural Net (Simple Hyperparameter)", best_model_phase2.get_params(), np.nan, simple_test_metrics)

Model 'Tuned Neural Net (Simple Hyperparameter)' recorded successfully!


In [9]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%


### Advanced Hyperparameter Tuning
- Objective: Use an expanded parameter grid for more robust tuning.
- Purpose: Optimizes the model for higher accuracy, precision, and recall.

In [10]:
# Advanced Parameter Grid
param_grid_advanced = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 500]
}

print("\nAdvanced Hyperparameter Tuning")
best_model_phase3 = run_classifier(MLPClassifier(random_state=123), param_grid_advanced, X_train, y_train, X_test, y_test, "Tuned Neural Net (Advanced HyperParameter)")


Advanced Hyperparameter Tuning



--- RandomizedSearchCV (Tuned Neural Net (Advanced HyperParameter)) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={'activation': ['relu', 'tanh'],
                                        'alpha': [0.0001, 0.001, 0.01],
                                        'hidden_layer_sizes': [(10,), (50,),
                                                               (100,)],
                                        'learning_rate': ['constant',
                                                          'adaptive'],
                                        'max_iter': [200, 500],
                                        'solver': ['adam', 'sgd']},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Tuned Neural Net (Advanced HyperParameter)) ---
The best parameters are: {'solver': 

In [11]:
# Record advanced hyperparameter tuning model performance
advanced_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase3.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase3.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase3.predict(X_test), average='weighted')
}
record_trained_model("Tuned Neural Net (Advanced HyperParameter)", best_model_phase3.get_params(), np.nan, advanced_test_metrics)

Model 'Tuned Neural Net (Advanced HyperParameter)' recorded successfully!


In [12]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%
2,Tuned Neural Net (Advanced HyperParameter),"{'activation': 'relu', 'alpha': 0.001, 'batch_...",87.01%,86.62%,87.01%


### Full Hyperparameter Grid
- Objective: Tune the MLPClassifier with all possible parameters for maximum performance.
- Purpose: Achieve the best possible model, but at higher computational cost.

In [13]:
param_grid_all = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50)],
             'activation': ['identity', 'logistic', 'tanh', 'relu'],
             'solver': ['lbfgs', 'sgd', 'adam'],
             'alpha': np.logspace(-5, 3, 5),
             'learning_rate': ['constant', 'invscaling','adaptive'],
             'max_iter': [100, 500, 1000]}

print("\nFull Hyperparameter Grid")
best_model_phase4 = run_classifier(MLPClassifier(random_state=123), param_grid_all,  X_train, y_train, X_test, y_test, 'Full Hyperparameter Neural Net')


Full Hyperparameter Grid



--- RandomizedSearchCV (Full Hyperparameter Neural Net) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={'activation': ['identity', 'logistic',
                                                       'tanh', 'relu'],
                                        'alpha': array([1.e-05, 1.e-03, 1.e-01, 1.e+01, 1.e+03]),
                                        'hidden_layer_sizes': [(10,), (50,),
                                                               (10, 10),
                                                               (50, 50)],
                                        'learning_rate': ['constant',
                                                          'invscaling',
                                                          'adaptive'],
                                        'max_iter': [100, 500, 1000],
                                     

In [14]:
# Record full hyperparameter tuning model performance
full_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase4.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase4.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase4.predict(X_test), average='weighted')
}
record_trained_model("Full Hyperparameter Neural Net", best_model_phase4.get_params(), np.nan, full_test_metrics)

Model 'Full Hyperparameter Neural Net' recorded successfully!


In [15]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%
2,Tuned Neural Net (Advanced HyperParameter),"{'activation': 'relu', 'alpha': 0.001, 'batch_...",87.01%,86.62%,87.01%
3,Full Hyperparameter Neural Net,"{'activation': 'identity', 'alpha': 1e-05, 'ba...",89.74%,89.90%,89.74%


### Hyperparameter Tuning with Optuna
- Objective: Tune the MLPClassifier with best hyperparameter found by from optuna
- Purpose: Achieve the best possible model, with lower computational cost.

In [16]:
# Objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    hidden_layer_sizes = trial.suggest_categorical("hidden_layer_sizes", [(10,), (50,), (10, 10), (50, 50)])
    activation = trial.suggest_categorical("activation", ['identity', 'logistic', 'tanh', 'relu'])
    solver = trial.suggest_categorical("solver", ['lbfgs', 'sgd', 'adam'])
    alpha = trial.suggest_float("alpha", 1e-5, 1e3, log=True)
    learning_rate = trial.suggest_categorical("learning_rate", ['constant', 'invscaling', 'adaptive'])
    max_iter = trial.suggest_categorical("max_iter", [100, 500, 1000])

    # Create and evaluate the model
    model = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=max_iter,
        random_state=123
    )

    # Use cross-validation to evaluate the model
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    mean_cv_score = np.mean(cv_scores)

    return mean_cv_score

In [17]:
# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2024-12-29 13:12:15,556] A new study created in memory with name: no-name-0bb9c049-dedd-4ac6-89e6-e323e4e7f51f


[I 2024-12-29 13:13:45,804] Trial 0 finished with value: 0.8751404653219008 and parameters: {'hidden_layer_sizes': (10, 10), 'activation': 'logistic', 'solver': 'lbfgs', 'alpha': 0.10980121841013941, 'learning_rate': 'adaptive', 'max_iter': 500}. Best is trial 0 with value: 0.8751404653219008.
[I 2024-12-29 13:15:21,405] Trial 1 finished with value: 0.8759425184545474 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'identity', 'solver': 'adam', 'alpha': 0.0009033291628834242, 'learning_rate': 'adaptive', 'max_iter': 500}. Best is trial 1 with value: 0.8759425184545474.
[I 2024-12-29 13:15:28,859] Trial 2 finished with value: 0.870011617625892 and parameters: {'hidden_layer_sizes': (10,), 'activation': 'identity', 'solver': 'lbfgs', 'alpha': 0.05960072180342096, 'learning_rate': 'constant', 'max_iter': 100}. Best is trial 1 with value: 0.8759425184545474.
[I 2024-12-29 13:15:31,641] Trial 3 finished with value: 0.2502003526412107 and parameters: {'hidden_layer_sizes': (10

In [18]:
# Extract the best parameters
best_params = study.best_params
print("\nBest Hyperparameters:", best_params)


Best Hyperparameters: {'hidden_layer_sizes': (50, 50), 'activation': 'identity', 'solver': 'lbfgs', 'alpha': 0.0002712909526978834, 'learning_rate': 'adaptive', 'max_iter': 500}


In [27]:
optuna.visualization.plot_optimization_history(study)

In [28]:
optuna.visualization.plot_slice(study)

In [19]:
# Train the best model with the optimized hyperparameters
best_model_phase5 = MLPClassifier(**best_params, random_state=123)
best_model_phase5.fit(X_train, y_train)

In [20]:
# Evaluate the best model
y_test_pred = best_model_phase5.predict(X_test)
test_metrics_phase5 = {
    "accuracy": accuracy_score(y_test, y_test_pred),
    "precision": precision_score(y_test, y_test_pred, average='weighted'),
    "recall": recall_score(y_test, y_test_pred, average='weighted')
}


In [21]:
# Record best model performance
record_trained_model("Optuna Tuned Neural Net", best_params, np.nan, test_metrics_phase5)

Model 'Optuna Tuned Neural Net' recorded successfully!


### Evaluate and Compare Models

In [22]:
# Display Results in a Table
results_df = pd.DataFrame(model_records)
results_df.sort_values(by="Test Accuracy", ascending=True, inplace=True)
print("Model Performance Comparison:")
results_df

Model Performance Comparison:


Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%
2,Tuned Neural Net (Advanced HyperParameter),"{'activation': 'relu', 'alpha': 0.001, 'batch_...",87.01%,86.62%,87.01%
0,Default Neural Net,{},88.46%,88.56%,88.46%
3,Full Hyperparameter Neural Net,"{'activation': 'identity', 'alpha': 1e-05, 'ba...",89.74%,89.90%,89.74%
4,Optuna Tuned Neural Net,"{'hidden_layer_sizes': (50, 50), 'activation':...",89.83%,89.90%,89.83%


In [24]:
print("\nEvaluate and Compare Models")
models = {
    "Default Model": best_model_phase1,
    "Simple Hyperparameter Tuning": best_model_phase2,
    "Advanced Hyperparameter Tuning": best_model_phase3,
    "Full Hyperparameter Grid": best_model_phase4,
    "Hyperparameter Tuning with Optuna": best_model_phase5
}

for name, model in models.items():
    print(f"\n--- Evaluating {name} ---")
    y_test_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')

    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")


Evaluate and Compare Models

--- Evaluating Default Model ---
Accuracy: 88.46%
Precision: 88.56%
Recall: 88.46%

--- Evaluating Simple Hyperparameter Tuning ---
Accuracy: 83.08%
Precision: 82.27%
Recall: 83.08%

--- Evaluating Advanced Hyperparameter Tuning ---
Accuracy: 87.01%
Precision: 86.62%
Recall: 87.01%

--- Evaluating Full Hyperparameter Grid ---


Accuracy: 89.74%
Precision: 89.90%
Recall: 89.74%

--- Evaluating Hyperparameter Tuning with Optuna ---
Accuracy: 89.83%
Precision: 89.90%
Recall: 89.83%


### Select the Best Model

In [25]:
# Select the model with the highest accuracy
best_model_name = max(models, key=lambda name: accuracy_score(y_test, models[name].predict(X_test)))
best_model = models[best_model_name]

print(f"\nThe Best Model is '{best_model_name}'")
print(best_model)



The Best Model is 'Hyperparameter Tuning with Optuna'
MLPClassifier(activation='identity', alpha=0.0002712909526978834,
              hidden_layer_sizes=(50, 50), learning_rate='adaptive',
              max_iter=500, random_state=123, solver='lbfgs')


### Save the Best Model

In [26]:
def save_model(model, path, model_name="mlp_model.pkl"):
    """
    Save the trained model to a specified directory.

    Args:
        model: Trained model object.
        path (str): Directory path to save the model.
        model_name (str): File name for the saved model.
    """
    # Ensure the path exists
    os.makedirs(path, exist_ok=True)

    # Save the model
    file_path = os.path.join(path, model_name)
    joblib.dump(model, file_path)
    print(f"Model saved successfully at {file_path}!")

# Save the selected best model
save_model(best_model, path="../04_modelling/models/", model_name="mlp_neural_net.pkl")

Model saved successfully at ../04_modelling/models/mlp_neural_net.pkl!


### Deep Learning Model with AutoML

In [None]:
# Import Necessary Libraries
import pandas as pd
import h2o

# For hyperparameter optimization
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import optuna 

# Evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, mean_absolute_error 

In [None]:
# Initialize H2O cluster
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 7 mins
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 26 days
H2O_cluster_name:,H2O_from_python_Huawei_kmu9vd
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.884 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [None]:
# Step 1: Load and Prepare Data
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
X_val = pd.read_csv("../04_modelling/dataset/X_val.csv")
y_val = pd.read_csv("../04_modelling/dataset/y_val.csv")
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

In [None]:
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [None]:
train_h2o = h2o.H2OFrame(train_df)
val_h2o = h2o.H2OFrame(val_df)
test_h2o = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
# Define target and features
target = "yearly_compensation"
features = train_h2o.columns
if target in features:
    features.remove(target)

In [None]:
# Initialize a DataFrame to store model performance details
model_results = pd.DataFrame(columns=[
    "Model Name", "Hyperparameters", "MSE", "RMSE", "MAE", 
    "RMSLE", "Mean Residual Deviance", "R-Squared", "Additional Metrics"
])

# Helper function to record model performance
def record_model(name, hyperparameters, performance, additional_metrics=None):
    result = {
        "Model Name": name,
        "Hyperparameters": hyperparameters,
        "MSE": performance.mse(),
        "RMSE": performance.rmse(),
        "MAE": performance.mae(),
        "RMSLE": performance.rmsle(),
        "Mean Residual Deviance": performance.mean_residual_deviance(),
        "R-Squared": performance.r2(),
        "Additional Metrics": additional_metrics if additional_metrics else {}
    }
    
    # Append the result as a new row to the DataFrame
    global model_results  # Ensure we're working with the global DataFrame
    model_results = pd.concat([model_results, pd.DataFrame([result])], ignore_index=True)

### Train Default Deep Learning Model <br/>
The default deep learning model is trained with minimal hyperparameter customization. This serves as a baseline to compare against more complex models. The model uses a fixed random seed for reproducibility and evaluates its performance using metrics like RMSE, MAE, and R-squared.

In [None]:
# Train Default Deep Learning Model
default_dl_model = H2ODeepLearningEstimator(seed=42)
default_dl_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,50,Input,0.0,,,,,,,,,
,2,200,Rectifier,0.0,0.0,0.0,0.2004067,0.101903,0.0,0.0013314,0.0955141,0.4508021,0.0680175
,3,200,Rectifier,0.0,0.0,0.0,0.396211,0.330171,0.0,-0.015934,0.0712199,0.9565161,0.0254086
,4,1,Linear,,0.0,0.0,0.0067211,0.0030025,0.0,0.0024632,0.043793,-0.0192963,0.0

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2024-12-29 00:29:29,0.000 sec,,0.0,0,0.0,,,,,,,,
,2024-12-29 00:29:30,1.676 sec,4626 obs/sec,1.0,1,5459.0,1.188877,1.4134286,0.9284737,0.9753712,1.2615032,1.5913903,0.9682492,0.9722899
,2024-12-29 00:29:35,7.038 sec,7160 obs/sec,8.0,8,43672.0,0.3338209,0.1114364,0.2586734,0.9980582,0.419723,0.1761674,0.3156904,0.9969325
,2024-12-29 00:29:37,8.426 sec,7545 obs/sec,10.0,10,54590.0,0.3400327,0.1156222,0.264507,0.9979853,0.3886397,0.1510408,0.2982989,0.99737

variable,relative_importance,scaled_importance,percentage
job_title_Data_Engineer,1.0,1.0,0.0323628
used_tpu,0.8924959,0.8924959,0.0288836
ml_spending,0.8825805,0.8825805,0.0285628
country_Ukraine,0.8308105,0.8308105,0.0268873
country_United_States_of_America,0.7165332,0.7165332,0.0231890
country_SUMprofileTable_yearly_compensation,0.7129788,0.7129788,0.0230740
Total_Experience,0.7034249,0.7034249,0.0227648
country_Thailand,0.6894102,0.6894102,0.0223112
country_Austria,0.6892593,0.6892593,0.0223063
RecommendedLanguage_MEANprofileTable_yearly_compensation,0.6722112,0.6722112,0.0217546


In [None]:
# Evaluate Default Model
default_performance = default_dl_model.model_performance(test_h2o)
print("Default Deep Learning Model Performance:")
print(default_performance)

Default Deep Learning Model Performance:
ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.17463401534693082
RMSE: 0.41789234899305205
MAE: 0.32230853365925477
RMSLE: 0.1474763140332855
Mean Residual Deviance: 0.17463401534693082


In [None]:
# Record default model performance
record_model("Default Deep Learning Model", {"seed": 42}, default_performance)

### Hyperparameter Optimization with Optuna <br/>
Optuna performs hyperparameter tuning using an efficient sampling strategy. The objective function trains the model with sampled hyperparameters and evaluates its validation RMSE. The best hyperparameters are selected after multiple trials.

In [None]:
# Step 3: Hyperparameter Optimization using Optuna
def objective(trial):
    # Define hyperparameter search space
    params = {
        "epochs": trial.suggest_int("epochs", 10, 100),
        "hidden": trial.suggest_categorical("hidden", [[50, 50], [100, 100], [200, 200]]),
        "input_dropout_ratio": trial.suggest_uniform("input_dropout_ratio", 0.0, 0.5),
        "l1": trial.suggest_loguniform("l1", 1e-6, 1e-3),
        "l2": trial.suggest_loguniform("l2", 1e-6, 1e-3),
        "activation": trial.suggest_categorical("activation", ["Rectifier", "Tanh", "Maxout"]),
    }
    
    # Train Deep Learning Model with hyperparameters
    model = H2ODeepLearningEstimator(**params, seed=42)
    model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
    
    # Get validation performance (use RMSE as optimization target)
    performance = model.model_performance(val_h2o)
    return performance.rmse()

In [None]:
# Perform hyperparameter optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)


[I 2024-12-29 00:29:42,859] A new study created in memory with name: no-name-e4712640-d386-4350-ae84-23414ccb4423


  "input_dropout_ratio": trial.suggest_uniform("input_dropout_ratio", 0.0, 0.5),
  "l1": trial.suggest_loguniform("l1", 1e-6, 1e-3),
  "l2": trial.suggest_loguniform("l2", 1e-6, 1e-3),


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:31:39,725] Trial 0 finished with value: 0.19440924416723668 and parameters: {'epochs': 42, 'hidden': [200, 200], 'input_dropout_ratio': 0.00026745724984855723, 'l1': 6.893661461455753e-06, 'l2': 0.0007896026535373353, 'activation': 'Maxout'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:32:35,049] Trial 1 finished with value: 1.3258625887990734 and parameters: {'epochs': 29, 'hidden': [200, 200], 'input_dropout_ratio': 0.28814156778198313, 'l1': 0.00027332229182007206, 'l2': 0.00017841999209665093, 'activation': 'Tanh'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:32:55,257] Trial 2 finished with value: 1.634747257157558 and parameters: {'epochs': 72, 'hidden': [100, 100], 'input_dropout_ratio': 0.318516456857556, 'l1': 0.00019395812581802688, 'l2': 7.3328662521291e-06, 'activation': 'Rectifier'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:33:16,431] Trial 3 finished with value: 1.2832072004390995 and parameters: {'epochs': 53, 'hidden': [50, 50], 'input_dropout_ratio': 0.14563924472250578, 'l1': 7.988093226347808e-06, 'l2': 1.081032819932921e-05, 'activation': 'Maxout'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:33:35,646] Trial 4 finished with value: 0.7368232134017909 and parameters: {'epochs': 52, 'hidden': [50, 50], 'input_dropout_ratio': 0.03874356909610871, 'l1': 2.4106046846133187e-05, 'l2': 2.144735567480832e-05, 'activation': 'Maxout'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:35:45,866] Trial 5 finished with value: 1.611695802529085 and parameters: {'epochs': 76, 'hidden': [200, 200], 'input_dropout_ratio': 0.34420899734364513, 'l1': 0.00024473462572799674, 'l2': 2.0137300272973865e-05, 'activation': 'Tanh'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:37:48,087] Trial 6 finished with value: 1.3204399784467176 and parameters: {'epochs': 48, 'hidden': [200, 200], 'input_dropout_ratio': 0.19764556109624187, 'l1': 0.0003363273435900321, 'l2': 0.00027856336597624997, 'activation': 'Maxout'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:38:02,415] Trial 7 finished with value: 0.5482873879057968 and parameters: {'epochs': 24, 'hidden': [100, 100], 'input_dropout_ratio': 0.01782356632052151, 'l1': 1.5156726405854706e-05, 'l2': 0.0008382895628739779, 'activation': 'Rectifier'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:38:13,056] Trial 8 finished with value: 1.0548193429126034 and parameters: {'epochs': 30, 'hidden': [50, 50], 'input_dropout_ratio': 0.11916323491184594, 'l1': 5.442544811038452e-05, 'l2': 6.270224339653445e-05, 'activation': 'Rectifier'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:38:31,243] Trial 9 finished with value: 1.2044782814433805 and parameters: {'epochs': 94, 'hidden': [50, 50], 'input_dropout_ratio': 0.1587875174035639, 'l1': 5.97442814239806e-06, 'l2': 3.12801044671335e-06, 'activation': 'Rectifier'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:39:13,122] Trial 10 finished with value: 2.06584952211662 and parameters: {'epochs': 14, 'hidden': [200, 200], 'input_dropout_ratio': 0.45207188149700694, 'l1': 1.2586308837063503e-06, 'l2': 1.0865552017995012e-06, 'activation': 'Maxout'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:39:27,923] Trial 11 finished with value: 0.35851053339713695 and parameters: {'epochs': 31, 'hidden': [100, 100], 'input_dropout_ratio': 0.004286391488563992, 'l1': 4.485454289832473e-06, 'l2': 0.000841119327548078, 'activation': 'Rectifier'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:39:45,385] Trial 12 finished with value: 0.7945864298144273 and parameters: {'epochs': 40, 'hidden': [100, 100], 'input_dropout_ratio': 0.060099704092667, 'l1': 1.925979397685232e-06, 'l2': 0.0009323099415308876, 'activation': 'Rectifier'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 00:40:42,497] Trial 13 finished with value: 0.31371684941472333 and parameters: {'epochs': 65, 'hidden': [100, 100], 'input_dropout_ratio': 0.006527424790946257, 'l1': 3.985843920009758e-06, 'l2': 0.00023080492111863114, 'activation': 'Maxout'}. Best is trial 0 with value: 0.19440924416723668.


deeplearning Model Build progress: |██████████████████████████████████

In [None]:
# Best Hyperparameters
best_params = study.best_params
print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'epochs': 52, 'hidden': [100, 100], 'input_dropout_ratio': 0.008342451696622868, 'l1': 0.00012151328066147507, 'l2': 0.0001678209282947037, 'activation': 'Tanh'}


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

### Train Deep Learning Model with Best Hyperparameters <br/>
The deep learning model is trained using the best hyperparameters found through Optuna. This approach aims to enhance performance metrics by optimizing key parameters such as the number of epochs, hidden layers, and dropout ratios.

In [None]:
# Train Deep Learning Model with Best Hyperparameters
tuned_dl_model = H2ODeepLearningEstimator(**best_params, seed=42)
tuned_dl_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,50,Input,0.8342452,,,,,,,,,
,2,100,Tanh,0.0,0.0001215,0.0001678,0.8816644,0.3059335,0.0,0.0001598,0.025435,0.001921,0.0380167
,3,100,Tanh,0.0,0.0001215,0.0001678,0.9831108,0.0942007,0.0,-0.0001319,0.0122083,-0.0021567,0.0421184
,4,1,Linear,,0.0001215,0.0001678,0.9147559,0.2606895,0.0,0.0154104,0.11596,0.1144471,0.0

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2024-12-28 22:52:17,0.000 sec,,0.0,0,0.0,,,,,,,,
,2024-12-28 22:52:20,2.822 sec,10470 obs/sec,5.1976552,1,28374.0,0.7517335,0.5651033,0.5743227,0.9901531,0.7582119,0.5748853,0.5646446,0.9899898
,2024-12-28 22:52:25,8.271 sec,10542 obs/sec,15.5909507,3,85111.0,0.5833219,0.3402645,0.4462536,0.9940709,0.6064818,0.3678202,0.4614304,0.9935953
,2024-12-28 22:52:32,14.579 sec,9952 obs/sec,26.0126397,5,142003.0,0.6031367,0.3637739,0.4707099,0.9936613,0.6191633,0.3833632,0.4926364,0.9933247
,2024-12-28 22:52:38,20.641 sec,9829 obs/sec,36.4149112,7,198789.0,0.5116001,0.2617346,0.3937612,0.9954393,0.5276004,0.2783621,0.405954,0.995153
,2024-12-28 22:52:44,26.703 sec,9763 obs/sec,46.7977652,9,255469.0,0.5268404,0.2775608,0.4120253,0.9951635,0.521938,0.2724193,0.4129254,0.9952565
,2024-12-28 22:52:47,29.809 sec,9731 obs/sec,52.0126397,10,283937.0,0.5515813,0.3042419,0.4376608,0.9946986,0.5575225,0.3108313,0.4421584,0.9945876
,2024-12-28 22:52:47,29.896 sec,9730 obs/sec,52.0126397,10,283937.0,0.5268404,0.2775608,0.4120253,0.9951635,0.521938,0.2724193,0.4129254,0.9952565

variable,relative_importance,scaled_importance,percentage
used_tpu,1.0,1.0,0.0274446
country_Ukraine,0.9816257,0.9816257,0.0269404
country_Philippines,0.9349760,0.9349760,0.0256601
ml_spending,0.9109827,0.9109827,0.0250016
Total_Experience,0.8950585,0.8950585,0.0245645
job_title_Data_Engineer,0.8669543,0.8669543,0.0237932
country_Singapore,0.8453551,0.8453551,0.0232005
country_Thailand,0.8075344,0.8075344,0.0221625
country_Spain,0.7780793,0.7780793,0.0213541
company_MEANprofileTable_yearly_compensation,0.7715148,0.7715148,0.0211739


In [None]:
# Evaluate Tuned Model
tuned_performance = tuned_dl_model.model_performance(test_h2o)
print("Tuned Deep Learning Model Performance:")
print(tuned_performance)


Tuned Deep Learning Model Performance:
ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.3033611197459497
RMSE: 0.5507822798038711
MAE: 0.43116396331180634
RMSLE: 0.166014045295754
Mean Residual Deviance: 0.3033611197459497


In [None]:
# Record tuned model performance
record_model("Tuned Deep Learning Model", best_params, tuned_performance)

In [None]:
# Compare Predictions with Actual
predictions = tuned_dl_model.predict(test_h2o)
actual_values = y_test.to_numpy().ravel()
rounded_predictions = predictions.as_data_frame().to_numpy().ravel().round().astype(int)

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%





In [None]:
accuracy = accuracy_score(actual_values, rounded_predictions)
print(f"Accuracy on Test Data: {accuracy:.2f}")

Accuracy on Test Data: 0.66


### Early Stopping Model <br/>
This model incorporates early stopping to prevent overfitting. Training halts if the model's RMSE does not improve within a specified number of rounds. This technique ensures efficient use of computational resources and reduces overtraining risks.


In [None]:
# Retrieve and print the best hyperparameters
best_params = study.best_params

# Train a early stopping model with the best hyperparameters and early stopping
early_stopping_model = H2ODeepLearningEstimator(**best_params, 
                                       stopping_metric="rmse", 
                                       stopping_rounds=5, 
                                       stopping_tolerance=0.01,
                                       seed=42)
early_stopping_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# Evaluate the final model on the test set
final_performance = early_stopping_model.model_performance(test_h2o)
print("Final Model Performance on Test Set:")
print(f"RMSE: {final_performance.rmse():.2f}")
print(f"MAE: {final_performance.mae():.2f}")
print(f"R-squared: {final_performance.r2():.2f}")

In [None]:
# Record the performance of the early stopping model
record_model("Early Stopping Model", best_params, final_performance)

### Ensemble Model <br/>
An ensemble of models is trained with shared hyperparameters but varied hidden layer configurations. Predictions are averaged to improve robustness and reduce variance. The ensemble is evaluated using metrics like MAE and R-squared.


In [None]:
# Retrieve best hyperparameters from Optuna
best_params = study.best_params

# Extract shared hyperparameters for the ensemble
common_params = {
    "epochs": best_params.get("epochs", 100),
    "input_dropout_ratio": best_params.get("input_dropout_ratio", 0.0),
    "l1": best_params.get("l1", 1e-6),
    "l2": best_params.get("l2", 1e-6),
    "activation": best_params.get("activation", "Rectifier"),
    "seed": 42,
}

# Train Multiple Models for Ensembling using the best parameters
models = []
for hidden in [[50, 50], [100, 100], [200, 200]]:
    # Combine best params with unique hidden layers
    model_params = {**common_params, "hidden": hidden}
    
    # Train the model
    model = H2ODeepLearningEstimator(**model_params)
    model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
    models.append(model)

# Ensemble Predictions (Averaging)
predictions = [model.predict(test_h2o).as_data_frame()["predict"].to_numpy() for model in models]
ensemble_prediction = np.mean(predictions, axis=0)

# Evaluate Ensemble using Mean Absolute Error
y_test = test_h2o[target].as_data_frame().to_numpy().ravel()
mae_ensemble = mean_absolute_error(y_test, ensemble_prediction)
print(f"Mean Absolute Error (MAE) for Ensemble: {mae_ensemble:.2f}")

# Add R-squared Evaluation
total_variance = np.sum((y_test - np.mean(y_test)) ** 2)
residual_variance = np.sum((y_test - ensemble_prediction) ** 2)
r_squared = 1 - (residual_variance / total_variance)
print(f"R-squared for Ensemble: {r_squared:.2f}")

In [None]:
# Record ensemble performance
record_model("Ensemble Model", {"common_params": best_params, "hidden_layers": [[50, 50], [100, 100], [200, 200]]}, final_performance, {"MAE": mae_ensemble})


### Final Evaluation and Model Saving <br/>
All trained models are compared based on RMSE, and the best-performing model is selected. The best model is then saved for future use, ensuring reproducibility and ease of deployment.

In [None]:
# Display Results in a Table
import pandas as pd

results_df = pd.DataFrame(model_results)
results_df.sort_values(by="RMSE", ascending=True, inplace=True)
print("Model Performance Comparison:")
results_df

# Select the best model based on RMSE
best_model_info = results_df.iloc[0]
print("\nBest Model:")
best_model_info

In [None]:
# Step 6: Save the Tuned Model
best_model_path = h2o.save_model(best_model_info, path="../04_modelling/models/")
print(f"Tuned model saved to: {best_model_path}")

# Shutdown H2O Cluster
h2o.cluster().shutdown()

## Archived

In [None]:
# from sklearn.metrics import mean_absolute_error
# import numpy as np

# # Train Multiple Models for Ensembling
# models = []
# for hidden in [[50, 50], [100, 100], [200, 200]]:
#     model = H2ODeepLearningEstimator(
#         hidden=hidden,
#         epochs=100,
#         seed=42
#     )
#     model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
#     models.append(model)

# # Ensemble Predictions (Averaging)
# predictions = [model.predict(test_h2o).as_data_frame()["predict"].to_numpy() for model in models]
# ensemble_prediction = np.mean(predictions, axis=0)

# # Evaluate Ensemble using Mean Absolute Error
# y_test = test_h2o[target].as_data_frame().to_numpy().ravel()
# mae_ensemble = mean_absolute_error(y_test, ensemble_prediction)
# print(f"Mean Absolute Error (MAE) for Ensemble: {mae_ensemble:.2f}")

# # Optional: Add R-squared Evaluation
# total_variance = np.sum((y_test - np.mean(y_test)) ** 2)
# residual_variance = np.sum((y_test - ensemble_prediction) ** 2)
# r_squared = 1 - (residual_variance / total_variance)
# print(f"R-squared for Ensemble: {r_squared:.2f}")


In [None]:
# # Random Grid Search 
# from h2o.grid.grid_search import H2OGridSearch

# # Define hyperparameters
# hyper_params = {
#     "epochs": list(range(50, 201, 50)),
#     "hidden": [[50, 50], [100, 100], [200, 200]],
#     "input_dropout_ratio": [i / 10.0 for i in range(0, 6)],
#     "l1": [1e-6, 1e-5, 1e-4],
#     "l2": [1e-6, 1e-5, 1e-4],
#     "activation": ["Rectifier", "Tanh", "Maxout"]
# }

# # Search criteria for random grid search
# search_criteria = {
#     'strategy': 'RandomDiscrete',  # Random search
#     'max_models': 20,  # Maximum number of models to train
#     'seed': 42  # For reproducibility
# }

# # Random grid search setup
# random_grid = H2OGridSearch(
#     model=H2ODeepLearningEstimator(seed=42),
#     grid_id='random_grid_dl',
#     hyper_params=hyper_params,
#     search_criteria=search_criteria
# )

# # Train models with random grid search
# random_grid.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Get the grid results, sorted by validation RMSE
# grid_results = random_grid.get_grid(sort_by="rmse", decreasing=False)

# # Get the best model based on RMSE
# best_model = grid_results.models[0]

# # Print details of the best model
# print("Best Random Search Model:")
# print(best_model)

# # Evaluate the best model on the test set
# performance = best_model.model_performance(test_h2o)

# # Print performance metrics
# print("Performance of Best Random Search Model:")
# print(f"RMSE: {performance.rmse():.2f}")
# print(f"MAE: {performance.mae():.2f}")
# print(f"R-squared: {performance.r2():.2f}")

In [None]:
# # Grid Search for Hyperparameter Tuning
# hyper_params = {
#     "epochs": [50, 100, 150],
#     "hidden": [[50, 50], [100, 100], [200, 200]],
#     "input_dropout_ratio": [0.0, 0.2, 0.4],
#     "l1": [1e-5, 1e-4, 1e-3],
#     "l2": [1e-5, 1e-4, 1e-3],
#     "activation": ["Rectifier", "Tanh", "Maxout"]
# }

# grid_search = H2OGridSearch(
#     H2ODeepLearningEstimator(seed=42),
#     hyper_params=hyper_params
# )

# # Train models with grid search
# grid_search.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Get the best model from the grid search
# best_model = grid_search.get_grid(sort_by="rmse", decreasing=False).models[0]
# print("Best Grid Search Model:")
# print(best_model)

# # Evaluate the best grid search model
# performance = best_model.model_performance(test_h2o)
# print("Best Grid Search Model Performance:")
# print(performance)