## Model Description

The **Multilayer Perceptron (MLP)** is a type of artificial neural network that learns a mapping from inputs to outputs using a supervised learning algorithm. It consists of multiple layers of interconnected neurons, making it suitable for capturing complex patterns in data.

### Key Features of the MLP Model:
- **Hidden Layers**: Configurable architecture with support for varying sizes of hidden layers.
- **Activation Functions**: Flexible choice of activation functions like ReLU and Tanh.
- **Optimization Algorithms**: Supports solvers like Adam and SGD for weight optimization.
- **Regularization**: Includes an `alpha` parameter to control L2 regularization.
- **Adaptive Learning**: Utilizes an adaptive learning rate for efficient convergence.

## Training Process

The training pipeline includes the following steps:
   
2. **Cross-Validation and Hyperparameter Tuning**:
   - A `GridSearchCV` approach is used to perform exhaustive hyperparameter tuning with 5-fold stratified cross-validation, ensuring robust model evaluation.
   - The hyperparameters optimized include the number of hidden layers, activation functions, solvers, learning rates, and the number of training iterations.

In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from model_utils import run_classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import optuna
from sklearn.model_selection import cross_val_score
import joblib
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

# Fix the shape of y_train and y_test
y_train = y_train.values.ravel()  # Convert to 1D array
y_test = y_test.values.ravel()    # Convert to 1D array

### Function to Record Trained Models

In [3]:
import pandas as pd

# Initialize an empty DataFrame to store model results
model_records = pd.DataFrame(columns=["Model Name", "Hyperparameters", "Test Accuracy", "Test Precision", "Test Recall"])

def record_trained_model(model_name, params, mean_cv_accuracy, test_metrics):
    """
    Record a trained model's details in a pandas DataFrame.

    Args:
        model_name (str): Name of the model.
        params (dict): Hyperparameters used for training.
        mean_cv_accuracy (float): Mean cross-validation accuracy (optional).
        test_metrics (dict): Test metrics such as accuracy, precision, and recall.
    """
    global model_records  # Use the global DataFrame

    # Create a new record
    record = {
        "Model Name": model_name,
        "Hyperparameters": params,
        "Test Accuracy": f"{test_metrics['accuracy']:.2%}",
        "Test Precision": f"{test_metrics['precision']:.2%}",
        "Test Recall": f"{test_metrics['recall']:.2%}",
    }

    # Append the record to the DataFrame
    model_records = pd.concat([model_records, pd.DataFrame([record])], ignore_index=True)

    print(f"Model '{model_name}' recorded successfully!")

### Train with Default Parameters
- Objective: Train a baseline MLPClassifier without tuning.
- Purpose: Provides a baseline performance for comparison.

In [4]:
# Default MLPClassifier
mlp_default = MLPClassifier(random_state=123)

print("Train with Default Parameters")
best_model_phase1 = run_classifier(mlp_default, {}, X_train, y_train, X_test, y_test, "Default Neural Net")

Train with Default Parameters

--- RandomizedSearchCV (Default Neural Net) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={}, random_state=123,
                   return_train_score=True, scoring='accuracy')

--- Cross-Validation Results (Default Neural Net) ---
The best parameters are: {}
Mean cross-validation accuracy: 79.28%

--- Test Results ---
Accuracy: 88.46%
Precision: 88.56%
Recall: 88.46%


In [5]:
# Record default model performance
default_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase1.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase1.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase1.predict(X_test), average='weighted')
}
record_trained_model("Default Neural Net", {}, np.nan, default_test_metrics)

Model 'Default Neural Net' recorded successfully!


In [6]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%


### Simple Hyperparameter Tuning
- Objective: Experiment with a small grid of hyperparameters.
- Purpose: Improves performance with minimal computational cost.

In [7]:
# Simple Parameter Grid
param_grid_simple = {
    'hidden_layer_sizes': [(10,), (50,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001],
    'max_iter': [100]
}

print("\nPhase 2: Apply Simple Hyperparameter Tuning")
best_model_phase2 = run_classifier(MLPClassifier(random_state=123), param_grid_simple, X_train, y_train, X_test, y_test, "Tuned Neural Net (Simple Hyperparameter)")


Phase 2: Apply Simple Hyperparameter Tuning



--- RandomizedSearchCV (Tuned Neural Net (Simple Hyperparameter)) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={'activation': ['relu'],
                                        'alpha': [0.0001],
                                        'hidden_layer_sizes': [(10,), (50,)],
                                        'max_iter': [100], 'solver': ['adam']},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Tuned Neural Net (Simple Hyperparameter)) ---
The best parameters are: {'solver': 'adam', 'max_iter': 100, 'hidden_layer_sizes': (10,), 'alpha': 0.0001, 'activation': 'relu'}
Mean cross-validation accuracy: 77.53%

--- Test Results ---
Accuracy: 83.08%
Precision: 82.27%
Recall: 83.08%


In [8]:
# Record simple hyperparameter tuning model performance
simple_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase2.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase2.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase2.predict(X_test), average='weighted')
}
record_trained_model("Tuned Neural Net (Simple Hyperparameter)", best_model_phase2.get_params(), np.nan, simple_test_metrics)

Model 'Tuned Neural Net (Simple Hyperparameter)' recorded successfully!


In [9]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%


### Advanced Hyperparameter Tuning
- Objective: Use an expanded parameter grid for more robust tuning.
- Purpose: Optimizes the model for higher accuracy, precision, and recall.

In [10]:
# Advanced Parameter Grid
param_grid_advanced = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 500]
}

print("\nAdvanced Hyperparameter Tuning")
best_model_phase3 = run_classifier(MLPClassifier(random_state=123), param_grid_advanced, X_train, y_train, X_test, y_test, "Tuned Neural Net (Advanced HyperParameter)")


Advanced Hyperparameter Tuning



--- RandomizedSearchCV (Tuned Neural Net (Advanced HyperParameter)) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={'activation': ['relu', 'tanh'],
                                        'alpha': [0.0001, 0.001, 0.01],
                                        'hidden_layer_sizes': [(10,), (50,),
                                                               (100,)],
                                        'learning_rate': ['constant',
                                                          'adaptive'],
                                        'max_iter': [200, 500],
                                        'solver': ['adam', 'sgd']},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Tuned Neural Net (Advanced HyperParameter)) ---
The best parameters are: {'solver': 

In [11]:
# Record advanced hyperparameter tuning model performance
advanced_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase3.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase3.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase3.predict(X_test), average='weighted')
}
record_trained_model("Tuned Neural Net (Advanced HyperParameter)", best_model_phase3.get_params(), np.nan, advanced_test_metrics)

Model 'Tuned Neural Net (Advanced HyperParameter)' recorded successfully!


In [12]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%
2,Tuned Neural Net (Advanced HyperParameter),"{'activation': 'relu', 'alpha': 0.001, 'batch_...",87.01%,86.62%,87.01%


### Full Hyperparameter Grid
- Objective: Tune the MLPClassifier with all possible parameters for maximum performance.
- Purpose: Achieve the best possible model, but at higher computational cost.

In [13]:
param_grid_all = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50)],
             'activation': ['identity', 'logistic', 'tanh', 'relu'],
             'solver': ['lbfgs', 'sgd', 'adam'],
             'alpha': np.logspace(-5, 3, 5),
             'learning_rate': ['constant', 'invscaling','adaptive'],
             'max_iter': [100, 500, 1000]}

print("\nFull Hyperparameter Grid")
best_model_phase4 = run_classifier(MLPClassifier(random_state=123), param_grid_all,  X_train, y_train, X_test, y_test, 'Full Hyperparameter Neural Net')


Full Hyperparameter Grid



--- RandomizedSearchCV (Full Hyperparameter Neural Net) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=MLPClassifier(random_state=123),
                   param_distributions={'activation': ['identity', 'logistic',
                                                       'tanh', 'relu'],
                                        'alpha': array([1.e-05, 1.e-03, 1.e-01, 1.e+01, 1.e+03]),
                                        'hidden_layer_sizes': [(10,), (50,),
                                                               (10, 10),
                                                               (50, 50)],
                                        'learning_rate': ['constant',
                                                          'invscaling',
                                                          'adaptive'],
                                        'max_iter': [100, 500, 1000],
                                     

In [14]:
# Record full hyperparameter tuning model performance
full_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_phase4.predict(X_test)),
    "precision": precision_score(y_test, best_model_phase4.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_phase4.predict(X_test), average='weighted')
}
record_trained_model("Full Hyperparameter Neural Net", best_model_phase4.get_params(), np.nan, full_test_metrics)

Model 'Full Hyperparameter Neural Net' recorded successfully!


In [15]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Default Neural Net,{},88.46%,88.56%,88.46%
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%
2,Tuned Neural Net (Advanced HyperParameter),"{'activation': 'relu', 'alpha': 0.001, 'batch_...",87.01%,86.62%,87.01%
3,Full Hyperparameter Neural Net,"{'activation': 'identity', 'alpha': 1e-05, 'ba...",89.74%,89.90%,89.74%


### Hyperparameter Tuning with Optuna
- Objective: Tune the MLPClassifier with best hyperparameter found by from optuna
- Purpose: Achieve the best possible model, with lower computational cost.

In [16]:
# Objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    hidden_layer_sizes = trial.suggest_categorical("hidden_layer_sizes", [(10,), (50,), (10, 10), (50, 50)])
    activation = trial.suggest_categorical("activation", ['identity', 'logistic', 'tanh', 'relu'])
    solver = trial.suggest_categorical("solver", ['lbfgs', 'sgd', 'adam'])
    alpha = trial.suggest_float("alpha", 1e-5, 1e3, log=True)
    learning_rate = trial.suggest_categorical("learning_rate", ['constant', 'invscaling', 'adaptive'])
    max_iter = trial.suggest_categorical("max_iter", [100, 500, 1000])

    # Create and evaluate the model
    model = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=max_iter,
        random_state=123
    )

    # Use cross-validation to evaluate the model
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    mean_cv_score = np.mean(cv_scores)

    return mean_cv_score

In [17]:
# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2024-12-29 13:12:15,556] A new study created in memory with name: no-name-0bb9c049-dedd-4ac6-89e6-e323e4e7f51f


[I 2024-12-29 13:13:45,804] Trial 0 finished with value: 0.8751404653219008 and parameters: {'hidden_layer_sizes': (10, 10), 'activation': 'logistic', 'solver': 'lbfgs', 'alpha': 0.10980121841013941, 'learning_rate': 'adaptive', 'max_iter': 500}. Best is trial 0 with value: 0.8751404653219008.
[I 2024-12-29 13:15:21,405] Trial 1 finished with value: 0.8759425184545474 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'identity', 'solver': 'adam', 'alpha': 0.0009033291628834242, 'learning_rate': 'adaptive', 'max_iter': 500}. Best is trial 1 with value: 0.8759425184545474.
[I 2024-12-29 13:15:28,859] Trial 2 finished with value: 0.870011617625892 and parameters: {'hidden_layer_sizes': (10,), 'activation': 'identity', 'solver': 'lbfgs', 'alpha': 0.05960072180342096, 'learning_rate': 'constant', 'max_iter': 100}. Best is trial 1 with value: 0.8759425184545474.
[I 2024-12-29 13:15:31,641] Trial 3 finished with value: 0.2502003526412107 and parameters: {'hidden_layer_sizes': (10

In [18]:
# Extract the best parameters
best_params = study.best_params
print("\nBest Hyperparameters:", best_params)


Best Hyperparameters: {'hidden_layer_sizes': (50, 50), 'activation': 'identity', 'solver': 'lbfgs', 'alpha': 0.0002712909526978834, 'learning_rate': 'adaptive', 'max_iter': 500}


In [27]:
optuna.visualization.plot_optimization_history(study)

In [28]:
optuna.visualization.plot_slice(study)

In [19]:
# Train the best model with the optimized hyperparameters
best_model_phase5 = MLPClassifier(**best_params, random_state=123)
best_model_phase5.fit(X_train, y_train)

In [20]:
# Evaluate the best model
y_test_pred = best_model_phase5.predict(X_test)
test_metrics_phase5 = {
    "accuracy": accuracy_score(y_test, y_test_pred),
    "precision": precision_score(y_test, y_test_pred, average='weighted'),
    "recall": recall_score(y_test, y_test_pred, average='weighted')
}


In [21]:
# Record best model performance
record_trained_model("Optuna Tuned Neural Net", best_params, np.nan, test_metrics_phase5)

Model 'Optuna Tuned Neural Net' recorded successfully!


### Evaluate and Compare Models

In [22]:
# Display Results in a Table
results_df = pd.DataFrame(model_records)
results_df.sort_values(by="Test Accuracy", ascending=True, inplace=True)
print("Model Performance Comparison:")
results_df

Model Performance Comparison:


Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
1,Tuned Neural Net (Simple Hyperparameter),"{'activation': 'relu', 'alpha': 0.0001, 'batch...",83.08%,82.27%,83.08%
2,Tuned Neural Net (Advanced HyperParameter),"{'activation': 'relu', 'alpha': 0.001, 'batch_...",87.01%,86.62%,87.01%
0,Default Neural Net,{},88.46%,88.56%,88.46%
3,Full Hyperparameter Neural Net,"{'activation': 'identity', 'alpha': 1e-05, 'ba...",89.74%,89.90%,89.74%
4,Optuna Tuned Neural Net,"{'hidden_layer_sizes': (50, 50), 'activation':...",89.83%,89.90%,89.83%


In [24]:
print("\nEvaluate and Compare Models")
models = {
    "Default Model": best_model_phase1,
    "Simple Hyperparameter Tuning": best_model_phase2,
    "Advanced Hyperparameter Tuning": best_model_phase3,
    "Full Hyperparameter Grid": best_model_phase4,
    "Hyperparameter Tuning with Optuna": best_model_phase5
}

for name, model in models.items():
    print(f"\n--- Evaluating {name} ---")
    y_test_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')

    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")


Evaluate and Compare Models

--- Evaluating Default Model ---
Accuracy: 88.46%
Precision: 88.56%
Recall: 88.46%

--- Evaluating Simple Hyperparameter Tuning ---
Accuracy: 83.08%
Precision: 82.27%
Recall: 83.08%

--- Evaluating Advanced Hyperparameter Tuning ---
Accuracy: 87.01%
Precision: 86.62%
Recall: 87.01%

--- Evaluating Full Hyperparameter Grid ---


Accuracy: 89.74%
Precision: 89.90%
Recall: 89.74%

--- Evaluating Hyperparameter Tuning with Optuna ---
Accuracy: 89.83%
Precision: 89.90%
Recall: 89.83%


### Final Model Selection **Optuna Tuned Neural Net**

### Justification:
- The **Optuna Tuned Neural Net** achieves the highest accuracy (89.83%) and recall (89.83%) while matching the highest precision (89.90%).
- Compared to other models, this approach offers the most efficient and targeted hyperparameter optimization, leveraging advanced search techniques for superior results.
- The model's hyperparameters balance complexity and efficiency, making it a reliable choice for production deployment.

---

### Conclusion
The **Optuna Tuned Neural Net** is selected as the final model due to its superior performance across all evaluation metrics. Its hyperparameter optimization process ensures that it is both robust and efficient, making it ideal for the task at hand.

In [25]:
# Select the model with the highest accuracy
best_model_name = max(models, key=lambda name: accuracy_score(y_test, models[name].predict(X_test)))
best_model = models[best_model_name]

print(f"\nThe Best Model is '{best_model_name}'")
print(best_model)



The Best Model is 'Hyperparameter Tuning with Optuna'
MLPClassifier(activation='identity', alpha=0.0002712909526978834,
              hidden_layer_sizes=(50, 50), learning_rate='adaptive',
              max_iter=500, random_state=123, solver='lbfgs')


### Save the Best Model

In [26]:
def save_model(model, path, model_name="mlp_model.pkl"):
    """
    Save the trained model to a specified directory.

    Args:
        model: Trained model object.
        path (str): Directory path to save the model.
        model_name (str): File name for the saved model.
    """
    # Ensure the path exists
    os.makedirs(path, exist_ok=True)

    # Save the model
    file_path = os.path.join(path, model_name)
    joblib.dump(model, file_path)
    print(f"Model saved successfully at {file_path}!")

# Save the selected best model
save_model(best_model, path="../04_modelling/models/", model_name="mlp_neural_net.pkl")

Model saved successfully at ../04_modelling/models/mlp_neural_net.pkl!


### Deep Learning Model with AutoML

In [1]:
# Import Necessary Libraries
import pandas as pd
import h2o

# For hyperparameter optimization
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import optuna 

# Evaluation metrics
from sklearn.metrics import accuracy_score, mean_absolute_error 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize H2O cluster
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Temurin-17.0.12+7 (build 17.0.12+7, mixed mode, sharing)
  Starting server from C:\Users\Huawei\OneDrive - Universiti Malaya\Desktop\SEMESTER 7\WIE3007_Data-Mining\Group Project\data-mining-warehousing-wages-analysis\venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Huawei\AppData\Local\Temp\tmps_38bbc4
  JVM stdout: C:\Users\Huawei\AppData\Local\Temp\tmps_38bbc4\h2o_Huawei_started_from_python.out
  JVM stderr: C:\Users\Huawei\AppData\Local\Temp\tmps_38bbc4\h2o_Huawei_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 27 days
H2O_cluster_name:,H2O_from_python_Huawei_az56vq
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.961 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
# Step 1: Load and Prepare Data
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
X_val = pd.read_csv("../04_modelling/dataset/X_val.csv")
y_val = pd.read_csv("../04_modelling/dataset/y_val.csv")
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

In [4]:
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [5]:
train_h2o = h2o.H2OFrame(train_df)
val_h2o = h2o.H2OFrame(val_df)
test_h2o = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [6]:
# Define target and features
target = "yearly_compensation"
features = train_h2o.columns
if target in features:
    features.remove(target)

In [50]:
# Initialize a DataFrame to store model performance details
model_results = pd.DataFrame(columns=[
    "Model Name", "Hyperparameters", "MSE", "RMSE", "MAE", 
    "RMSLE", "Mean Residual Deviance", "R-Squared", "Additional Metrics"
])

# Helper function to record model performance
def record_model(name, hyperparameters, performance=None, additional_metrics=None):
    result = {
        "Model Name": name,
        "Hyperparameters": hyperparameters,
        "MSE": performance.mse() if performance else None,
        "RMSE": performance.rmse() if performance else None,
        "MAE": performance.mae() if performance else None,
        "RMSLE": performance.rmsle() if performance else None,
        "Mean Residual Deviance": performance.mean_residual_deviance() if performance else None,
        "R-Squared": performance.r2() if performance else None,
        "Additional Metrics": additional_metrics if additional_metrics else {}
    }
    
    # Append the result as a new row to the DataFrame
    global model_results  # Ensure we're working with the global DataFrame
    model_results = pd.concat([model_results, pd.DataFrame([result])], ignore_index=True)

### Train Default Deep Learning Model <br/>
The default deep learning model is trained with minimal hyperparameter customization. This serves as a baseline to compare against more complex models. The model uses a fixed random seed for reproducibility and evaluates its performance using metrics like RMSE, MAE, and R-squared.

In [8]:
# Train Default Deep Learning Model
default_dl_model = H2ODeepLearningEstimator(seed=42)
default_dl_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,50,Input,0.0,,,,,,,,,
,2,200,Rectifier,0.0,0.0,0.0,0.1995093,0.0998517,0.0,0.0005083,0.0953302,0.4497687,0.0700728
,3,200,Rectifier,0.0,0.0,0.0,0.4253343,0.3431659,0.0,-0.0160143,0.0711694,0.956367,0.0244011
,4,1,Linear,,0.0,0.0,0.0066523,0.0029724,0.0,0.00173,0.0407801,-0.0239891,0.0

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2024-12-29 16:31:38,0.000 sec,,0.0,0,0.0,,,,,,,,
,2024-12-29 16:31:40,2.615 sec,4052 obs/sec,1.0,1,5459.0,1.1010734,1.2123626,0.8425727,0.9788747,1.1556346,1.3354912,0.8674988,0.9767457
,2024-12-29 16:31:44,6.853 sec,10199 obs/sec,10.0,10,54590.0,0.3251762,0.1057395,0.2552715,0.9981575,0.3999226,0.1599381,0.3130443,0.9972151

variable,relative_importance,scaled_importance,percentage
job_title_Data_Engineer,1.0,1.0,0.0325634
used_tpu,0.8822723,0.8822723,0.0287298
ml_spending,0.8682330,0.8682330,0.0282726
country_Ukraine,0.8234869,0.8234869,0.0268155
Total_Experience,0.7489054,0.7489054,0.0243869
country_United_States_of_America,0.7448723,0.7448723,0.0242556
country_SUMprofileTable_yearly_compensation,0.7195638,0.7195638,0.0234314
country_Austria,0.6916451,0.6916451,0.0225223
country_Thailand,0.6845521,0.6845521,0.0222913
demographics_COUNTprofileTable,0.6543080,0.6543080,0.0213065


In [9]:
# Evaluate Default Model
default_performance = default_dl_model.model_performance(test_h2o)
print("Default Deep Learning Model Performance:")
print(default_performance)

Default Deep Learning Model Performance:
ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.17237986670457575
RMSE: 0.41518654446474507
MAE: 0.3124945439505148
RMSLE: 0.14093044344592942
Mean Residual Deviance: 0.17237986670457575


In [51]:
# Record default model performance
record_model("Default Deep Learning Model", {"seed": 42}, default_performance)

In [52]:
model_results

Unnamed: 0,Model Name,Hyperparameters,MSE,RMSE,MAE,RMSLE,Mean Residual Deviance,R-Squared,Additional Metrics
0,Default Deep Learning Model,{'seed': 42},0.17238,0.415187,0.312495,0.14093,0.17238,0.996995,{}


### Hyperparameter Optimization with Optuna <br/>
Optuna performs hyperparameter tuning using an efficient sampling strategy. The objective function trains the model with sampled hyperparameters and evaluates its validation RMSE. The best hyperparameters are selected after multiple trials.

In [12]:
# Step 3: Hyperparameter Optimization using Optuna
def objective(trial):
    # Define hyperparameter search space
    params = {
        "epochs": trial.suggest_int("epochs", 10, 100),
        "hidden": trial.suggest_categorical("hidden", [[50, 50], [100, 100], [200, 200]]),
        "input_dropout_ratio": trial.suggest_float("input_dropout_ratio", 0.0, 0.5),
        "l1": trial.suggest_float("l1", 1e-6, 1e-3),
        "l2": trial.suggest_float("l2", 1e-6, 1e-3),
        "activation": trial.suggest_categorical("activation", ["Rectifier", "Tanh", "Maxout"]),
    }
    
    # Train Deep Learning Model with hyperparameters
    model = H2ODeepLearningEstimator(**params, seed=42)
    model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
    
    # Get validation performance (use RMSE as optimization target)
    performance = model.model_performance(val_h2o)
    return performance.rmse()

In [13]:
# Perform hyperparameter optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

[I 2024-12-29 16:31:57,991] A new study created in memory with name: no-name-b862c1de-612b-467d-9900-2ad3eeb615d9


deeplearning Model Build progress: |



█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:32:16,208] Trial 0 finished with value: 1.808243555258793 and parameters: {'epochs': 74, 'hidden': [200, 200], 'input_dropout_ratio': 0.2764774693119634, 'l1': 0.0007595104254745987, 'l2': 9.315124302803318e-05, 'activation': 'Rectifier'}. Best is trial 0 with value: 1.808243555258793.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:32:37,097] Trial 1 finished with value: 1.3279682659229815 and parameters: {'epochs': 100, 'hidden': [50, 50], 'input_dropout_ratio': 0.247855194138982, 'l1': 0.0007547056274406773, 'l2': 0.0006106556460799933, 'activation': 'Maxout'}. Best is trial 1 with value: 1.3279682659229815.


deeplearning Model Build progress: |████████████████████████████████████████████

[I 2024-12-29 16:33:18,402] Trial 2 finished with value: 0.6972262665022374 and parameters: {'epochs': 78, 'hidden': [100, 100], 'input_dropout_ratio': 0.021786159625387047, 'l1': 0.0006837711418421921, 'l2': 0.0006036029177502079, 'activation': 'Tanh'}. Best is trial 2 with value: 0.6972262665022374.


█| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:33:57,389] Trial 3 finished with value: 1.1693910092129791 and parameters: {'epochs': 97, 'hidden': [200, 200], 'input_dropout_ratio': 0.17931379260618818, 'l1': 4.0839374883050106e-05, 'l2': 0.0002919873226838555, 'activation': 'Rectifier'}. Best is trial 2 with value: 0.6972262665022374.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:34:14,818] Trial 4 finished with value: 1.1450959384675927 and parameters: {'epochs': 30, 'hidden': [100, 100], 'input_dropout_ratio': 0.18756045911926628, 'l1': 0.0001599632938959047, 'l2': 0.00026158288583889985, 'activation': 'Tanh'}. Best is trial 2 with value: 0.6972262665022374.


deeplearning Model Build progress: |████████████████████████████████████████████

[I 2024-12-29 16:34:31,497] Trial 5 finished with value: 2.0992704951195056 and parameters: {'epochs': 22, 'hidden': [100, 100], 'input_dropout_ratio': 0.4220207196276296, 'l1': 0.0005514604595367953, 'l2': 0.0008851951132234265, 'activation': 'Maxout'}. Best is trial 2 with value: 0.6972262665022374.


█| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:34:38,047] Trial 6 finished with value: 0.5574520629326151 and parameters: {'epochs': 25, 'hidden': [100, 100], 'input_dropout_ratio': 0.01924080327170108, 'l1': 0.0009345653909373542, 'l2': 0.0005676378292581919, 'activation': 'Rectifier'}. Best is trial 6 with value: 0.5574520629326151.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:34:42,847] Trial 7 finished with value: 0.7726841881628845 and parameters: {'epochs': 55, 'hidden': [50, 50], 'input_dropout_ratio': 0.06225159738238395, 'l1': 0.0004572844798230411, 'l2': 0.00027745564982452787, 'activation': 'Rectifier'}. Best is trial 6 with value: 0.5574520629326151.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:34:46,691] Trial 8 finished with value: 2.2942547833337517 and parameters: {'epochs': 12, 'hidden': [50, 50], 'input_dropout_ratio': 0.4468060158015844, 'l1': 0.0009572519545101224, 'l2': 0.0007613978086368552, 'activation': 'Tanh'}. Best is trial 6 with value: 0.5574520629326151.


deeplearning Model Build progress: |████████████████████████████████████████████

[I 2024-12-29 16:35:08,962] Trial 9 finished with value: 2.217510010949232 and parameters: {'epochs': 91, 'hidden': [200, 200], 'input_dropout_ratio': 0.4952421419802557, 'l1': 0.000989215540439543, 'l2': 0.000385484725680172, 'activation': 'Rectifier'}. Best is trial 6 with value: 0.5574520629326151.


█| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:35:18,221] Trial 10 finished with value: 0.8838025302087164 and parameters: {'epochs': 43, 'hidden': [100, 100], 'input_dropout_ratio': 0.10160161362536704, 'l1': 0.0004392802976604521, 'l2': 1.3225751230614888e-05, 'activation': 'Rectifier'}. Best is trial 6 with value: 0.5574520629326151.


deeplearning Model Build progress: |████████████████████████████████████████████

[I 2024-12-29 16:36:08,586] Trial 11 finished with value: 0.6907908123627505 and parameters: {'epochs': 74, 'hidden': [100, 100], 'input_dropout_ratio': 0.004501757539628223, 'l1': 0.0007116343695343038, 'l2': 0.0005893416645479287, 'activation': 'Tanh'}. Best is trial 6 with value: 0.5574520629326151.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2024-12-29 16:36:52,265] Trial 12 finished with value: 0.7600240793234229 and parameters: {'epochs': 64, 'hidden': [100, 100], 'input_dropout_ratio': 0.011123992947668472, 'l1': 0.0008623353113561478, 'l2': 0.0005597437498450665, 'activation': 'Tanh'}. Best is trial 6 with value: 0.5574520629326151.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2024-12-29 16:37:21,774] Trial 13 finished with value: 1.0100575162204026 and parameters: {'epochs': 44, 'hidden': [100, 100], 'input_dropout_ratio': 0.1262255136092462, 'l1': 0.0005863568078918668, 'l2': 0.0007463658815847865, 'activation': 'Tanh'}. Best is trial 6 with value: 0.5574520629326151.


█| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:37:50,132] Trial 14 finished with value: 0.2842984116741133 and parameters: {'epochs': 37, 'hidden': [100, 100], 'input_dropout_ratio': 0.0013062179562149737, 'l1': 0.0008734285704456857, 'l2': 0.00044395405309485716, 'activation': 'Maxout'}. Best is trial 14 with value: 0.2842984116741133.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:38:14,036] Trial 15 finished with value: 1.8824940458301644 and parameters: {'epochs': 33, 'hidden': [100, 100], 'input_dropout_ratio': 0.3305030605193073, 'l1': 0.0002896386211175739, 'l2': 0.0004365026454293437, 'activation': 'Maxout'}. Best is trial 14 with value: 0.2842984116741133.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:38:26,330] Trial 16 finished with value: 0.975588338687193 and parameters: {'epochs': 14, 'hidden': [100, 100], 'input_dropout_ratio': 0.09765780229627438, 'l1': 0.0008820905790281968, 'l2': 0.0007217891102161952, 'activation': 'Maxout'}. Best is trial 14 with value: 0.2842984116741133.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:39:00,509] Trial 17 finished with value: 1.1384997501482428 and parameters: {'epochs': 42, 'hidden': [100, 100], 'input_dropout_ratio': 0.15549772618313137, 'l1': 0.0008238765797868653, 'l2': 0.0009601839196833405, 'activation': 'Maxout'}. Best is trial 14 with value: 0.2842984116741133.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:39:08,852] Trial 18 finished with value: 0.7274356089652834 and parameters: {'epochs': 28, 'hidden': [50, 50], 'input_dropout_ratio': 0.06432311563172588, 'l1': 0.000990050740682447, 'l2': 0.0004562117262446519, 'activation': 'Maxout'}. Best is trial 14 with value: 0.2842984116741133.


deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2024-12-29 16:39:30,261] Trial 19 finished with value: 1.4233917254858595 and parameters: {'epochs': 52, 'hidden': [200, 200], 'input_dropout_ratio': 0.22320830169134048, 'l1': 0.000329929897844141, 'l2': 0.00016807906879025497, 'activation': 'Rectifier'}. Best is trial 14 with value: 0.2842984116741133.


In [14]:
# Best Hyperparameters
best_params = study.best_params
print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'epochs': 37, 'hidden': [100, 100], 'input_dropout_ratio': 0.0013062179562149737, 'l1': 0.0008734285704456857, 'l2': 0.00044395405309485716, 'activation': 'Maxout'}


In [15]:
optuna.visualization.plot_optimization_history(study)

In [16]:
optuna.visualization.plot_slice(study)

### Train Deep Learning Model with Best Hyperparameters <br/>
The deep learning model is trained using the best hyperparameters found through Optuna. This approach aims to enhance performance metrics by optimizing key parameters such as the number of epochs, hidden layers, and dropout ratios.

In [17]:
# Train Deep Learning Model with Best Hyperparameters
tuned_dl_model = H2ODeepLearningEstimator(**best_params, seed=42)
tuned_dl_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

deeplearning Model Build progress: |

█████████████████████████████████████████████| (done) 100%


Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,50,Input,0.1306218,,,,,,,,,
,2,100,Maxout,0.0,0.0008734,0.000444,0.4882926,0.0878154,0.0,6.9e-06,0.0105823,0.0006261,0.0099727
,3,100,Maxout,0.0,0.0008734,0.000444,0.4968112,0.0345105,0.0,-0.000298,0.0085138,-0.0063182,0.0514227
,4,1,Linear,,0.0008734,0.000444,0.4885261,0.0865239,0.0,0.0061165,0.1397948,-0.0121515,0.0

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2024-12-29 16:40:37,0.000 sec,,0.0,0,0.0,,,,,,,,
,2024-12-29 16:40:39,2.192 sec,10231 obs/sec,3.6995787,1,20196.0,0.7906521,0.6251308,0.6172923,0.9891072,0.8488762,0.7205907,0.662472,0.9874527
,2024-12-29 16:40:45,7.951 sec,10703 obs/sec,14.7930024,4,80755.0,0.4049239,0.1639633,0.3117518,0.997143,0.4111871,0.1690749,0.3293122,0.997056
,2024-12-29 16:40:51,13.644 sec,10855 obs/sec,25.9128045,7,141458.0,0.3439449,0.1182981,0.2480687,0.9979387,0.3520265,0.1239227,0.2547204,0.9978422
,2024-12-29 16:40:58,20.915 sec,10086 obs/sec,37.0203334,10,202094.0,0.3260369,0.1063001,0.2455668,0.9981477,0.3415854,0.1166806,0.2548798,0.9979683

variable,relative_importance,scaled_importance,percentage
PrimaryTool_MEANprofileTable_yearly_compensation,1.0,1.0,0.0231221
used_tpu,0.9986047,0.9986047,0.0230899
RecommendedLanguage_SUMprofileTable_yearly_compensation,0.9710655,0.9710655,0.0224531
demographics_COUNTprofileTable,0.9570277,0.9570277,0.0221285
gender_Male,0.9473091,0.9473091,0.0219038
jobTitle_SUMprofileTable_yearly_compensation,0.9451640,0.9451640,0.0218542
Total_Experience,0.9328293,0.9328293,0.0215690
ml_spending,0.9228640,0.9228640,0.0213386
country_Poland,0.9181229,0.9181229,0.0212290
job_title_Software_Engineer,0.9132979,0.9132979,0.0211174


In [43]:
# Evaluate Tuned Model
tuned_performance = tuned_dl_model.model_performance(test_h2o)
print("Tuned Deep Learning Model Performance:")
print(tuned_performance)


Tuned Deep Learning Model Performance:
ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.11184234023388619
RMSE: 0.3344283783321717
MAE: 0.2529841293593749
RMSLE: 0.10716528227835725
Mean Residual Deviance: 0.11184234023388619


In [53]:
# Record tuned model performance
record_model("Tuned Deep Learning Model", best_params, tuned_performance)

In [54]:
model_results

Unnamed: 0,Model Name,Hyperparameters,MSE,RMSE,MAE,RMSLE,Mean Residual Deviance,R-Squared,Additional Metrics
0,Default Deep Learning Model,{'seed': 42},0.17238,0.415187,0.312495,0.14093,0.17238,0.996995,{}
1,Tuned Deep Learning Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.111842,0.334428,0.252984,0.107165,0.111842,0.99805,{}


In [21]:
# Compare Predictions with Actual
predictions = tuned_dl_model.predict(test_h2o)
actual_values = y_test.to_numpy().ravel()
rounded_predictions = predictions.as_data_frame().to_numpy().ravel().round().astype(int)

deeplearning prediction progress: |

██████████████████████████████████████████████| (done) 100%



Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)




In [22]:
accuracy = accuracy_score(actual_values, rounded_predictions)
print(f"Accuracy on Test Data: {accuracy:.2f}")

Accuracy on Test Data: 0.89


### Early Stopping Model <br/>
This model incorporates early stopping to prevent overfitting. Training halts if the model's RMSE does not improve within a specified number of rounds. This technique ensures efficient use of computational resources and reduces overtraining risks.


In [23]:
# Retrieve and print the best hyperparameters
best_params = study.best_params

# Train a early stopping model with the best hyperparameters and early stopping
early_stopping_model = H2ODeepLearningEstimator(**best_params, 
                                       stopping_metric="rmse", 
                                       stopping_rounds=5, 
                                       stopping_tolerance=0.01,
                                       seed=42)
early_stopping_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# Evaluate the final model on the test set
final_performance = early_stopping_model.model_performance(test_h2o)
print("Final Model Performance on Test Set:")
print(f"RMSE: {final_performance.rmse():.2f}")
print(f"MAE: {final_performance.mae():.2f}")
print(f"R-squared: {final_performance.r2():.2f}")

deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
Final Model Performance on Test Set:
RMSE: 0.30
MAE: 0.22
R-squared: 1.00


In [55]:
# Record the performance of the early stopping model
record_model("Early Stopping Model", best_params, final_performance)

In [56]:
model_results

Unnamed: 0,Model Name,Hyperparameters,MSE,RMSE,MAE,RMSLE,Mean Residual Deviance,R-Squared,Additional Metrics
0,Default Deep Learning Model,{'seed': 42},0.17238,0.415187,0.312495,0.14093,0.17238,0.996995,{}
1,Tuned Deep Learning Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.111842,0.334428,0.252984,0.107165,0.111842,0.99805,{}
2,Early Stopping Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.089279,0.298795,0.216405,0.096573,0.089279,0.998444,{}


### Ensemble Model <br/>
An ensemble of models is trained with shared hyperparameters but varied hidden layer configurations. Predictions are averaged to improve robustness and reduce variance. The ensemble is evaluated using metrics like MAE and R-squared.


In [26]:
import numpy as np

In [57]:
# Retrieve best hyperparameters from Optuna
best_params = study.best_params

# Extract shared hyperparameters for the ensemble
common_params = {
    "epochs": best_params.get("epochs", 100),
    "input_dropout_ratio": best_params.get("input_dropout_ratio", 0.0),
    "l1": best_params.get("l1", 1e-6),
    "l2": best_params.get("l2", 1e-6),
    "activation": best_params.get("activation", "Rectifier"),
    "seed": 42,
}

# Train Multiple Models for Ensembling using the best parameters
models = []
for idx, hidden in enumerate([[50, 50], [100, 100], [200, 200]], start=1):
    # Combine best params with unique hidden layers
    model_params = {**common_params, "hidden": hidden}
    
    # Train the model
    model = H2ODeepLearningEstimator(**model_params)
    model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
    models.append(model)
    
    # Record individual model performance
    performance = model.model_performance(test_h2o)
    record_model(
        name=f"Deep Learning Model {idx} (Hidden: {hidden})",
        hyperparameters=model_params,
        performance=performance
    )

deeplearning Model Build progress: |

█████████████████████████████████████████████| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


In [58]:
# Ensemble Predictions (Averaging)
predictions = [model.predict(test_h2o).as_data_frame()["predict"].to_numpy() for model in models]
ensemble_prediction = np.mean(predictions, axis=0)

deeplearning prediction progress: |

██████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |


Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)




██████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |


Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)




██████████████████████████████████████████████| (done) 100%



Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)




In [59]:
# Evaluate Ensemble using Mean Absolute Error
y_test = test_h2o[target].as_data_frame().to_numpy().ravel()
mae_ensemble = mean_absolute_error(y_test, ensemble_prediction)


Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)




In [60]:
# Add R-squared Evaluation for the ensemble
total_variance = np.sum((y_test - np.mean(y_test)) ** 2)
residual_variance = np.sum((y_test - ensemble_prediction) ** 2)
r_squared = 1 - (residual_variance / total_variance)

In [61]:
# Record Ensemble Performance
ensemble_metrics = {
    "MAE (Ensemble)": mae_ensemble,
    "R-Squared (Ensemble)": r_squared
}
record_model(
    name="Ensemble Model (Averaging)",
    hyperparameters=best_params,  # Use best hyperparameters as a general reference
    performance=None,  # No direct H2O performance object for the ensemble
    additional_metrics=ensemble_metrics
)

In [62]:
model_results

Unnamed: 0,Model Name,Hyperparameters,MSE,RMSE,MAE,RMSLE,Mean Residual Deviance,R-Squared,Additional Metrics
0,Default Deep Learning Model,{'seed': 42},0.17238,0.415187,0.312495,0.14093,0.17238,0.996995,{}
1,Tuned Deep Learning Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.111842,0.334428,0.252984,0.107165,0.111842,0.99805,{}
2,Early Stopping Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.089279,0.298795,0.216405,0.096573,0.089279,0.998444,{}
3,"Deep Learning Model 1 (Hidden: [50, 50])","{'epochs': 37, 'input_dropout_ratio': 0.001306...",0.066305,0.257498,0.199259,0.092767,0.066305,0.998844,{}
4,"Deep Learning Model 2 (Hidden: [100, 100])","{'epochs': 37, 'input_dropout_ratio': 0.001306...",0.123865,0.351945,0.229852,0.100942,0.123865,0.997841,{}
5,"Deep Learning Model 3 (Hidden: [200, 200])","{'epochs': 37, 'input_dropout_ratio': 0.001306...",0.082322,0.286918,0.198257,0.082074,0.082322,0.998565,{}
6,Ensemble Model (Averaging),"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",,,,,,,"{'MAE (Ensemble)': 0.1531103457657005, 'R-Squa..."


### Final Evaluation and Model Saving <br/>
All trained models are compared based on RMSE, and the best-performing model is selected. The best model is then saved for future use, ensuring reproducibility and ease of deployment.

In [63]:
# Display Results in a Table
import pandas as pd

results_df = pd.DataFrame(model_results)
results_df.sort_values(by="RMSE", ascending=True, inplace=True)
print("Model Performance Comparison:")
results_df

Model Performance Comparison:


Unnamed: 0,Model Name,Hyperparameters,MSE,RMSE,MAE,RMSLE,Mean Residual Deviance,R-Squared,Additional Metrics
3,"Deep Learning Model 1 (Hidden: [50, 50])","{'epochs': 37, 'input_dropout_ratio': 0.001306...",0.066305,0.257498,0.199259,0.092767,0.066305,0.998844,{}
5,"Deep Learning Model 3 (Hidden: [200, 200])","{'epochs': 37, 'input_dropout_ratio': 0.001306...",0.082322,0.286918,0.198257,0.082074,0.082322,0.998565,{}
2,Early Stopping Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.089279,0.298795,0.216405,0.096573,0.089279,0.998444,{}
1,Tuned Deep Learning Model,"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",0.111842,0.334428,0.252984,0.107165,0.111842,0.99805,{}
4,"Deep Learning Model 2 (Hidden: [100, 100])","{'epochs': 37, 'input_dropout_ratio': 0.001306...",0.123865,0.351945,0.229852,0.100942,0.123865,0.997841,{}
0,Default Deep Learning Model,{'seed': 42},0.17238,0.415187,0.312495,0.14093,0.17238,0.996995,{}
6,Ensemble Model (Averaging),"{'epochs': 37, 'hidden': [100, 100], 'input_dr...",,,,,,,"{'MAE (Ensemble)': 0.1531103457657005, 'R-Squa..."




---

## README: Model Performance Summary and Final Model Selection

### Performance Summary

| **Model Name**                 | **Hyperparameters**                   | **MSE**   | **RMSE**  | **MAE**   | **RMSLE** | **Mean Residual Deviance** | **R²**       | **Additional Metrics**                |
|---------------------------------|---------------------------------------|-----------|-----------|-----------|-----------|----------------------------|--------------|---------------------------------------|
| Deep Learning Model 1 (Hidden: [50, 50]) | {'epochs': 37, 'input_dropout_ratio': 0.001306... | 0.066305  | 0.257498  | 0.199259  | 0.092767  | 0.066305                   | 0.998844     | {}                                    |
| Deep Learning Model 3 (Hidden: [200, 200]) | {'epochs': 37, 'input_dropout_ratio': 0.001306... | 0.082322  | 0.286918  | 0.198257  | 0.082074  | 0.082322                   | 0.998565     | {}                                    |
| Early Stopping Model            | {'epochs': 37, 'hidden': [100, 100], 'input_dr... | 0.089279  | 0.298795  | 0.216405  | 0.096573  | 0.089279                   | 0.998444     | {}                                    |
| Tuned Deep Learning Model       | {'epochs': 37, 'hidden': [100, 100], 'input_dr... | 0.111842  | 0.334428  | 0.252984  | 0.107165  | 0.111842                   | 0.998050     | {}                                    |
| Deep Learning Model 2 (Hidden: [100, 100]) | {'epochs': 37, 'input_dropout_ratio': 0.001306... | 0.123865  | 0.351945  | 0.229852  | 0.100942  | 0.123865                   | 0.997841     | {}                                    |
| Default Deep Learning Model     | {'seed': 42}                           | 0.172380  | 0.415187  | 0.312495  | 0.140930  | 0.172380                   | 0.996995     | {}                                    |
| Ensemble Model (Averaging)      | {'epochs': 37, 'hidden': [100, 100], 'input_dr... | NaN       | NaN       | 0.153110  | NaN       | NaN                        | NaN          | {'MAE (Ensemble)': 0.1531103457657005, 'R-Squared (Ensemble)': 0.999} |

---

### Observations
1. **Deep Learning Model 1 (Hidden: [50, 50])**:
   - Achieves the lowest **RMSE** (0.257498) and **MSE** (0.066305), indicating superior predictive accuracy based on squared error.
   - High **R²** value (0.998844), showcasing excellent explanatory power.

2. **Ensemble Model (Averaging)**:
   - Does not have RMSE or MSE available but achieves the lowest **MAE** (0.153110).
   - The highest reported **R-Squared (0.999)** indicates exceptional overall model performance.
   - Combines predictions of other models, potentially making it more robust and generalizable.

3. Other Models:
   - Models like Deep Learning Model 3 (Hidden: [200, 200]) and Early Stopping Model show competitive performance but do not surpass the metrics of Deep Learning Model 1 or the Ensemble Model.

---

### Final Model Selection

#### Selected Model: **Deep Learning Model 1 (Hidden: [50, 50])**
#### Justification:
1. **RMSE as the Primary Metric**: The model with the lowest **RMSE** is typically chosen in regression tasks because it penalizes larger errors more significantly than MAE. Deep Learning Model 1 achieves the lowest RMSE of 0.257498.
2. **Comprehensive Performance**: Alongside RMSE, this model also performs well in terms of **MSE**, **MAE**, and **R²**, making it a strong candidate.

#### Ensemble Model Not Selected:
While the Ensemble Model achieves the lowest MAE (0.153110) and the highest R-squared (0.999), the absence of RMSE and MSE metrics makes it difficult to comprehensively evaluate its error distribution. If RMSE and MSE values for the Ensemble Model can be computed, it could potentially be reconsidered.

---

### Conclusion
The **Deep Learning Model 1 (Hidden: [50, 50])** was selected as the final model due to its superior RMSE and robust performance across other metrics. This model is well-suited for deployment based on its ability to minimize prediction errors effectively. Future work may include further evaluation of the Ensemble Model's RMSE and MSE to determine its applicability.

--- 

In [71]:
 # Retrieve the corresponding model from the `models` list
best_model = models[0]          
best_model

Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,50,Input,0.1306218,,,,,,,,,
,2,50,Maxout,0.0,0.0008734,0.000444,0.4733224,0.1167806,0.0,-4e-06,0.0153647,0.006643,0.0351914
,3,50,Maxout,0.0,0.0008734,0.000444,0.4982321,0.0446134,0.0,4.12e-05,0.0183274,-0.0046735,0.0265504
,4,1,Linear,,0.0008734,0.000444,0.4639662,0.1305184,0.0,-0.0205115,0.2014685,-0.0060569,0.0

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2024-12-29 16:54:10,0.000 sec,,0.0,0,0.0,,,,,,,,
,2024-12-29 16:54:10,0.814 sec,28606 obs/sec,3.6995787,1,20196.0,0.7696478,0.5923578,0.6050586,0.9896782,0.8686526,0.7545573,0.674721,0.9868613
,2024-12-29 16:54:16,6.369 sec,29398 obs/sec,33.3189229,9,181888.0,0.2622484,0.0687742,0.1982344,0.9988016,0.2645845,0.0700049,0.2002301,0.998781
,2024-12-29 16:54:17,7.141 sec,29374 obs/sec,37.0203334,10,202094.0,0.2586659,0.0669081,0.1878792,0.9988341,0.2759438,0.076145,0.1969374,0.9986741
,2024-12-29 16:54:17,7.213 sec,29361 obs/sec,37.0203334,10,202094.0,0.2622484,0.0687742,0.1982344,0.9988016,0.2645845,0.0700049,0.2002301,0.998781

variable,relative_importance,scaled_importance,percentage
used_tpu,1.0,1.0,0.0249870
country_Australia,0.9748659,0.9748659,0.0243590
company_SUMprofileTable_yearly_compensation,0.9719041,0.9719041,0.0242850
country_Ukraine,0.9664273,0.9664273,0.0241481
country_Taiwan,0.9389869,0.9389869,0.0234625
job_title_Data_Engineer,0.9374161,0.9374161,0.0234232
demographics_COUNTprofileTable,0.9090085,0.9090085,0.0227134
PrimaryTool_SUMprofileTable_yearly_compensation,0.8912052,0.8912052,0.0222686
country_SUMprofileTable_yearly_compensation,0.8884457,0.8884457,0.0221996
company_size,0.8597455,0.8597455,0.0214825


In [72]:
import pickle

# Step 1: Save the H2O model in its native format
best_model_path = h2o.save_model(best_model, path="../04_modelling/models/")
print(f"Tuned model saved to: {best_model_path}")

# Step 2: Save the model path in a `.pkl` file
pkl_file_path = "../04_modelling/models/h20_autoML_DL.pkl"
with open(pkl_file_path, "wb") as file:
    pickle.dump({"model_path": best_model_path}, file)

print(f"Model metadata saved as Pickle file to: {pkl_file_path}")

Tuned model saved to: C:\Users\Huawei\OneDrive - Universiti Malaya\Desktop\SEMESTER 7\WIE3007_Data-Mining\Group Project\data-mining-warehousing-wages-analysis\notebooks\04_modelling\models\DeepLearning_model_python_1735461052018_31
Model metadata saved as Pickle file to: ../04_modelling/models/h20_autoML_DL.pkl


In [73]:
# Shutdown H2O Cluster
h2o.cluster().shutdown()

H2O session _sid_941a closed.


## Archived

In [None]:
# from sklearn.metrics import mean_absolute_error
# import numpy as np

# # Train Multiple Models for Ensembling
# models = []
# for hidden in [[50, 50], [100, 100], [200, 200]]:
#     model = H2ODeepLearningEstimator(
#         hidden=hidden,
#         epochs=100,
#         seed=42
#     )
#     model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
#     models.append(model)

# # Ensemble Predictions (Averaging)
# predictions = [model.predict(test_h2o).as_data_frame()["predict"].to_numpy() for model in models]
# ensemble_prediction = np.mean(predictions, axis=0)

# # Evaluate Ensemble using Mean Absolute Error
# y_test = test_h2o[target].as_data_frame().to_numpy().ravel()
# mae_ensemble = mean_absolute_error(y_test, ensemble_prediction)
# print(f"Mean Absolute Error (MAE) for Ensemble: {mae_ensemble:.2f}")

# # Optional: Add R-squared Evaluation
# total_variance = np.sum((y_test - np.mean(y_test)) ** 2)
# residual_variance = np.sum((y_test - ensemble_prediction) ** 2)
# r_squared = 1 - (residual_variance / total_variance)
# print(f"R-squared for Ensemble: {r_squared:.2f}")


In [None]:
# # Random Grid Search 
# from h2o.grid.grid_search import H2OGridSearch

# # Define hyperparameters
# hyper_params = {
#     "epochs": list(range(50, 201, 50)),
#     "hidden": [[50, 50], [100, 100], [200, 200]],
#     "input_dropout_ratio": [i / 10.0 for i in range(0, 6)],
#     "l1": [1e-6, 1e-5, 1e-4],
#     "l2": [1e-6, 1e-5, 1e-4],
#     "activation": ["Rectifier", "Tanh", "Maxout"]
# }

# # Search criteria for random grid search
# search_criteria = {
#     'strategy': 'RandomDiscrete',  # Random search
#     'max_models': 20,  # Maximum number of models to train
#     'seed': 42  # For reproducibility
# }

# # Random grid search setup
# random_grid = H2OGridSearch(
#     model=H2ODeepLearningEstimator(seed=42),
#     grid_id='random_grid_dl',
#     hyper_params=hyper_params,
#     search_criteria=search_criteria
# )

# # Train models with random grid search
# random_grid.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Get the grid results, sorted by validation RMSE
# grid_results = random_grid.get_grid(sort_by="rmse", decreasing=False)

# # Get the best model based on RMSE
# best_model = grid_results.models[0]

# # Print details of the best model
# print("Best Random Search Model:")
# print(best_model)

# # Evaluate the best model on the test set
# performance = best_model.model_performance(test_h2o)

# # Print performance metrics
# print("Performance of Best Random Search Model:")
# print(f"RMSE: {performance.rmse():.2f}")
# print(f"MAE: {performance.mae():.2f}")
# print(f"R-squared: {performance.r2():.2f}")

In [None]:
# # Grid Search for Hyperparameter Tuning
# hyper_params = {
#     "epochs": [50, 100, 150],
#     "hidden": [[50, 50], [100, 100], [200, 200]],
#     "input_dropout_ratio": [0.0, 0.2, 0.4],
#     "l1": [1e-5, 1e-4, 1e-3],
#     "l2": [1e-5, 1e-4, 1e-3],
#     "activation": ["Rectifier", "Tanh", "Maxout"]
# }

# grid_search = H2OGridSearch(
#     H2ODeepLearningEstimator(seed=42),
#     hyper_params=hyper_params
# )

# # Train models with grid search
# grid_search.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Get the best model from the grid search
# best_model = grid_search.get_grid(sort_by="rmse", decreasing=False).models[0]
# print("Best Grid Search Model:")
# print(best_model)

# # Evaluate the best grid search model
# performance = best_model.model_performance(test_h2o)
# print("Best Grid Search Model Performance:")
# print(performance)