### Import Library

In [41]:
import os
import joblib
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)
from sklearn.ensemble import RandomForestRegressor

# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv").squeeze()  # Convert to Series
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv").squeeze()    # Convert to Series

# Function to load scikit-learn models
def load_sklearn_model(file_name):
    with open(file_name, "rb") as file:
        return joblib.load(file)

# Function to record evaluation scores
def record_scores(results_df, **metrics):
    """
    Append evaluation scores to the results DataFrame.
    """
    new_row = pd.DataFrame([metrics])
    results_df = pd.concat([results_df, new_row], ignore_index=True)
    return results_df

# Ensure y datasets are flattened
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Function to evaluate models
def evaluate_models(models, X_test, y_test, results_df):
    """
    Evaluate models and handle specific preprocessing requirements.
    """
    for model_name, model in models.items():
        try:
            if isinstance(model, RandomForestRegressor):  # Handle regression models
                predictions = model.predict(X_test)
                
                # Regression metrics
                mse = mean_squared_error(y_test, predictions)
                mae = mean_absolute_error(y_test, predictions)
                r2 = r2_score(y_test, predictions)
                
                results_df = record_scores(
                    results_df,
                    Model=model_name,
                    MSE=mse,
                    MAE=mae,
                    R2=r2,
                    Accuracy=None,
                    Precision=None,
                    Recall=None,
                    F1_Score=None,
                    AUC_ROC=None,
                )
            else:  # Handle classification models
                predictions = model.predict(X_test)
                predictions_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

                accuracy = accuracy_score(y_test, predictions)
                precision = precision_score(y_test, predictions, average="weighted")
                recall = recall_score(y_test, predictions, average="weighted")
                f1 = f1_score(y_test, predictions, average="weighted")
                
                # Handle AUC-ROC
                if predictions_proba is not None and len(set(y_test)) > 2:
                    auc_roc = roc_auc_score(pd.get_dummies(y_test), predictions_proba, multi_class="ovr")
                elif predictions_proba is not None:
                    auc_roc = roc_auc_score(y_test, predictions_proba[:, 1])
                else:
                    auc_roc = None

                results_df = record_scores(
                    results_df,
                    Model=model_name,
                    Accuracy=accuracy,
                    Precision=precision,
                    Recall=recall,
                    F1_Score=f1,
                    AUC_ROC=auc_roc,
                    MSE=None,
                    MAE=None,
                    R2=None,
                )
        except Exception as e:
            print(f"Error evaluating model {model_name}: {e}")
    return results_df

# Define scikit-learn model files
sklearn_model_files = [
    "../04_modelling/models/dtree_model.pkl",
    "../04_modelling/models/mlp_neural_net.pkl",
    "../04_modelling/models/random_forest_model.pkl",
]

# Load scikit-learn models
sklearn_models = {}
for file_name in sklearn_model_files:
    model_name = os.path.splitext(os.path.basename(file_name))[0]
    try:
        sklearn_models[model_name] = load_sklearn_model(file_name)
        print(f"Loaded scikit-learn model: {model_name}")
    except Exception as e:
        print(f"Error loading scikit-learn model {model_name}: {e}")

# Prepare the results DataFrame
results = pd.DataFrame(columns=[
    "Model", "Accuracy", "Precision", "Recall", "F1_Score", "AUC_ROC", "MSE", "MAE", "R2"
])

# Evaluate scikit-learn models
results = evaluate_models(sklearn_models, X_test, y_test, results)

# Display results
results

Loaded scikit-learn model: dtree_model
Loaded scikit-learn model: mlp_neural_net
Loaded scikit-learn model: random_forest_model


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score,AUC_ROC,MSE,MAE,R2
0,dtree_model,0.555556,0.543476,0.555556,0.538275,0.955129,,,
1,mlp_neural_net,0.898291,0.898978,0.898291,0.897737,0.996135,,,
2,random_forest_model,,,,,,7.372916,1.973631,0.871482


In [47]:
import os
import zipfile
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import h2o
from h2o.estimators import H2OEstimator

In [48]:
# Initialize H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,57 mins 15 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 29 days
H2O_cluster_name:,H2O_from_python_Huawei_r97o7a
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.880 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [49]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")  # Convert to Series
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")  # Convert to Series

In [50]:
y_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170 entries, 0 to 1169
Data columns (total 1 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   yearly_compensation  1170 non-null   int64
dtypes: int64(1)
memory usage: 9.3 KB


In [51]:
train_df = pd.concat([X_train, y_train], axis=1)

In [52]:
train_h2o = h2o.H2OFrame(train_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [53]:
# Define target and features
target = "yearly_compensation"
features = train_h2o.columns
if target in features:
    features.remove(target)

In [54]:
# Function to load scikit-learn models
def load_sklearn_model(file_name):
    with open(file_name, "rb") as file:
        return joblib.load(file)

In [56]:
# Function to load H2O models
def load_h2o_model(file_name):
    return h2o.load_model(file_name)

In [57]:
# Function to load zipped H2O models (ensemble or similar)
def load_zipped_h2o_model(file_name):
    with zipfile.ZipFile(file_name, "r") as zip_ref:
        zip_ref.extractall("temp_models")
    extracted_files = os.listdir("temp_models")
    model_path = next((os.path.join("temp_models", f) for f in extracted_files if not f.endswith('.zip')), None)
    if model_path:
        return load_h2o_model(model_path)
    raise ValueError("No valid model found in the zip file.")

In [58]:
# Function to load models based on file type
def load_model(file_name):
    if file_name.endswith(".pkl"):
        return load_sklearn_model(file_name)
    elif file_name.endswith(".zip"):
        return load_zipped_h2o_model(file_name)
    else:
        return load_h2o_model(file_name)

In [59]:
# Function to record evaluation scores
def record_scores(model_name, accuracy, precision, recall, f1, auc_roc, results_df):
    new_row = pd.DataFrame({
        "Model": [model_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1],
        "AUC-ROC": [auc_roc]
    })
    return pd.concat([results_df, new_row], ignore_index=True)

In [None]:
# Function to evaluate H2O models
def evaluate_h2o_models(models, X_test, y_test, results_df):
    h2o_test = h2o.H2OFrame(X_test)  # Convert test data to H2OFrame
    for model_name, model in models.items():
        try:
            if isinstance(model, H2OEstimator):
                # Get predictions
                predictions_df = model.predict(h2o_test).as_data_frame(use_multi_thread=True)
                predictions = predictions_df["predict"]  # Predicted class labels
                
                # Convert predictions and y_test to compatible formats
                predictions = pd.Categorical(predictions)
                y_test = pd.Categorical(y_test)
                
                # Ensure metrics align
                accuracy = accuracy_score(y_test, predictions)
                precision = precision_score(y_test, predictions, average="weighted")
                recall = recall_score(y_test, predictions, average="weighted")
                f1 = f1_score(y_test, predictions, average="weighted")
                
                # Handle probabilities for AUC-ROC
                if "predict" in predictions_df:
                    predictions_proba = predictions_df.drop(columns=["predict"])
                    if len(set(y_test)) > 2:  # Multiclass
                        auc_roc = roc_auc_score(pd.get_dummies(y_test), predictions_proba, multi_class="ovr")
                    else:  # Binary
                        auc_roc = roc_auc_score(y_test, predictions_proba.iloc[:, 1])
                else:
                    auc_roc = None

                # Record scores
                results_df = record_scores(model_name, accuracy, precision, recall, f1, auc_roc, results_df)

        except Exception as e:
            print(f"Error evaluating H2O model {model_name}: {e}")
    return results_df

In [60]:
# Define model files
model_files = [
    "../04_modelling/models/DeepLearning_model_python_1735461052018_31",
    "../04_modelling/models/StackedEnsemble_model_python_1735565212779_5",
    # "../04_modelling/models/voted_ensemble_model.zip"
]

# Load models
loaded_models = {}
for file_name in model_files:
    model_name = os.path.splitext(os.path.basename(file_name))[0]
    try:
        loaded_models[model_name] = load_model(file_name)
        print(f"Loaded model: {model_name}")
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
loaded_models

Loaded model: DeepLearning_model_python_1735461052018_31
Loaded model: StackedEnsemble_model_python_1735565212779_5


{'DeepLearning_model_python_1735461052018_31': Model Details
H2ODeepLearningEstimator : Deep Learning
Model Key: DeepLearning_model_python_1735461052018_31


Status of Neuron Layers: predicting yearly_compensation, regression, gaussian distribution, Quadratic loss, 10,251 weights/biases, 138.0 KB, 202,094 training samples, mini-batch size 1
    layer    units    type    dropout              l1                     l2                      mean_rate           rate_rms             momentum    mean_weight             weight_rms            mean_bias              bias_rms
--  -------  -------  ------  -------------------  ---------------------  ----------------------  ------------------  -------------------  ----------  ----------------------  --------------------  ---------------------  -----------------------
    1        50       Input   0.13062179562149737
    2        50       Maxout  0.0                  0.0008734285704456857  0.00044395405309485716  0.4733223503382411  0.11678057909011

In [None]:
loaded_models.items()

TypeError: items() takes no arguments (1 given)

In [None]:

# # Filter H2O models
# h2o_models = {name: model for name, model in loaded_models.items() if isinstance(model, H2OEstimator)}

# # Prepare results DataFrame
# results = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "AUC-ROC"])

# # Evaluate H2O models
# results = evaluate_h2o_models(h2o_models, X_test, y_test_cat, results)

# # Display results
# results

### Archive

In [22]:
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import h2o
import os
import zipfile
from h2o.estimators.estimator_base import H2OEstimator
import joblib

In [23]:
# Start the H2O server for AutoML models
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321.

 connected.


0,1
H2O_cluster_uptime:,7 mins 00 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 29 days
H2O_cluster_name:,H2O_from_python_Huawei_r97o7a
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.952 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


### Load Model helper function

In [24]:
# Initialize dictionary to store loaded models
loaded_models = {}

# Define model files
model_files = [
    "../04_modelling/models/catboost_grid_search.pkl",
    "../04_modelling/models/DeepLearning_model_python_1735461052018_31",
    "../04_modelling/models/dtree_model.pkl",
    "../04_modelling/models/mlp_neural_net.pkl",
    "../04_modelling/models/random_forest_model.pkl",
    "../04_modelling/models/StackedEnsemble_model_python_1735565212779_5",
    "../04_modelling/models/voted_ensemble_model.zip"
]

# Function to load scikit-learn models
def load_sklearn_model(file_name):
    with open(file_name, "rb") as file:
        return joblib.load(file)

# Function to load H2O models
def load_h2o_model(file_name):
    return h2o.load_model(file_name)

# Function to load a model based on its file type
def load_model(file_name):
    if file_name.endswith(".pkl"):
        return load_sklearn_model(file_name)
    elif file_name.endswith(".zip"):
        # Assuming zip contains a model file
        with zipfile.ZipFile(file_name, "r") as zip_ref:
            zip_ref.extractall("temp_models")
        # Add specific logic for ensemble models here if necessary
        return None  # Placeholder
    else:
        return load_h2o_model(file_name)

# Load all models
for file_name in model_files:
    model_name = os.path.splitext(os.path.basename(file_name))[0]
    try:
        loaded_models[model_name] = load_model(file_name)
        print(f"Loaded model: {model_name}")
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")

Error loading model catboost_grid_search: DLL load failed while importing _catboost: Not enough memory resources are available to process this command.
Loaded model: DeepLearning_model_python_1735461052018_31
Loaded model: dtree_model
Loaded model: mlp_neural_net
Loaded model: random_forest_model
Loaded model: StackedEnsemble_model_python_1735565212779_5
Loaded model: voted_ensemble_model


In [25]:
loaded_models

{'DeepLearning_model_python_1735461052018_31': Model Details
H2ODeepLearningEstimator : Deep Learning
Model Key: DeepLearning_model_python_1735461052018_31


Status of Neuron Layers: predicting yearly_compensation, regression, gaussian distribution, Quadratic loss, 10,251 weights/biases, 138.0 KB, 202,094 training samples, mini-batch size 1
    layer    units    type    dropout              l1                     l2                      mean_rate           rate_rms             momentum    mean_weight             weight_rms            mean_bias              bias_rms
--  -------  -------  ------  -------------------  ---------------------  ----------------------  ------------------  -------------------  ----------  ----------------------  --------------------  ---------------------  -----------------------
    1        50       Input   0.13062179562149737
    2        50       Maxout  0.0                  0.0008734285704456857  0.00044395405309485716  0.4733223503382411  0.11678057909011

### Model Evaluation 

In [26]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv").squeeze()  # Convert to Series if needed
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv").squeeze()    # Convert to Series if needed

In [27]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Function to record evaluation scores
def record_scores(model_name, accuracy, precision, recall, f1, auc_roc, results_df):
    """
    Append evaluation scores to the results DataFrame using pd.concat.
    """
    new_row = pd.DataFrame({
        "Model": [model_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1],
        "AUC-ROC": [auc_roc]
    })
    results_df = pd.concat([results_df, new_row], ignore_index=True)
    return results_df

# Function to evaluate scikit-learn models
def evaluate_sklearn_models(models, X_test, y_test, results_df):
    """
    Evaluate scikit-learn models.
    """
    for model_name, model in models.items():
        try:
            predictions = model.predict(X_test)
            predictions_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions, average="weighted")
            recall = recall_score(y_test, predictions, average="weighted")
            f1 = f1_score(y_test, predictions, average="weighted")
            
            # Handle AUC-ROC for multiclass
            if predictions_proba is not None and len(set(y_test)) > 2:
                auc_roc = roc_auc_score(pd.get_dummies(y_test), predictions_proba, multi_class="ovr")
            elif predictions_proba is not None:
                auc_roc = roc_auc_score(y_test, predictions_proba[:, 1])
            else:
                auc_roc = None

            results_df = record_scores(model_name, accuracy, precision, recall, f1, auc_roc, results_df)

        except Exception as e:
            print(f"Error evaluating scikit-learn model {model_name}: {e}")
    return results_df

In [28]:
# Function to evaluate scikit-learn models
def evaluate_sklearn_models(models, X_test, y_test, results_df):
    """
    Evaluate scikit-learn models.
    """
    for model_name, model in models.items():
        try:
            predictions = model.predict(X_test)
            predictions_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions, average="weighted")
            recall = recall_score(y_test, predictions, average="weighted")
            f1 = f1_score(y_test, predictions, average="weighted")
            auc_roc = (
                roc_auc_score(y_test, predictions_proba, multi_class="ovr") if predictions_proba is not None else None
            )

            results_df = record_scores(model_name, accuracy, precision, recall, f1, auc_roc, results_df)

        except Exception as e:
            print(f"Error evaluating scikit-learn model {model_name}: {e}")
    return results_df

In [38]:
import h2o
from h2o.estimators import H2OEstimator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

# Function to evaluate H2O models
def evaluate_h2o_models(models, X_test, y_test, results_df):
    """
    Evaluate H2O models.
    """
    h2o_test = h2o.H2OFrame(X_test)  # Convert test data to H2OFrame
    for model_name, model in models.items():
        try:
            if isinstance(model, H2OEstimator):
                # Get predictions
                predictions_df = model.predict(h2o_test).as_data_frame(use_multi_thread=True)
                predictions = predictions_df["predict"]  # Predicted classes

                # Check for probabilities if needed
                predictions_proba = predictions_df.drop(columns=["predict"], errors="ignore") if "predict" in predictions_df else None

                # Ensure `y_test` is categorical if predictions are categorical
                y_test = pd.Categorical(y_test) if isinstance(y_test, pd.Series) else y_test

                # Calculate metrics
                accuracy = accuracy_score(y_test, predictions)
                precision = precision_score(y_test, predictions, average="weighted")
                recall = recall_score(y_test, predictions, average="weighted")
                f1 = f1_score(y_test, predictions, average="weighted")

                # Handle AUC-ROC for binary or multiclass
                if predictions_proba is not None:
                    if len(set(y_test)) > 2:  # Multiclass
                        auc_roc = roc_auc_score(pd.get_dummies(y_test), predictions_proba, multi_class="ovr")
                    else:  # Binary
                        auc_roc = roc_auc_score(y_test, predictions_proba.iloc[:, 1])
                else:
                    auc_roc = None

                # Record scores
                results_df = record_scores(model_name, accuracy, precision, recall, f1, auc_roc, results_df)

        except Exception as e:
            print(f"Error evaluating H2O model {model_name}: {e}")
    return results_df

In [30]:
# Separate models into scikit-learn and H2O categories
sklearn_models = {
    name: model for name, model in loaded_models.items() if not isinstance(model, H2OEstimator)
}
h2o_models = {
    name: model for name, model in loaded_models.items() if isinstance(model, H2OEstimator)
}

In [31]:
sklearn_models

{'dtree_model': BaggingClassifier(bootstrap=False,
                   estimator=DecisionTreeClassifier(criterion='entropy',
                                                    max_depth=16,
                                                    min_samples_leaf=4,
                                                    min_samples_split=9,
                                                    random_state=123),
                   max_features=0.6986074415599616,
                   max_samples=0.7080360245310086, n_estimators=74,
                   random_state=123),
 'mlp_neural_net': MLPClassifier(activation='identity', alpha=0.0002712909526978834,
               hidden_layer_sizes=(50, 50), learning_rate='adaptive',
               max_iter=500, random_state=123, solver='lbfgs'),
 'random_forest_model': RandomForestRegressor(max_features=None, min_samples_leaf=2, n_estimators=500,
                       random_state=42),
 'voted_ensemble_model': None}

In [32]:
# Prepare the results DataFrame
results = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "AUC-ROC"])

In [33]:
results = evaluate_sklearn_models(sklearn_models, X_test, y_test, results)

Error evaluating scikit-learn model random_forest_model: Classification metrics can't handle a mix of multiclass and continuous targets
Error evaluating scikit-learn model voted_ensemble_model: 'NoneType' object has no attribute 'predict'


In [39]:
results = evaluate_h2o_models(h2o_models, X_test, y_test, results)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Error evaluating H2O model DeepLearning_model_python_1735461052018_31: Classification metrics can't handle a mix of multiclass and continuous targets
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Error evaluating H2O model StackedEnsemble_model_python_1735565212779_5: Classification metrics can't handle a mix of multiclass and continuous targets


In [35]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,AUC-ROC
0,dtree_model,0.555556,0.543476,0.555556,0.538275,0.955129
1,mlp_neural_net,0.898291,0.898978,0.898291,0.897737,0.996135


In [1]:
import os
import pandas as pd
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Initialize H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Temurin-17.0.12+7 (build 17.0.12+7, mixed mode, sharing)
  Starting server from C:\Users\Huawei\OneDrive - Universiti Malaya\Desktop\SEMESTER 7\WIE3007_Data-Mining\Group Project\data-mining-warehousing-wages-analysis\venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Huawei\AppData\Local\Temp\tmp58f_ikfo
  JVM stdout: C:\Users\Huawei\AppData\Local\Temp\tmp58f_ikfo\h2o_Huawei_started_from_python.out
  JVM stderr: C:\Users\Huawei\AppData\Local\Temp\tmp58f_ikfo\h2o_Huawei_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 30 days
H2O_cluster_name:,H2O_from_python_Huawei_5wops1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.961 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
Training Performance:
ModelMetricsRegression: deeplearning
** Reported on train data. **

MSE: 0.11935238463856258
RMSE: 0.3454741446744786
MAE: 0.2704941605232003
RMSLE: 0.10651841963076515
Mean Residual Deviance: 0.11935238463856258

Test Performance:
ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.19339239352761498
RMSE: 0.43976402027407263
MAE: 0.33467287406073365
RMSLE: 0.12822266355020856
Mean Residual Deviance: 0.19339239352761498
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%

Additional Evaluation Metrics:
MSE: 0.19339239352761528
RMSE: 0.43976402027407296
MAE: 0.334672874060734
R^2 Score: 0.996628950635985
Model saved to: C




In [None]:
# Initialize dictionary to store loaded models
loaded_models = {}

# Define model files
model_files = [
    "../04_modelling/models/catboost_grid_search.pkl",
    "../04_modelling/models/DeepLearning_model_python_1735461052018_31",
    "../04_modelling/models/dtree_model.pkl",
    "../04_modelling/models/mlp_neural_net.pkl",
    "../04_modelling/models/random_forest_model.pkl",
    "../04_modelling/models/StackedEnsemble_model_python_1735565212779_5",
    "../04_modelling/models/voted_ensemble_model.zip"
]

# Function to load scikit-learn models
def load_sklearn_model(file_name):
    with open(file_name, "rb") as file:
        return joblib.load(file)

# Function to load H2O models
def load_h2o_model(file_name):
    return h2o.load_model(file_name)

# Function to load a model based on its file type
def load_model(file_name):
    if file_name.endswith(".pkl"):
        return load_sklearn_model(file_name)
    elif file_name.endswith(".zip"):
        # Assuming zip contains a model file
        with zipfile.ZipFile(file_name, "r") as zip_ref:
            zip_ref.extractall("temp_models")
        # Add specific logic for ensemble models here if necessary
        return None  # Placeholder
    else:
        return load_h2o_model(file_name)

# Load all models
for file_name in model_files:
    model_name = os.path.splitext(os.path.basename(file_name))[0]
    try:
        loaded_models[model_name] = load_model(file_name)
        print(f"Loaded model: {model_name}")
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")

In [None]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")  # Ensure this is a DataFrame or Series
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")  # Ensure this is a DataFrame or Series

In [None]:
# Combine X_train and y_train into one DataFrame for H2OFrame
train_df = pd.concat([X_train, y_train], axis=1)
train_h2o = h2o.H2OFrame(train_df)

In [None]:
# Combine X_test and y_test into one DataFrame for H2OFrame
test_df = pd.concat([X_test, y_test], axis=1)
test_h2o = h2o.H2OFrame(test_df)

In [None]:
# Define target and features
target = y_train.columns[0]  # Assuming the target column name is present in y_train
features = X_train.columns.tolist()

In [None]:
# Evaluate the model
def evaluate_model(model, train_h2o, test_h2o, y_test):
    # Training performance
    train_performance = model.model_performance(train=True)
    print("Training Performance:")
    print(train_performance)

    # Validation performance
    test_performance = model.model_performance(test_data=test_h2o)
    print("\nTest Performance:")
    print(test_performance)

    # Extracting predictions
    predictions = model.predict(test_h2o).as_data_frame().values.flatten()

    # Calculate additional metrics using sklearn
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("\nAdditional Evaluation Metrics:")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R^2 Score: {r2}")

In [None]:
# Call the evaluation function
evaluate_model(loaded_models, train_h2o, test_h2o, y_test)

In [2]:
import os
import zipfile
import pandas as pd
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Initialize H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,16 mins 01 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 30 days
H2O_cluster_name:,H2O_from_python_Huawei_5wops1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.884 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [9]:
# Initialize dictionary to store loaded models
loaded_models = {}

In [3]:
# Define model files
model_files = [
    "../04_modelling/models/DeepLearning_model_python_1735461052018_31",
    "../04_modelling/models/StackedEnsemble_model_python_1735565212779_5",
    # "../04_modelling/models/voted_ensemble_model.zip"
]

In [4]:
# Function to load H2O models
def load_h2o_model(file_name):
    return h2o.load_model(file_name)

In [6]:
# Function to load a model based on its file type
def load_model(file_name):
    if file_name.endswith(".pkl"):
        return load_sklearn_model(file_name)
    elif file_name.endswith(".zip"):
        # Assuming zip contains a model file
        with zipfile.ZipFile(file_name, "r") as zip_ref:
            zip_ref.extractall("temp_models")
        # Add specific logic for ensemble models here if necessary
        return None  # Placeholder
    else:
        return load_h2o_model(file_name)

In [10]:
# Load all models
for file_name in model_files:
    model_name = os.path.splitext(os.path.basename(file_name))[0]
    try:
        loaded_models[model_name] = load_model(file_name)
        print(f"Loaded model: {model_name}")
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")

Loaded model: DeepLearning_model_python_1735461052018_31
Loaded model: StackedEnsemble_model_python_1735565212779_5


In [11]:
loaded_models

{'DeepLearning_model_python_1735461052018_31': Model Details
H2ODeepLearningEstimator : Deep Learning
Model Key: DeepLearning_model_python_1735461052018_31


Status of Neuron Layers: predicting yearly_compensation, regression, gaussian distribution, Quadratic loss, 10,251 weights/biases, 138.0 KB, 202,094 training samples, mini-batch size 1
    layer    units    type    dropout              l1                     l2                      mean_rate           rate_rms             momentum    mean_weight             weight_rms            mean_bias              bias_rms
--  -------  -------  ------  -------------------  ---------------------  ----------------------  ------------------  -------------------  ----------  ----------------------  --------------------  ---------------------  -----------------------
    1        50       Input   0.13062179562149737
    2        50       Maxout  0.0                  0.0008734285704456857  0.00044395405309485716  0.4733223503382411  0.11678057909011

In [12]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")  # Ensure this is a DataFrame or Series
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")  # Ensure this is a DataFrame or Series

In [13]:
# Combine X_test and y_test into one DataFrame for H2OFrame
test_df = pd.concat([X_test, y_test], axis=1)
test_h2o = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [14]:
# Initialize a DataFrame to store model evaluation results
results = pd.DataFrame(columns=["Model", "MSE", "RMSE", "MAE", "R2"])

In [15]:
# Evaluate the model and record performance
def evaluate_model(model_name, model, test_h2o, y_test):
    try:
        # Validation performance
        test_performance = model.model_performance(test_data=test_h2o)
        print(f"\nTest Performance for {model_name}:")
        print(test_performance)

        # Extracting predictions
        predictions = model.predict(test_h2o).as_data_frame().values.flatten()

        # Calculate additional metrics using sklearn
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)

        print(f"\nAdditional Evaluation Metrics for {model_name}:")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"MAE: {mae}")
        print(f"R^2 Score: {r2}")

        # Append results to the DataFrame
        results.loc[len(results)] = [model_name, mse, rmse, mae, r2]
    except Exception as e:
        print(f"Error evaluating model {model_name}: {e}")

In [16]:
# Evaluate each loaded model
for model_name, model in loaded_models.items():
    evaluate_model(model_name, model, test_h2o, y_test)


Test Performance for DeepLearning_model_python_1735461052018_31:
ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.06630532219303646
RMSE: 0.25749819842677824
MAE: 0.19925882702258896
RMSLE: 0.09276666964802921
Mean Residual Deviance: 0.06630532219303646
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%






Additional Evaluation Metrics for DeepLearning_model_python_1735461052018_31:
MSE: 0.06630532219303649
RMSE: 0.2574981984267783
MAE: 0.199258827022589
R^2 Score: 0.9988442228252492

Test Performance for StackedEnsemble_model_python_1735565212779_5:
ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 0.5516425154961121
RMSE: 0.7427264068929501
MAE: 0.536177505742845
RMSLE: NaN
Mean Residual Deviance: 0.5516425154961121
R^2: 0.990384243572841
Null degrees of freedom: 1169
Residual degrees of freedom: 1167
Null deviance: 67121.28489254977
Residual deviance: 645.4217431304512
AIC: 2632.3357482544166
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%






Additional Evaluation Metrics for StackedEnsemble_model_python_1735565212779_5:
MSE: 0.5516425154961121
RMSE: 0.7427264068929501
MAE: 0.5361775057428452
R^2 Score: 0.990384243572841


In [17]:
# Display and save the evaluation results
print("\nModel Evaluation Results:")
results


Model Evaluation Results:


Unnamed: 0,Model,MSE,RMSE,MAE,R2
0,DeepLearning_model_python_1735461052018_31,0.066305,0.257498,0.199259,0.998844
1,StackedEnsemble_model_python_1735565212779_5,0.551643,0.742726,0.536178,0.990384


In [36]:
# # Evaluate each model
# for model_name, model in loaded_models.items():
#     try:
#         # Check if model is from scikit-learn or H2O
#         if isinstance(model, h2o.estimators.estimator_base.H2OEstimator):
#             h2o_test = h2o.H2OFrame(X_test)
#             predictions = model.predict(h2o_test).as_data_frame(use_multi_thread=True)["predict"]
#             predictions_proba = model.predict(h2o_test).as_data_frame(use_multi_thread=True)
#         else:
#             predictions = model.predict(X_test)
#             predictions_proba = model.predict_proba(X_test)

#         # Compute metrics
#         accuracy = accuracy_score(y_test, predictions)
#         precision = precision_score(y_test, predictions, average="weighted")
#         recall = recall_score(y_test, predictions, average="weighted")
#         f1 = f1_score(y_test, predictions, average="weighted")
#         auc_roc = roc_auc_score(y_test, predictions_proba, multi_class="ovr")

#         # Append results
#         results = results.append({
#             "Model": model_name,
#             "Accuracy": accuracy,
#             "Precision": precision,
#             "Recall": recall,
#             "F1-Score": f1,
#             "AUC-ROC": auc_roc
#         }, ignore_index=True)

#     except Exception as e:
#         print(f"Error evaluating model {model_name}: {e}")

### Visualize Results

In [37]:
# import matplotlib.pyplot as plt

# # Display the results table
# print(results)

# # Plot the results
# results.set_index("Model").plot(kind="bar", figsize=(10, 6))
# plt.title("Model Performance Metrics")
# plt.ylabel("Score")
# plt.xticks(rotation=45)
# plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()
# plt.show()