In [1]:
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
RANDOM_STATE = 42
TRAIN_TIME_SECS = 600
MAX_MODELS = 20
N_THREADS = -1

IMBALANCED_DATA_PATH = '../data/01_raw/original_imbalanced.csv'
PROCESSED_DATA_DIR = '../data/02_processed/'
SMOTE_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'smote_augmented_train.csv')

os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

In [4]:
def initialize_h2o():
    """Robust initialisation of H2O cluster"""
    h2o.init(nthreads=N_THREADS, max_mem_size="10g")

def run_automl_on_h20_frames(train_hf, valid_hf, test_hf, predictors, response, dataset_name):
    aml = H2OAutoML(
        max_runtime_secs=TRAIN_TIME_SECS,
        max_models=MAX_MODELS,
        seed=RANDOM_STATE,
        project_name=f"{dataset_name}_{int(time.time())}",
        nfolds=0,
        sort_metric="AUCPR"
    )
    aml.train(x=predictors, y=response, training_frame=train_hf, validation_frame=valid_hf)

    pref = aml.leader.model_performance(test_hf)

    perf = aml.leader.model_performance(test_hf)
    metrics = {
        "auc": perf.auc(), "auprc": perf.aucpr(), "f1": perf.F1()[0][1],
        "recall": perf.recall()[0][1], "precision": perf.precision()[0][1],
        "accuracy": perf.accuracy()[0][1]
    }
    return aml, metrics

In [5]:
# Load model
print("Loading imbalanced dataset...")
df_imbalanced = pd.read_csv(IMBALANCED_DATA_PATH)
response = "target"

X = df_imbalanced.drop(response, axis=1)
y = df_imbalanced[response]

Loading imbalanced dataset...


# Pre SMOTE data splitting

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train)

print("Original training set distribution")
print(y_train.value_counts(normalize=True))

Original training set distribution
target
0    0.920003
1    0.079997
Name: proportion, dtype: float64


# Apply SMOTE to the training set

In [7]:
print("\nApplying SMOTE...")
X_train_numeric = pd.get_dummies(X_train)
smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=5)

X_train_smote, y_train_smote = smote.fit_resample(X_train_numeric, y_train)

print("\nSMOTE augmented training set distribution:")
print(y_train_smote.value_counts(normalize=True))


Applying SMOTE...

SMOTE augmented training set distribution:
target
0    0.5
1    0.5
Name: proportion, dtype: float64


# Prepare Dataframe for H2O

In [8]:
df_train_smote = pd.concat([pd.DataFrame(X_train_smote), pd.DataFrame(y_train_smote)], axis=1)

df_val = pd.concat([X_val, y_val], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

df_val_encoded = pd.get_dummies(df_val).reindex(columns=df_train_smote.columns, fill_value=0)
df_test_encoded = pd.get_dummies(df_test).reindex(columns=df_train_smote.columns, fill_value=0)

# Train and Evaluate Model

In [9]:
initialize_h2o()

train_hf = h2o.H2OFrame(df_train_smote)
valid_hf = h2o.H2OFrame(df_val_encoded)
test_hf = h2o.H2OFrame(df_test_encoded)

# Define predictors and response
predictors = train_hf.columns
predictors.remove(response)
train_hf[response] = train_hf[response].asfactor()
valid_hf[response] = valid_hf[response].asfactor()
test_hf[response] = test_hf[response].asfactor()

# Run AutoML
print("\nTraining AutoML on Smote Aug data...")
aml_smote, metrics_smote = run_automl_on_h20_frames(train_hf, valid_hf, test_hf, predictors, response, "smote_augmented")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Temurin-17.0.16+8 (build 17.0.16+8, mixed mode, sharing)
  Starting server from C:\Project\Synthetic Intelligence\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\bharg\AppData\Local\Temp\tmpzprj4ft3
  JVM stdout: C:\Users\bharg\AppData\Local\Temp\tmpzprj4ft3\h2o_bharg_started_from_python.out
  JVM stderr: C:\Users\bharg\AppData\Local\Temp\tmpzprj4ft3\h2o_bharg_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 5 days
H2O_cluster_name:,H2O_from_python_bharg_grbbsh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,10 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%

Training AutoML on Smote Aug data...
AutoML progress: |
01:36:04.400: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Display results
print("\n SMOTE Model Performance on Original Test Set")
df_metrics_smote = pd.DataFrame([metrics_smote], index=['SMOTE']).round(4)
print(df_metrics_smote)

df_train_smote.to_csv(SMOTE_DATA_PATH, index=False)
print(f"\nSaved SMOTE-augmented training data to {SMOTE_DATA_PATH}")

h2o.cluster().shutdown()


 SMOTE Model Performance on Original Test Set
          auc   auprc     f1  recall  precision  accuracy
SMOTE  0.9706  0.7344  0.803     1.0        1.0    0.9657

Saved SMOTE-augmented training data to ../data/02_processed/smote_augmented_train.csv
H2O session _sid_b208 closed.
