In [3]:
import os
import time
import pandas as pd
import numpy as np
from h2o.automl import H2OAutoML
import h2o


In [2]:
# --- Reproducibility and generation settings ---------------------------------
# Seed passed to H2O wherever a seed is accepted in this notebook.
RANDOM_STATE = 42
# Generation knobs; presumably consumed by later cells (not visible here).
N_SYNTHETIC_SAMPLES_TO_GENERATE = 250_000
MODEL_CONFIDENCE_THRESHOLD = 0.75
MAX_GENERATION_ATTEMPTS = 1_000_000

# --- Filesystem layout (relative to the notebook's working directory) --------
IMBALANCED_DATA_PATH = "../data/01_raw/original_imbalanced.csv"
PROCESSED_DATA_DIR = "../data/02_processed/"
MODEL_DRIVEN_SYNTHETIC_PATH = os.path.join(
    PROCESSED_DATA_DIR,
    "model_driven_synthetic.csv",
)
MODELS_DIR = "../models/"

# Ensure the model directory exists; a no-op when it is already present.
os.makedirs(MODELS_DIR, exist_ok=True)

In [4]:
# Connect to a running local H2O instance, or start one if none is found,
# with a 10 GB memory cap and no explicit limit on threads (nthreads=-1).
h2o.init(nthreads=-1, max_mem_size="10g")

# Name of the label column; every other column is used as a predictor.
response = "target"

# Read the raw imbalanced CSV with pandas, then upload it to the H2O cluster.
df_imbalanced = pd.read_csv(IMBALANCED_DATA_PATH)
hf_imbalanced = h2o.H2OFrame(df_imbalanced)
predictors = list(filter(lambda col: col != response, hf_imbalanced.columns))

# Convert the label column to a factor (categorical) column.
hf_imbalanced[response] = hf_imbalanced[response].asfactor()

# Seeded three-way split: roughly 70% train, 15% validation, remainder test.
train_i, valid_i, test_i = hf_imbalanced.split_frame(
    ratios=[0.7, 0.15],
    seed=RANDOM_STATE,
)

# Obtain the leader model used for generation: reuse the copy saved by an
# earlier run when it exists, otherwise train with AutoML and persist the leader.
# The filename is defined once so the cache check and the save call cannot drift apart.
leader_model_filename = "imbalanced_leader.model"
leader_model_path = os.path.join(MODELS_DIR, leader_model_filename)

if os.path.exists(leader_model_path):
    print("Loading saved imbalanced model...")
    imbalanced_leader_model = h2o.load_model(leader_model_path)
else:
    print("Training and saving imbalanced model...")
    # Budget: at most 3600 seconds and at most 20 models; leaderboard ranked by AUCPR.
    aml_imb = H2OAutoML(
        max_runtime_secs=3600,
        max_models=20,
        seed=RANDOM_STATE,
        project_name="imbalanced_for_generation",
        sort_metric="AUCPR",
    )
    # NOTE: cross-validation is still enabled here, so H2O validates models with
    # cross-validation only and uses valid_i purely for informative metrics
    # (H2O logs a warning to that effect when training starts).
    aml_imb.train(
        x=predictors,
        y=response,
        training_frame=train_i,
        validation_frame=valid_i,
    )
    imbalanced_leader_model = aml_imb.leader
    # force=True overwrites any existing file with the same name.
    h2o.save_model(
        model=imbalanced_leader_model,
        path=MODELS_DIR,
        filename=leader_model_filename,
        force=True,
    )

print("Imbalanced Leader Model for Generation:")
print(imbalanced_leader_model.model_id)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Temurin-17.0.16+8 (build 17.0.16+8, mixed mode, sharing)
  Starting server from C:\Project\Synthetic Intelligence\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\bharg\AppData\Local\Temp\tmpn9hivmq9
  JVM stdout: C:\Users\bharg\AppData\Local\Temp\tmpn9hivmq9\h2o_bharg_started_from_python.out
  JVM stderr: C:\Users\bharg\AppData\Local\Temp\tmpn9hivmq9\h2o_bharg_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 5 days
H2O_cluster_name:,H2O_from_python_bharg_cfx5me
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,10 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Training and saving imbalanced model...
AutoML progress: |
13:06:55.362: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
13:06:55.373: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Imbalanced Leader Model for Generation:
StackedEnsemble_AllModels_1_AutoML_1_20250902_130655
