In [5]:
import os
import time
import pandas as pd
import numpy as np
from h2o.automl import H2OAutoML
import h2o


In [6]:
RANDOM_STATE = 42
N_SYNTHETIC_SAMPLES_TO_GENERATE = 250000 
MODEL_CONFIDENCE_THRESHOLD = 0.75
MAX_GENERATION_ATTEMPTS = 1000000 

IMBALANCED_DATA_PATH = '../data/01_raw/original_imbalanced.csv'
PROCESSED_DATA_DIR = '../data/02_processed/'
MODEL_DRIVEN_SYNTHETIC_PATH = os.path.join(PROCESSED_DATA_DIR, 'model_driven_synthetic.csv')
MODELS_DIR = '../models/'
os.makedirs(MODELS_DIR, exist_ok=True)

In [7]:
h2o.init(nthreads=-1, max_mem_size="10g")
df_imbalanced = pd.read_csv(IMBALANCED_DATA_PATH)
hf_imbalanced = h2o.H2OFrame(df_imbalanced)
response = "target"
predictors = [c for c in hf_imbalanced.columns if c != response]
hf_imbalanced[response] = hf_imbalanced[response].asfactor()
train_i, valid_i, test_i = hf_imbalanced.split_frame(ratios=[0.7, 0.15], seed=RANDOM_STATE)

leader_model_path = os.path.join(MODELS_DIR, "imbalanced_leader.model")
if os.path.exists(leader_model_path):
    print("Loading saved imbalanced model...")
    imbalanced_leader_model = h2o.load_model(leader_model_path)
else:
    print("Training and saving imbalanced model...")
    aml_imb = H2OAutoML(
        max_runtime_secs=3600, max_models=20, seed=RANDOM_STATE, project_name=f"imbalanced_for_generation", sort_metric="AUCPR"
    )
    aml_imb.train(x=predictors, y=response, training_frame=train_i, validation_frame=valid_i)
    imbalanced_leader_model = aml_imb.leader
    h2o.save_model(model=imbalanced_leader_model, path=MODELS_DIR, filename="imbalanced_leader.model", force=True)

print("Imbalanced Leader Model for Generation:")
print(imbalanced_leader_model.model_id)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Temurin-17.0.16+8 (build 17.0.16+8, mixed mode, sharing)
  Starting server from C:\Project\Synthetic Intelligence\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\bharg\AppData\Local\Temp\tmp1yublog8
  JVM stdout: C:\Users\bharg\AppData\Local\Temp\tmp1yublog8\h2o_bharg_started_from_python.out
  JVM stderr: C:\Users\bharg\AppData\Local\Temp\tmp1yublog8\h2o_bharg_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 5 days
H2O_cluster_name:,H2O_from_python_bharg_cd4fgy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,10 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Loading saved imbalanced model...
Imbalanced Leader Model for Generation:
StackedEnsemble_AllModels_1_AutoML_1_20250902_130655


# Generate Synthetic Data

In [8]:
print(f"Strating model driven synthetic data generation for {N_SYNTHETIC_SAMPLES_TO_GENERATE} samples...")
df_train = train_i.as_data_frame()
df_minority = df_train[df_train['target'] == 1].drop('target', axis=1)
df_majority = df_train[df_train['target'] == 0]

numerical_cols = df_minority.select_dtypes(include=np.number).columns
categorical_cols = df_minority.select_dtypes(include=['object', 'category']).columns

synthetic_samples = []
for i in range(MAX_GENERATION_ATTEMPTS):
    if len(synthetic_samples) >= N_SYNTHETIC_SAMPLES_TO_GENERATE:
        print("Successfully generated target number of sampels")
        break

    # 1. Randomly pick two minority samples
    p1_idx, p2_idx = np.random.choice(df_minority.index, 2, replace=False)
    p1, p2 = df_minority.loc[p1_idx], df_minority.loc[p2_idx]

    # 2. Interpolate numerical, randomly select categorical
    new_sample = {}
    interpolation_ratio = np.random.rand()
    new_sample.update((p1[numerical_cols] * interpolation_ratio + p2[numerical_cols] * (1-interpolation_ratio)).to_dict())
    new_sample.update(p1[categorical_cols] if np.random.rand() > 0.5 else p2[categorical_cols])

    # 3. Use the model as a filter
    new_sample_df = pd.DataFrame([new_sample])
    new_sample_hf = h2o.H2OFrame(new_sample_df)

    prediction = imbalanced_leader_model.predict(new_sample_hf)

    if prediction['p1'][0, 0] >= MODEL_CONFIDENCE_THRESHOLD:
        new_sample['target'] = 1
        synthetic_samples.append(new_sample)
    
    if i % 10000 == 0:
        print(f"Attempts: {i}, Generated: {len(synthetic_samples)}/{N_SYNTHETIC_SAMPLES_TO_GENERATE}")
    
if len(synthetic_samples) < N_SYNTHETIC_SAMPLES_TO_GENERATE:
    print(f"Warning: Generation finished early. Generated {len(synthetic_samples)} samples.")

# Create and save the new balanced dataset
df_synthetic = pd.DataFrame(synthetic_samples)
df_model_driven = pd.concat([df_majority, df_synthetic]).sample(frac=1, random_state=RANDOM_STATE)

print("\nNew model driven dataset class distribution:")
print(df_model_driven['target'].value_counts(normalize=True))

df_model_driven.to_csv(MODEL_DRIVEN_SYNTHETIC_PATH, index=False)
print(f"Saved model driven synthetic dataset to {MODEL_DRIVEN_SYNTHETIC_PATH}")

h2o.cluster().shutdown()

Strating model driven synthetic data generation for 250000 samples...





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Attempts: 0, Generated: 1/250000
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |

H2OJobCancelled: Job<$03017f00000132d4ffffffff$_8cafdaa6223e029686719cd919464212> was cancelled by the user.