In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor
import matplotlib.pyplot as plt
import os

In [2]:
# Start from a clean slate: delete the whole ../models folder so artifacts
# written there by earlier runs cannot be mixed with this run's output.
# ignore_errors=True makes this a no-op when the folder does not exist.
# NOTE: it also silences every other deletion failure (e.g. a locked file),
# so a partially deleted folder will not be reported.
import shutil
shutil.rmtree("../models", ignore_errors=True)

In [3]:
def prepare_data(true_data, counter_files, max_index):
    """Build one labelled training frame per counter-example file.

    Each resulting frame contains the rows of ``true_data`` labelled
    ``"true"`` followed by the rows of one counter-example CSV labelled
    ``"false"``, restricted to the columns ``B1`` .. ``B{max_index}`` plus
    ``label``.

    Args:
        true_data: DataFrame holding the positive samples. It must contain
            the columns ``B1`` .. ``B{max_index}``. It is NOT modified.
        counter_files: Mapping of variant name -> path of a CSV file with the
            negative samples (same ``B`` columns required).
        max_index: Highest ``B`` column index to keep (inclusive).

    Returns:
        dict mapping each key of ``counter_files`` to the combined DataFrame
        (fresh 0..n-1 index, columns ``B1..B{max_index}`` + ``label``).

    Raises:
        KeyError: if a required ``B`` column is missing from ``true_data`` or
            from one of the counter-example files.
    """
    # Column names to keep.
    cols = [f"B{i}" for i in range(1, max_index + 1)]

    # Select first, then label with assign(): this produces a new frame, so
    # the caller's DataFrame never gains a 'label' column as a side effect
    # (the previous in-place assignment leaked into the notebook's globals).
    true_labeled = true_data[cols].assign(label="true")

    combined_data = {}
    for key, path in counter_files.items():
        # Load the counter examples, keep only the B columns, label as false.
        counter_labeled = pd.read_csv(path)[cols].assign(label="false")
        # True rows first, counter rows after, with a clean running index.
        combined_data[key] = pd.concat(
            [true_labeled, counter_labeled], ignore_index=True
        )

    return combined_data

In [4]:
# AutoGluon presets passed to every fit() call in generateModels.
presets = ['good_quality', 'optimize_for_deployment']

def generateModels(combined_data, prefix, max_index, time_limit=180):
    """Train and evaluate one AutoGluon predictor per dataset variant.

    For every entry of ``combined_data`` the frame is split 70/30 into train
    and test rows, a ``TabularPredictor`` (label column ``"label"``, metric
    ``f1``) is fitted on the train rows and its leaderboard is computed on
    the test rows.

    Side effects:
        * models are saved under ``../models/{prefix}/{key}``
        * each leaderboard is written to
          ``../models/{prefix}/results/{key}_leaderboard.csv``
        * each leaderboard is printed

    Args:
        combined_data: Mapping of variant name -> DataFrame containing the
            columns ``B1`` .. ``B{max_index}`` and ``label``.
        prefix: Sub-folder of ``../models`` used for this group of variants.
        max_index: Highest ``B`` column index used as a feature (inclusive).
        time_limit: Value forwarded to ``TabularPredictor.fit(time_limit=...)``
            for each variant.

    Returns:
        dict mapping variant name -> leaderboard DataFrame. (Previously the
        function returned ``None``; callers that ignore the return value are
        unaffected.)
    """
    # Feature columns plus the target, in the order used for training.
    feature_cols = [f"B{i}" for i in range(1, max_index + 1)]
    model_cols = feature_cols + ["label"]

    # Create the results folder up front so each leaderboard can be persisted
    # as soon as it exists; a failure in a later (long) fit then no longer
    # discards the leaderboards of the variants that already finished.
    results_folder = f"../models/{prefix}/results"
    os.makedirs(results_folder, exist_ok=True)

    results = {}
    for key, df in combined_data.items():
        # Train-test split: 70% train, 30% test. Splitting the frame directly
        # selects the same rows as splitting X and y separately with the same
        # random_state, without re-joining on the index afterwards (which
        # would misbehave if the index contained duplicates).
        train_data, test_data = train_test_split(
            df[model_cols], test_size=0.3, random_state=42
        )

        # Output folder for the models of this dataset variant.
        output_folder = f"../models/{prefix}/{key}"
        os.makedirs(output_folder, exist_ok=True)

        # Train the model using AutoGluon.
        predictor = TabularPredictor(
            label="label", path=output_folder, eval_metric='f1'
        ).fit(train_data, presets=presets, time_limit=time_limit)

        # Evaluate on the held-out test rows and persist immediately.
        leaderboard = predictor.leaderboard(test_data, silent=True)
        results[key] = leaderboard
        leaderboard.to_csv(
            os.path.join(results_folder, f"{key}_leaderboard.csv"), index=False
        )

        print(f"Results for {key}:")
        print(leaderboard)

    return results

In [7]:
# True (positive) samples for the EI zone.
true_file = "../data/ei/data_ei.csv"
ei_true_data = pd.read_csv(true_file)

# Counter-example files for the EI zone, keyed by variant name. The key is
# used by generateModels for the model folder and the leaderboard file name.
counter_files = {
    "ei_random": "../data/ei/data_ei_random.csv",
    "ei_ez_counter": "../data/ei/data_ez_counter_example.csv",
    "ei_ie_counter": "../data/ei/data_ie_counter_example.csv",
    "ei_ze_counter": "../data/ei/data_ze_counter_example.csv",
    "sample_combined_counter": "../data/ei/data_sample_combined.csv"
}

# For EI zone, our CSV files have columns B1 to B12.
combined_data = prepare_data(ei_true_data, counter_files, max_index=12)

# Train and evaluate one predictor per variant (time_limit=900 is passed to
# each fit); models and leaderboards are written under ../models/ei.
generateModels(combined_data, 'ei', 12, time_limit=900)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       1.85 GB / 15.35 GB (12.1%)
Disk Space Avail:   552.81 GB / 731.50 GB (75.6%)
Presets specified: ['good_quality', 'optimize_for_deployment']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacki

Results for ei_ze_counter:
                          model  score_test score_val eval_metric  \
0      WeightedEnsemble_L2_FULL    0.986809      None          f1   
1    ExtraTreesEntr_BAG_L1_FULL    0.986763      None          f1   
2    ExtraTreesGini_BAG_L1_FULL    0.986548      None          f1   
3  RandomForestGini_BAG_L1_FULL    0.986426      None          f1   
4  RandomForestEntr_BAG_L1_FULL    0.985934      None          f1   
5           XGBoost_BAG_L1_FULL    0.983768      None          f1   
6          LightGBM_BAG_L1_FULL    0.981441      None          f1   
7          CatBoost_BAG_L1_FULL    0.978723      None          f1   

   pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  \
0        1.266767            NaN  202.186594                 0.017528   
1        0.224659       0.681959    1.163824                 0.224659   
2        0.233698       0.784976    1.801519                 0.233698   
3        0.210040       0.641834    1.152670               

Leaderboard on holdout data (DyStack):
                           model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2_FULL       0.980705   0.980182          f1        0.592708            NaN   6.544712                 0.004333                     NaN           1.382270            2       True          6
1     ExtraTreesGini_BAG_L2_FULL       0.980599   0.981131          f1        0.868075            NaN  11.250058                 0.148804                0.755926           1.652681            2       True         12
2     ExtraTreesEntr_BAG_L2_FULL       0.980591   0.981288          f1        0.858309            NaN  10.679852                 0.139038                0.694881           1.082475            2       True         13
3         LightGBMXT_BAG_L2_FULL       0.980400   0.981046          f1        0.734928           

Results for sample_combined_counter:
                          model  score_test score_val eval_metric  \
0      WeightedEnsemble_L2_FULL    0.984326      None          f1   
1  RandomForestGini_BAG_L1_FULL    0.983899      None          f1   
2    ExtraTreesGini_BAG_L1_FULL    0.983835      None          f1   
3  RandomForestEntr_BAG_L1_FULL    0.983624      None          f1   
4          LightGBM_BAG_L1_FULL    0.981142      None          f1   

   pred_time_test  pred_time_val  fit_time  pred_time_test_marginal  \
0        1.085394            NaN  9.033000                 0.016394   
1        0.219027       0.701526  1.214624                 0.219027   
2        0.253825       0.916529  1.700583                 0.253825   
3        0.233492       0.790707  1.042015                 0.233492   
4        0.362656            NaN  2.683753                 0.362656   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                     NaN           2.392025     

In [None]:
# True (positive) samples for the IE zone.
true_file = "../data/ie/data_ie.csv"
ie_true_data = pd.read_csv(true_file)

# Counter-example files for the IE zone, keyed by variant name. The key is
# used by generateModels for the model folder and the leaderboard file name.
counter_files = {
    "ie_random": "../data/ie/data_ie_random.csv",
    "ie_ez_counter": "../data/ie/data_ez_counter_example.csv",
    "ie_ei_true_counter": "../data/ie/data_ei_true_counter_example.csv",
    "ie_ei_counter": "../data/ie/data_ei_counter_example.csv",
    "ie_ze_counter": "../data/ie/data_ze_counter_example.csv",
    "sample_combined_counter": "../data/ie/data_sample_combined.csv"
}

# For IE zone, our CSV files have columns B1 to B105.
combined_data = prepare_data(ie_true_data, counter_files, max_index=105)

# Train and evaluate one predictor per variant (time_limit=900 is passed to
# each fit); models and leaderboards are written under ../models/ie.
generateModels(combined_data, 'ie', 105, time_limit=900)

In [None]:
# True (positive) samples for the EZ zone.
true_file = "../data/ez/data_ez.csv"
ez_true_data = pd.read_csv(true_file)

# Counter-example files for the EZ zone, keyed by variant name. The key is
# used by generateModels for the model folder and the leaderboard file name.
counter_files = {
    "ez_random": "../data/ez/data_ez_random.csv",
    "ez_ei_counter": "../data/ez/data_ei_counter_example.csv",
    "ez_ie_counter": "../data/ez/data_ie_counter_example.csv",
    "ez_ze_counter": "../data/ez/data_ze_counter_example.csv",
    "sample_combined_counter": "../data/ez/data_sample_combined.csv"
}

# For EZ zone, our CSV files have columns B1 to B550.
combined_data = prepare_data(ez_true_data, counter_files, max_index=550)

# Train and evaluate one predictor per variant (time_limit=900 is passed to
# each fit); models and leaderboards are written under ../models/ez.
generateModels(combined_data, 'ez', 550, time_limit=900)

In [None]:
# True (positive) samples for the ZE zone.
true_file = "../data/ze/data_ze.csv"
ze_true_data = pd.read_csv(true_file)

# Counter-example files for the ZE zone, keyed by variant name. The key is
# used by generateModels for the model folder and the leaderboard file name.
counter_files = {
    "ze_random": "../data/ze/data_ze_random.csv",
    "ze_ez_counter": "../data/ze/data_ez_counter_example.csv",
    "ze_ie_counter": "../data/ze/data_ie_counter_example.csv",
    "ze_ei_counter": "../data/ze/data_ei_counter_example.csv",
    "sample_combined_counter": "../data/ze/data_sample_combined.csv"
}

# For ZE zone, our CSV files have columns B1 to B550.
combined_data = prepare_data(ze_true_data, counter_files, max_index=550)

# Train and evaluate one predictor per variant (time_limit=900 is passed to
# each fit); models and leaderboards are written under ../models/ze.
generateModels(combined_data, 'ze', 550, time_limit=900)