In [1]:
# !pip install mljar-supervised

In [32]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from catboost import CatBoostClassifier
from scipy.stats import randint, uniform
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import (
    RFE,
    RFECV,
    SelectKBest,
    VarianceThreshold,
    f_classif,
)
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, mutual_info_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML
from xgboost import XGBClassifier

# from supervised.automl import AutoML  # mljar-supervised

In [33]:
# Constants

# Reproducibility / parallelism knobs shared by the cells below.
SEED = 42
N_JOBS = -1
RANDOM_SEARCH_N_ITER = 1

# Training budgets handed to AutoGluon (`time_limit`) and MLJAR
# (`total_time_limit`).
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 1 * 1
TRAIN_TIME_LIMIT_MLJAR = 60 * 1 * 1

# Output roots; every run writes into a sub-directory named after UNIQUE_ID.
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")

# Switches for the optional feature-filtering cells.
APPLY_REMOVE_LOW_VARIANCE_FEATURES = False
APPLY_REMOVE_CORRELATED_FEATURES = False
APPLY_REMOVE_RANDOM_FEATURES = False
APPLY_ANOVA = False

# Number of features requested from the ANOVA filter.
ANOVA_FEATURES = 25
# Backward-compatible alias for the original (misspelled) name, which the
# ANOVA cell further down still references.
ANOVE_FEATURES = ANOVA_FEATURES

In [34]:
# Prepare the per-run output directory for each modelling approach.
for output_dir in [OUTPUT_DIR_MANUAL, OUTPUT_DIR_AUTOGLUON, OUTPUT_DIR_MLJAR]:
    run_dir = path.join(output_dir, UNIQUE_ID)
    if not path.exists(run_dir):
        print(f"Creating output directory {run_dir}")
        # exist_ok=True closes the gap between the existence check and the
        # creation (e.g. another process creating the same directory).
        os.makedirs(run_dir, exist_ok=True)

Creating output directory output\manual\20240114_144651
Creating output directory output\autogluon\20240114_144651
Creating output directory output\mljar\20240114_144651
Creating output directory output\auto_sklearn\20240114_144651


In [5]:
def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    """Drop features that are strongly correlated with an earlier feature.

    The Pearson correlation matrix is computed on ``train_x`` only. For every
    pair of columns whose *absolute* correlation exceeds ``threshold`` the
    later column (higher index) is removed. The same column indices are then
    removed from ``valid_x`` and ``test_x``.

    Parameters
    ----------
    train_x, valid_x, test_x : 2-D arrays with the same number of columns.
    threshold : float, absolute correlation above which a column is dropped.

    Returns
    -------
    tuple of the three arrays with the correlated columns deleted.
    """
    # atleast_2d: np.corrcoef collapses to a scalar for a single feature.
    corr_matrix = np.atleast_2d(np.corrcoef(train_x, rowvar=False))
    # Use |r| so that strong negative correlations count as redundant too;
    # keep only the strict upper triangle so each pair is inspected once.
    upper = np.triu(np.abs(corr_matrix), k=1)
    # Column i is dropped when any earlier column correlates with it too
    # strongly. NaN entries (constant columns) compare False and are kept.
    to_drop = np.flatnonzero((upper > threshold).any(axis=0))

    # Apply the identical column selection to all three splits.
    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x

In [6]:
# Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    """Keep only the columns whose training-set variance passes ``threshold``.

    A ``VarianceThreshold`` selector is fitted on ``train_x``; the column
    indices it retains are then applied to all three arrays.

    Returns the filtered ``(train_x, valid_x, test_x)``.
    """
    selector = VarianceThreshold(threshold=threshold).fit(train_x)
    # Resolve the surviving column indices once and reuse them for each split.
    kept_columns = selector.get_support(indices=True)
    return train_x[:, kept_columns], valid_x[:, kept_columns], test_x[:, kept_columns]

In [7]:
# Remove Random Columns
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    """Drop columns that a single decision tree considers unimportant.

    A ``DecisionTreeClassifier`` (fixed ``random_state=0``) is fitted on the
    training data. Columns whose feature importance is not strictly greater
    than ``importance`` are treated as noise and removed from every split.

    Returns the filtered ``(train_x, valid_x, test_x)``.
    """
    probe_tree: DecisionTreeClassifier = DecisionTreeClassifier(random_state=0)
    probe_tree.fit(train_x, train_y)

    # Indices of the columns that clear the importance cut-off; the cut-off is
    # a heuristic and can be tuned.
    keep = np.flatnonzero(probe_tree.feature_importances_ > importance)
    return train_x[:, keep], valid_x[:, keep], test_x[:, keep]

In [8]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    """Keep the ``k`` features with the highest ANOVA F-score.

    ``SelectKBest(f_classif)`` is fitted on the training data and the same
    selection is applied to the validation and test arrays.

    Parameters
    ----------
    train_x, valid_x, test_x : 2-D arrays with the same number of columns.
    train_y : labels for ``train_x``; a column vector is flattened first.
    k : number of features to keep. Values larger than the number of
        available columns are clamped, so all columns are kept in that case.

    Returns
    -------
    tuple ``(train_x, valid_x, test_x)`` restricted to the selected columns.
    """
    # Asking for more features than exist is rejected (or warned about,
    # depending on the scikit-learn version) by SelectKBest. This happens in
    # this notebook when earlier selection leaves fewer columns than requested.
    if k != "all":
        k = min(k, train_x.shape[1])

    selector = SelectKBest(f_classif, k=k)
    # f_classif expects 1-D labels; ravel avoids a column-vector warning.
    selector.fit(train_x, np.ravel(train_y))

    train_x = selector.transform(train_x)
    valid_x = selector.transform(valid_x)
    test_x = selector.transform(test_x)
    return train_x, valid_x, test_x

In [9]:
def dump_proba(model, test_x, output_path_proba):
    """Write the second-column probabilities of ``model`` for ``test_x`` to a file.

    ``model.predict_proba(test_x)`` is called; a DataFrame result is unwrapped
    to its underlying array. Column 1 is written one value per line, preceded
    by a fixed quoted header line.
    """
    scores = model.predict_proba(test_x)
    # Some predictors hand back a DataFrame instead of an ndarray.
    scores = scores.values if isinstance(scores, pd.DataFrame) else scores
    second_column = scores[:, 1]

    np.savetxt(
        output_path_proba,
        second_column,
        header='"313201_313212"',
        comments="",  # write the header verbatim, without a leading "# "
        delimiter="\n",
    )

In [10]:
def dump_model(model, output_path_model):
    """Persist ``model`` to ``output_path_model`` using joblib."""
    joblib.dump(value=model, filename=output_path_model)

In [11]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV


def _keep_supported_columns(selector, *matrices):
    """Return every matrix restricted to the columns in ``selector.support_``."""
    mask = selector.support_
    return tuple(matrix[:, mask] for matrix in matrices)


def perform_feature_selection(
    train_x,
    train_y,
    valid_x,
    test_x,
    n_features_to_select=None,
    rfe_targets=(250, 125),
    rfecv_min_features=(25, 15),
    cv=3,
    random_state=None,
):
    """Shrink the feature set with four chained recursive-elimination stages.

    Stages, each fitted on the (progressively reduced) training data:

    1. ``RFE`` with extra trees, down to ``rfe_targets[0]`` features
    2. ``RFE`` with a random forest, down to ``rfe_targets[1]`` features
    3. ``RFECV`` with extra trees, at least ``rfecv_min_features[0]`` features
    4. ``RFECV`` with a random forest, at least ``rfecv_min_features[1]`` features

    After every stage the same column mask is applied to ``valid_x`` and
    ``test_x`` and the three shapes are printed.

    Parameters
    ----------
    train_x, valid_x, test_x : 2-D arrays with identical column layout.
    train_y : 1-D labels for ``train_x``.
    n_features_to_select : accepted for backward compatibility only; it is
        not used (it never was). Use ``rfe_targets`` / ``rfecv_min_features``.
    rfe_targets : feature counts for the two ``RFE`` stages.
    rfecv_min_features : lower bounds for the two ``RFECV`` stages.
    cv : number of cross-validation folds used by the ``RFECV`` stages.
    random_state : seed for the tree ensembles; ``None`` falls back to the
        notebook-wide ``SEED`` so that repeated runs select the same features.

    Returns
    -------
    tuple ``(train_x, valid_x, test_x)`` restricted to the surviving columns.
    """
    # Resolved at call time so the function definition itself does not depend
    # on the constants cell having been executed.
    seed = SEED if random_state is None else random_state

    # The selectors clone their estimator, so one instance per model family
    # can safely be shared between the RFE and RFECV stages.
    estimator_et = ExtraTreesClassifier(random_state=seed)
    estimator_rf = RandomForestClassifier(random_state=seed)

    rfe_et_target, rfe_rf_target = rfe_targets
    rfecv_et_min, rfecv_rf_min = rfecv_min_features

    stages = [
        RFE(estimator=estimator_et, n_features_to_select=rfe_et_target),
        RFE(estimator=estimator_rf, n_features_to_select=rfe_rf_target),
        RFECV(estimator=estimator_et, cv=cv, min_features_to_select=rfecv_et_min),
        RFECV(estimator=estimator_rf, cv=cv, min_features_to_select=rfecv_rf_min),
    ]

    for selector in stages:
        selector.fit(train_x, train_y)
        train_x, valid_x, test_x = _keep_supported_columns(
            selector, train_x, valid_x, test_x
        )
        print(train_x.shape, valid_x.shape, test_x.shape)

    return train_x, valid_x, test_x

In [12]:
prefix = ""


def _read_feature_table(file_name):
    """Read a space-separated, header-less feature file and drop column 500."""
    frame = pd.read_table(prefix + file_name, sep=" ", header=None)
    # NOTE(review): column 500 is presumably the empty column created by a
    # trailing separator on every line; confirm against the data files.
    return frame.drop(columns=frame.columns[500])


_test_x = _read_feature_table("artificial_test.data")
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)
_train_x = _read_feature_table("artificial_train.data")

In [13]:
# Convert the loaded tables into independent float ndarrays (same names).
_test_x, _train_x, _train_y = (
    np.array(table, dtype=float, copy=True)
    for table in (_test_x, _train_x, _train_y)
)

In [14]:
# Shuffle features and labels together. Use the notebook-wide SEED (same value
# as the previously hard-coded 42) so there is a single place to change it.
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=SEED)

In [15]:
def get_train_and_validation_data(split=400):
    """Split the shuffled training arrays into training and validation parts.

    The first ``split`` rows of the module-level ``_train_x`` / ``_train_y``
    become the validation set and the remaining rows the training set. All
    four returned arrays are copies.

    Parameters
    ----------
    split : int, number of leading rows reserved for validation (default 400).

    Returns
    -------
    tuple ``(train_x, train_y, valid_x, valid_y)``.

    Raises
    ------
    ValueError
        If ``split`` would leave either part empty.
    """
    if not 0 < split < len(_train_x):
        raise ValueError(
            f"split must be between 1 and {len(_train_x) - 1}, got {split}"
        )
    train_x, valid_x = _train_x[split:].copy(), _train_x[:split].copy()
    train_y, valid_y = _train_y[split:].copy(), _train_y[:split].copy()
    return train_x, train_y, valid_x, valid_y

In [16]:
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)
# The splits are NumPy arrays, which have no `.head`; slice the first rows
# instead of raising AttributeError.
print(train_x[:5])
print(train_y[:5])

(1600, 500) (1600, 1) (400, 500) (400, 1)


In [17]:
# Run the chained RFE / RFECV selection; labels are passed as a flat copy.
# NOTE(review): this cell overwrites train_x / valid_x but always starts from
# the raw _test_x, so it is only safe to run once per fresh split.
train_x, valid_x, test_x = perform_feature_selection(
    train_x=train_x,
    train_y=np.ravel(train_y).copy(),
    valid_x=valid_x,
    test_x=_test_x,
)
print(train_x.shape, valid_x.shape, test_x.shape)

(1600, 250) (400, 250) (600, 250)
(1600, 125) (400, 125) (600, 125)
(1600, 25) (400, 25) (600, 25)
(1600, 20) (400, 20) (600, 20)
(1600, 20) (400, 20) (600, 20)


In [19]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    # Pass the already feature-selected `test_x`, not the raw `_test_x`:
    # the dropped column indices are computed on `train_x`, so all three
    # arrays must share the same column layout.
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [20]:
# Optional step: drop low-variance columns, then report the new shapes.
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    train_x, valid_x, test_x = remove_low_variance_features(
        train_x, valid_x, test_x
    )
    for _label, _array in (
        ("train_x", train_x),
        ("valid_x", valid_x),
        ("test_x", test_x),
    ):
        print(f"{_label}.shape: ", _array.shape)

In [21]:
# Optional step: drop columns a decision tree deems unimportant.
if APPLY_REMOVE_RANDOM_FEATURES:
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
    )
    for _label, _array in (
        ("train_x", train_x),
        ("valid_x", valid_x),
        ("test_x", test_x),
    ):
        print(f"{_label}.shape: ", _array.shape)

In [22]:
# Optional step: keep the ANOVE_FEATURES best columns by ANOVA F-score.
if APPLY_ANOVA:
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    for _label, _array in (
        ("train_x", train_x),
        ("valid_x", valid_x),
        ("test_x", test_x),
    ):
        print(f"{_label}.shape: ", _array.shape)

In [23]:
# Final shapes of every split after feature selection.
for _label, _array in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("valid_x", valid_x),
    ("valid_y", valid_y),
    ("test_x", test_x),
):
    print(f"{_label}.shape: ", _array.shape)

train_x.shape:  (1600, 20)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 20)
valid_y.shape:  (400, 1)
test_x.shape:  (600, 20)


In [24]:
# Sanity check: print "ok" once for every split that is a NumPy array.
for _array in (train_x, train_y, valid_x, valid_y, test_x):
    if isinstance(_array, np.ndarray):
        print("ok")

ok
ok
ok
ok
ok


### Manual model

In [29]:
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import RandomizedSearchCV

def _scaled(estimator):
    """Wrap ``estimator`` in a pipeline that standardises the features first."""
    return make_pipeline(StandardScaler(), estimator)


def _mlp_pipeline():
    """Scaled multi-layer perceptron used in both stacks and the committee."""
    # NOTE(review): scikit-learn documents `early_stopping` as effective only
    # for the sgd/adam solvers; confirm whether it is intended with lbfgs.
    return _scaled(
        MLPClassifier(
            random_state=SEED,
            max_iter=1000,
            early_stopping=True,
            tol=1e-3,
            solver="lbfgs",
            hidden_layer_sizes=(100, 300, 200, 100),
            alpha=0.001,
        )
    )


def _rf_pipeline():
    """Scaled random forest."""
    return _scaled(
        RandomForestClassifier(
            random_state=SEED,
            n_estimators=500,
            max_depth=30,
            min_samples_leaf=4,
            min_samples_split=2,
        )
    )


def _gbc_pipeline():
    """Scaled gradient-boosting classifier."""
    return _scaled(
        GradientBoostingClassifier(
            random_state=SEED,
            max_features=None,
            n_estimators=500,
            max_depth=30,
            min_samples_leaf=2,
            min_samples_split=5,
        )
    )


def _et_pipeline():
    """Scaled extra-trees classifier."""
    return _scaled(
        ExtraTreesClassifier(
            random_state=SEED,
            n_estimators=500,
            max_depth=30,
            min_samples_leaf=4,
            min_samples_split=2,
        )
    )


def _catboost_pipeline():
    """Scaled CatBoost classifier."""
    return _scaled(
        CatBoostClassifier(
            iterations=500,
            learning_rate=0.03,
            depth=6,
            l2_leaf_reg=3,
            border_count=32,
            cat_features=None,
            loss_function="Logloss",
            eval_metric="Accuracy",
            random_seed=SEED,
            early_stopping_rounds=50,
            verbose=100,
        )
    )


def _xgb_pipeline():
    """Scaled XGBoost classifier."""
    return _scaled(
        XGBClassifier(
            random_state=SEED,
            use_label_encoder=False,
            eval_metric=balanced_accuracy_score,
            n_estimators=500,
            learning_rate=0.02,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=0,
            reg_alpha=0.1,
            reg_lambda=1.0,
            scale_pos_weight=1,
        )
    )


# Two stacked ensembles: MLP + random forest, and MLP + gradient boosting,
# each blended by a logistic regression over 5-fold predictions.
base_classifiers_1 = [("mlp", _mlp_pipeline()), ("rf", _rf_pipeline())]
base_classifiers_2 = [("mlp", _mlp_pipeline()), ("gbc", _gbc_pipeline())]

stacked_ensamble_1 = StackingClassifier(
    estimators=base_classifiers_1, final_estimator=LogisticRegression(), cv=5
)
stacked_ensamble_2 = StackingClassifier(
    estimators=base_classifiers_2, final_estimator=LogisticRegression(), cv=5
)

# Committee members: both stacks plus each individual model family.
committee_models = [
    ("stacked_ensemble_1", stacked_ensamble_1),
    ("stacked_ensemble_2", stacked_ensamble_2),
    ("gbc", _gbc_pipeline()),
    ("et", _et_pipeline()),
    ("rf", _rf_pipeline()),
    ("mlp", _mlp_pipeline()),
    ("cb", _catboost_pipeline()),
    ("xgb", _xgb_pipeline()),
]

# Soft voting averages the members' predicted probabilities.
committee_model = VotingClassifier(committee_models, voting="soft")
committee_model.fit(train_x.copy(), train_y.copy().ravel())

0:	learn: 0.7775000	total: 122ms	remaining: 42.6s
100:	learn: 0.9162500	total: 294ms	remaining: 724ms
200:	learn: 0.9418750	total: 475ms	remaining: 352ms
300:	learn: 0.9631250	total: 648ms	remaining: 105ms
349:	learn: 0.9706250	total: 728ms	remaining: 0us


In [30]:
# Score the committee on the held-out validation split using balanced accuracy.
y_pred = committee_model.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")

Model Balanced Accuracy: 0.9000525118151584


In [35]:
# Persist the committee's test-set probabilities and the fitted model itself
# inside this run's "manual" output directory.
manual_run_dir = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID)
output_path_proba = path.join(manual_run_dir, "manual_model_proba.txt")
output_path_model = path.join(manual_run_dir, "manual_model.pkl")

dump_proba(committee_model, test_x, output_path_proba)
dump_model(committee_model, output_path_model)

### AutoGluon

In [36]:
# Build the DataFrames AutoGluon expects: the feature columns followed by a
# final label column named "class".
# The labels are reshaped to a column explicitly so this cell also works after
# the MLJAR cell has replaced `train_y` with a flat 1-D array (re-runs).
train_data = np.concatenate((train_x, np.reshape(train_y, (-1, 1))), axis=1)
train_data_pd = pd.DataFrame(train_data, copy=True)
train_data_pd = train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"})

valid_data = np.concatenate((valid_x, np.reshape(valid_y, (-1, 1))), axis=1)
valid_data_pd = pd.DataFrame(data=valid_data, copy=True)
valid_data_pd = valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"})

print(train_data_pd.shape, valid_data_pd.shape)

(1600, 21) (400, 21)


In [37]:
# AutoGluon writes its models below this run's "autogluon" output directory.
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)

# Predictor definition: binary task on the "class" column, scored by
# balanced accuracy.
predictor_settings = dict(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
)

# Training configuration: bagging / stacking layout, time budget and
# hyper-parameter search options.
fit_settings = dict(
    train_data=train_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
    fit_weighted_ensemble=True,
    fit_full_last_level_weighted_ensemble=True,
    full_weighted_ensemble_additionally=True,
    num_bag_folds=15,
    num_bag_sets=25,
    num_stack_levels=3,
    auto_stack=True,
    dynamic_stacking=True,
    feature_generator="auto",
    hyperparameter_tune_kwargs={
        "scheduler": "local",
        "searcher": "auto",
        "time_out": 1200,
        "num_trials": 30,
    },
)

predictor = TabularPredictor(**predictor_settings).fit(**fit_settings)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=15, num_bag_sets=25
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 60 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: output\autogluon\20240114_144651/ds_sub_fit/sub_fit_ho.
2024-01-14 14:47:04,082	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Beginning AutoGluon training ... Time limit = 15s
AutoGluon will save models to "output\autogluon\20240114_144651/ds_sub_fit/su

In [38]:
# Display the table of models AutoGluon trained, with their validation scores.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMLarge_BAG_L3,0.871871,balanced_accuracy,0.104815,23.99478,0.041661,8.735041,3,True,5
1,WeightedEnsemble_L4,0.871871,balanced_accuracy,0.111595,24.011844,0.006779,0.017064,4,True,6
2,WeightedEnsemble_ALL_L5,0.871871,balanced_accuracy,0.121702,24.541931,0.016887,0.54715,5,True,8
3,LightGBMLarge_BAG_L2,0.871838,balanced_accuracy,0.063155,15.259739,0.047872,8.527765,2,True,3
4,WeightedEnsemble_L3,0.871838,balanced_accuracy,0.066155,15.278766,0.003,0.019027,3,True,4
5,LightGBMLarge_BAG_L4,0.868706,balanced_accuracy,0.124007,31.039169,0.019192,7.044388,4,True,7
6,WeightedEnsemble_L5,0.868706,balanced_accuracy,0.138849,31.039169,0.014841,0.0,5,True,9
7,LightGBMLarge_BAG_L1,0.85872,balanced_accuracy,0.015282,6.731974,0.015282,6.731974,1,True,1
8,WeightedEnsemble_L2,0.85872,balanced_accuracy,0.021303,6.739744,0.00602,0.00777,2,True,2


In [39]:
# Score the fitted predictor on the held-out validation frame.
predictor.evaluate(valid_data_pd)

{'balanced_accuracy': 0.8404140931709635,
 'accuracy': 0.84,
 'mcc': 0.681441898430591,
 'roc_auc': 0.9231076992323273,
 'f1': 0.8423645320197045,
 'precision': 0.8181818181818182,
 'recall': 0.868020304568528}

In [31]:
# Write AutoGluon's test-set probabilities next to its saved models.
output_path_proba = path.join(
    OUTPUT_DIR_AUTOGLUON,
    UNIQUE_ID,
    "autogluon_model_proba.txt",
)
# The predictor was trained on a DataFrame, so the test matrix is wrapped in
# one as well (default integer column labels, like the training features).
test_frame = pd.DataFrame(test_x)
dump_proba(predictor, test_frame, output_path_proba)

### MLJAR

In [32]:
# MLJAR AutoML in "Compete" mode, optimising F1 within the configured budget.
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "tmp"),
)
# Flatten the labels into a *new* name instead of overwriting `train_y`:
# earlier cells (e.g. the AutoGluon frame construction) rely on the original
# column-vector shape and would break when re-run.
train_y_flat = train_y.copy().reshape(-1)
print(train_y_flat)
automl.fit(train_x.copy(), train_y_flat)

[ 1.  1.  1. ...  1. -1. -1.]
AutoML directory: output\mljar\20240114_114214\tmp
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.742515 trained in 3.76 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree f1 0.743881 trained in 6.52 seconds
2_DecisionTree f1 0.660644 trained in 6.99 seco



6_Default_Xgboost f1 0.815871 trained in 25.23 seconds
7_Default_CatBoost f1 0.836836 trained in 16.83 seconds
8_Default_NeuralNetwork f1 0.675444 trained in 13.05 seconds




9_Default_RandomForest f1 0.78765 trained in 23.48 seconds
10_Default_ExtraTrees f1 0.743227 trained in 16.59 seconds
11_Default_NearestNeighbors f1 0.639112 trained in 9.4 seconds
* Step not_so_random will try to check up to 61 models
21_LightGBM f1 0.815142 trained in 18.61 seconds




12_Xgboost f1 0.794984 trained in 23.32 seconds
30_CatBoost f1 0.841133 trained in 20.51 seconds
39_RandomForest f1 0.811321 trained in 28.48 seconds
48_ExtraTrees f1 0.783558 trained in 20.52 seconds
57_NeuralNetwork f1 0.559256 trained in 14.77 seconds




66_NearestNeighbors f1 0.652527 trained in 10.59 seconds
22_LightGBM f1 0.823965 trained in 18.99 seconds




13_Xgboost f1 0.714286 trained in 27.42 seconds
31_CatBoost f1 0.838471 trained in 19.26 seconds
40_RandomForest f1 0.753433 trained in 23.1 seconds
49_ExtraTrees f1 0.712846 trained in 19.3 seconds
58_NeuralNetwork f1 0.708504 trained in 18.35 seconds




67_NearestNeighbors f1 0.620647 trained in 11.26 seconds
23_LightGBM f1 0.838988 trained in 20.68 seconds




14_Xgboost f1 0.783387 trained in 31.94 seconds
32_CatBoost f1 0.849148 trained in 44.36 seconds
41_RandomForest f1 0.745993 trained in 26.77 seconds
50_ExtraTrees f1 0.710237 trained in 19.53 seconds
59_NeuralNetwork f1 0.681959 trained in 16.36 seconds




68_NearestNeighbors f1 0.620647 trained in 12.24 seconds
24_LightGBM f1 0.836431 trained in 30.66 seconds




15_Xgboost f1 0.757196 trained in 32.89 seconds
33_CatBoost f1 0.814452 trained in 16.11 seconds
42_RandomForest f1 0.774436 trained in 26.26 seconds
51_ExtraTrees f1 0.721559 trained in 24.57 seconds
60_NeuralNetwork f1 0.678727 trained in 16.72 seconds




69_NearestNeighbors f1 0.620647 trained in 13.17 seconds
25_LightGBM f1 0.827329 trained in 24.17 seconds




16_Xgboost f1 0.655194 trained in 33.88 seconds
34_CatBoost f1 0.828589 trained in 19.94 seconds
43_RandomForest f1 0.741347 trained in 25.22 seconds
52_ExtraTrees f1 0.684178 trained in 28.14 seconds
61_NeuralNetwork f1 0.705308 trained in 19.52 seconds




70_NearestNeighbors f1 0.652527 trained in 15.77 seconds
26_LightGBM f1 0.836341 trained in 26.44 seconds




17_Xgboost f1 0.783375 trained in 37.11 seconds
35_CatBoost f1 0.819247 trained in 20.33 seconds
44_RandomForest f1 0.802218 trained in 30.31 seconds
53_ExtraTrees f1 0.762572 trained in 28.6 seconds
62_NeuralNetwork f1 0.658278 trained in 25.96 seconds




71_NearestNeighbors f1 0.620647 trained in 16.67 seconds
27_LightGBM f1 0.832718 trained in 28.83 seconds




18_Xgboost f1 0.734036 trained in 35.27 seconds
36_CatBoost f1 0.850277 trained in 27.29 seconds
45_RandomForest f1 0.815366 trained in 36.58 seconds
54_ExtraTrees f1 0.764059 trained in 29.04 seconds
63_NeuralNetwork f1 0.661999 trained in 21.25 seconds




72_NearestNeighbors f1 0.620647 trained in 16.67 seconds
28_LightGBM f1 0.834988 trained in 32.0 seconds




19_Xgboost f1 0.775126 trained in 33.23 seconds
37_CatBoost f1 0.858021 trained in 34.05 seconds
46_RandomForest f1 0.745887 trained in 30.38 seconds
55_ExtraTrees f1 0.678313 trained in 26.68 seconds
64_NeuralNetwork f1 0.658551 trained in 20.94 seconds




29_LightGBM f1 0.838789 trained in 26.2 seconds




20_Xgboost f1 0.750157 trained in 38.77 seconds
38_CatBoost f1 0.824455 trained in 22.71 seconds
47_RandomForest f1 0.774074 trained in 27.57 seconds
56_ExtraTrees f1 0.734644 trained in 25.97 seconds
65_NeuralNetwork f1 0.664935 trained in 21.36 seconds




* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: feature_22_sum_feature_2
Add Golden Feature: feature_11_diff_feature_12
Add Golden Feature: feature_8_multiply_feature_2
Add Golden Feature: feature_21_multiply_feature_3
Add Golden Feature: feature_5_diff_feature_21
Add Golden Feature: feature_17_diff_feature_20
Add Golden Feature: feature_15_ratio_feature_11
Add Golden Feature: feature_11_ratio_feature_15
Add Golden Feature: feature_4_diff_feature_17
Add Golden Feature: feature_11_sum_feature_1
Created 10 Golden Features in 15.2 seconds.
37_CatBoost_GoldenFeatures f1 0.84469 trained in 51.54 seconds
36_CatBoost_GoldenFeatures f1 0.837787 trained in 30.92 seconds
32_CatBoost_GoldenFeatures f1 0.83839 trained in 61.59 seconds
* Step kmeans_features will try to check up to 3 models




37_CatBoost_KMeansFeatures f1 0.833542 trained in 44.19 seconds




36_CatBoost_KMeansFeatures f1 0.809435 trained in 37.11 seconds




32_CatBoost_KMeansFeatures f1 0.815546 trained in 85.24 seconds
* Step insert_random_feature will try to check up to 1 model
37_CatBoost_RandomFeature f1 0.848225 trained in 47.64 seconds
Drop features ['feature_25', 'feature_5', 'feature_10', 'feature_12', 'feature_13', 'feature_7', 'feature_17', 'feature_20', 'feature_21', 'random_feature', 'feature_24', 'feature_18', 'feature_8']
* Step features_selection will try to check up to 6 models
37_CatBoost_SelectedFeatures f1 0.871007 trained in 28.64 seconds
23_LightGBM_SelectedFeatures f1 0.85625 trained in 28.9 seconds




6_Default_Xgboost_SelectedFeatures f1 0.827671 trained in 29.82 seconds
45_RandomForest_SelectedFeatures f1 0.813559 trained in 32.02 seconds
48_ExtraTrees_SelectedFeatures f1 0.789306 trained in 26.51 seconds
58_NeuralNetwork_SelectedFeatures f1 0.707535 trained in 23.55 seconds




* Step hill_climbing_1 will try to check up to 28 models
73_CatBoost_SelectedFeatures f1 0.865727 trained in 29.24 seconds
74_CatBoost_SelectedFeatures f1 0.874222 trained in 30.58 seconds
75_CatBoost f1 0.844743 trained in 32.25 seconds
76_CatBoost f1 0.848485 trained in 37.86 seconds
77_LightGBM_SelectedFeatures f1 0.85625 trained in 30.1 seconds
78_LightGBM_SelectedFeatures f1 0.85625 trained in 30.24 seconds
79_CatBoost f1 0.843924 trained in 26.68 seconds
80_LightGBM f1 0.838988 trained in 26.92 seconds
81_LightGBM f1 0.838988 trained in 26.98 seconds
82_LightGBM f1 0.838789 trained in 27.23 seconds
83_LightGBM f1 0.838789 trained in 28.38 seconds




84_Xgboost_SelectedFeatures f1 0.839875 trained in 33.74 seconds




85_Xgboost f1 0.82419 trained in 35.24 seconds
86_RandomForest f1 0.80803 trained in 38.83 seconds
87_RandomForest_SelectedFeatures f1 0.820737 trained in 30.29 seconds
88_RandomForest f1 0.814491 trained in 36.12 seconds




89_Xgboost f1 0.786802 trained in 36.68 seconds
90_ExtraTrees_SelectedFeatures f1 0.79156 trained in 29.59 seconds
91_ExtraTrees f1 0.780269 trained in 30.98 seconds
92_ExtraTrees f1 0.765319 trained in 33.46 seconds
93_DecisionTree f1 0.688807 trained in 20.1 seconds
94_NeuralNetwork_SelectedFeatures f1 0.749842 trained in 27.06 seconds




95_NeuralNetwork f1 0.700129 trained in 25.74 seconds




96_DecisionTree f1 0.688807 trained in 20.68 seconds
97_NearestNeighbors f1 0.652527 trained in 22.73 seconds
98_NearestNeighbors f1 0.652527 trained in 22.87 seconds
99_NearestNeighbors f1 0.639112 trained in 33.12 seconds
100_DecisionTree f1 0.706651 trained in 23.31 seconds
* Step hill_climbing_2 will try to check up to 12 models


KeyboardInterrupt: 

In [None]:
# Re-open a previously finished MLJAR run from disk and score it on the
# validation split.
# The run directory is built with path.join so it is not tied to Windows
# separators; on Windows it resolves to the same string as before.
MLJAR_RUN_TO_LOAD = "20240114_002215"
loaded = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, MLJAR_RUN_TO_LOAD),
)

print(valid_x.shape, valid_y.shape)

print(train_x.shape, train_y.shape)
# Keep the 2-D (samples, features) layout (the old `.reshape(-1)` flattened
# the whole matrix) and ask for class labels: balanced accuracy is defined on
# labels, not on the probability matrix returned by predict_proba.
# NOTE(review): assumes `predict` returns labels in the same encoding as
# `valid_y`; confirm against the loaded run.
predictions = loaded.predict(valid_x.copy())

score = balanced_accuracy_score(np.ravel(valid_y), np.ravel(predictions))

print(f"Model Balanced Accuracy: {score}")

In [None]:
# Write the MLJAR model's test-set probabilities into this run's "mljar"
# output directory.
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
dump_proba(automl, test_x, output_path)

In [None]:
def _second_column_proba(proba):
    """Return column 1 of a ``predict_proba`` result (DataFrame or array)."""
    if isinstance(proba, pd.DataFrame):
        proba = proba.values
    return np.asarray(proba)[:, 1]


def ensemble_predict(X, model1, model2):
    """Average the second-column probabilities of two fitted models.

    ``model1`` receives ``X`` wrapped in a DataFrame and ``model2`` receives
    ``X`` unchanged. Either model may return its probabilities as a DataFrame
    or as an array, the same convention ``dump_proba`` accepts.

    Parameters
    ----------
    X : 2-D feature matrix.
    model1, model2 : objects exposing ``predict_proba``.

    Returns
    -------
    1-D array with the element-wise mean of both models' column-1 values.
    """
    pred1 = _second_column_proba(model1.predict_proba(pd.DataFrame(X)))
    pred2 = _second_column_proba(model2.predict_proba(X))
    print(pred1, pred2)
    avg_pred = (pred1 + pred2) / 2

    return avg_pred


# Blend the AutoGluon predictor and the MLJAR model on the test features.
final_predictions = ensemble_predict(test_x, predictor, automl)

In [None]:
# Store the blended predictions, one value per line under a quoted header.
# NOTE(review): the header differs from the one written by dump_proba;
# confirm which identifier is expected.
ensemble_dir = path.join("ensamble", UNIQUE_ID)
os.makedirs(ensemble_dir, exist_ok=True)

np.savetxt(
    path.join(ensemble_dir, "123manual_model_pred.txt"),
    final_predictions,
    header='"313201"',
    comments="",
    delimiter="\n",
)