In [1]:
# !pip install mljar-supervised

In [1]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, mutual_info_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML  # mljar-supervised

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constants

SEED = 42
N_JOBS = -1
RANDOM_SEARCH_N_ITER = 80
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 60 * 1
TRAIN_TIME_LIMIT_MLJAR = 60 * 60 * 4
TRAIN_TIME_LIMIT_AUTO_SKLEARN = 60 * 30
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
OUTPUT_DIR_AUTO_SKLEARN = path.join("output", "auto_sklearn")
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")
APPLY_REMOVE_LOW_VARIANCE_FEATURES = True
APPLY_REMOVE_CORRELATED_FEATURES = True
APPLY_REMOVE_RANDOM_FEATURES = False
APPLY_ANOVA = True
ANOVE_FEATURES = 20

In [3]:
# prepare output directories
for output_dir in [
    OUTPUT_DIR_MANUAL,
    OUTPUT_DIR_AUTOGLUON,
    OUTPUT_DIR_MLJAR,
    OUTPUT_DIR_AUTO_SKLEARN,
]:
    if not path.exists(path.join(output_dir, UNIQUE_ID)):
        print(f"Creating output directory {path.join(output_dir, UNIQUE_ID)}")
        os.makedirs(path.join(output_dir, UNIQUE_ID))

Creating output directory output\manual\20240113_100932
Creating output directory output\autogluon\20240113_100932
Creating output directory output\mljar\20240113_100932
Creating output directory output\auto_sklearn\20240113_100932


In [4]:
def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    # Calculate correlation matrix
    corr_matrix = np.corrcoef(train_x, rowvar=False)
    # Select upper triangle of correlation matrix
    upper = np.triu(corr_matrix, k=1)
    # Find indices of feature columns with correlation greater than threshold
    to_drop = [i for i in range(upper.shape[1]) if any(upper[:, i] > threshold)]

    # Drop features from train, validation, and test set
    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x

In [5]:
# Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    sel = VarianceThreshold(threshold=threshold)
    sel.fit(train_x)
    train_x = train_x[:, sel.get_support(indices=True)]
    valid_x = valid_x[:, sel.get_support(indices=True)]
    test_x = test_x[:, sel.get_support(indices=True)]
    return train_x, valid_x, test_x

In [6]:
# Remove Random Columns
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    tree: DecisionTreeClassifier = DecisionTreeClassifier(random_state=0)
    tree.fit(train_x, train_y)
    importances = tree.feature_importances_

    # Assume columns with very low importance are "random"
    # This threshold can be adjusted based on domain knowledge
    important_indices = [i for i, imp in enumerate(importances) if imp > importance]
    train_x = train_x[:, important_indices]
    valid_x = valid_x[:, important_indices]
    test_x = test_x[:, important_indices]
    return train_x, valid_x, test_x

In [7]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(train_x, train_y)

    train_x = selector.transform(train_x)
    valid_x = selector.transform(valid_x)
    test_x = selector.transform(test_x)
    return train_x, valid_x, test_x

In [54]:
def dump_proba(model, test_x, output_path_proba):
    proba = model.predict_proba(test_x)
    np.savetxt(
        output_path_proba,
        proba[:, 1],
        delimiter="\n",
        header='"313201_313212"',
        comments="",
        # fmt="%.19f",
    )

In [55]:
def dump_model(model, output_path_model):
    joblib.dump(model, output_path_model)

In [8]:
prefix = ""

_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x.drop(_test_x.columns[500], axis=1, inplace=True)
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)
_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x.drop(_train_x.columns[500], axis=1, inplace=True)

In [9]:
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=42)

In [11]:
def get_train_and_validation_data():
    split = 400
    train_x, valid_x = _train_x[split:].values, _train_x[:split].values
    train_y, valid_y = _train_y[split:].values, _train_y[:split].values
    return train_x, train_y, valid_x, valid_y

In [12]:
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
train_y = train_y.reshape(-1, 1)
valid_y = valid_y.reshape(-1, 1)
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


In [14]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, _test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

train_x.shape:  (1600, 490)
valid_x.shape:  (400, 490)
test_x.shape:  (600, 490)


In [15]:
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

train_x.shape:  (1600, 490)
valid_x.shape:  (400, 490)
test_x.shape:  (600, 490)


In [16]:
if APPLY_REMOVE_RANDOM_FEATURES:
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [17]:
if APPLY_ANOVA:
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

train_x.shape:  (1600, 20)
valid_x.shape:  (400, 20)
test_x.shape:  (600, 20)




In [18]:
print("train_x.shape: ", train_x.shape)
print("train_y.shape: ", train_y.shape)
print("valid_x.shape: ", valid_x.shape)
print("valid_y.shape: ", valid_y.shape)
print("test_x.shape: ", test_x.shape)

train_x.shape:  (1600, 20)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 20)
valid_y.shape:  (400, 1)
test_x.shape:  (600, 20)


In [18]:
# sanity check
(
    original_train_x,
    original_train_y,
    original_valid_x,
    original_valid_y,
) = get_train_and_validation_data()
for y, original_y in zip([train_y, valid_y], [original_train_y, original_valid_y]):
    assert y.shape == original_y.shape

### manual model

In [59]:
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import RandomizedSearchCV

base_classifiers_1 = [
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
                early_stopping=True,
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(
                random_state=SEED,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
]

base_classifiers_2 = [
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
                early_stopping=True,
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
]

param_distributions = {
    # Parameters for the first Stacking Layer
    "stacked_ensemble_1__mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "stacked_ensemble_1__mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "stacked_ensemble_1__mlp__mlpclassifier__hidden_layer_sizes": [
        (70, 90),
        (70, 80, 90),
        (70, 100, 80, 90),
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__n_estimators": [
        32,
        64,
        100,
        200,
        400,
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__max_depth": [
        10,
        20,
        30,
        40,
        50,
        60,
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__min_samples_split": [
        4,
        8,
        12,
        16,
        20,
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__min_samples_leaf": [
        2,
        4,
        6,
        8,
        10,
    ],
    "stacked_ensemble_1__rf__randomforestclassifier__n_estimators": [
        200,
        400,
        600,
        800,
        1000,
        1200,
    ],
    "stacked_ensemble_1__rf__randomforestclassifier__max_depth": [
        10,
        20,
        30,
        40,
        50,
        60,
        70,
        80,
        90,
        100,
    ],
    "stacked_ensemble_1__rf__randomforestclassifier__min_samples_split": [2, 3, 4, 5],
    "stacked_ensemble_1__rf__randomforestclassifier__min_samples_leaf": [4, 6, 10],
    # Parameters for the Second Stacking Layer
    "stacked_ensemble_2__mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "stacked_ensemble_2__mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "stacked_ensemble_2__mlp__mlpclassifier__hidden_layer_sizes": [
        (50, 100),
        (50, 100, 100),
        (50, 150, 100, 100),
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__n_estimators": [
        32,
        64,
        100,
        200,
        400,
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__max_depth": [
        10,
        20,
        30,
        40,
        50,
        60,
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__min_samples_split": [
        4,
        8,
        12,
        16,
        20,
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__min_samples_leaf": [
        2,
        4,
        6,
        8,
        10,
    ],
    # Parameters for the Final Estimator
    "stacked_ensemble_1__final_estimator__C": uniform(0.01, 10),
    "stacked_ensemble_2__final_estimator__C": uniform(0.01, 10),
    # Parameters for the Committee
    "gbc__n_estimators": [32, 64, 100, 200, 400],
    "gbc__max_depth": [10, 20, 30, 40, 50, 60],
    "gbc__min_samples_split": [4, 8, 12, 16, 20],
    "gbc__min_samples_leaf": [2, 4, 6, 8, 10],
    "rf__n_estimators": [200, 400, 600, 800, 1000, 1200],
    "rf__max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    "rf__min_samples_split": [2, 3, 4, 5],
    "rf__min_samples_leaf": [4, 6, 10],
}

# First Stacking Layer
stacked_ensamble_1 = StackingClassifier(
    estimators=base_classifiers_1, final_estimator=LogisticRegression(), cv=5
)

# Second Stacking Layer
stacked_ensamble_2 = StackingClassifier(
    estimators=base_classifiers_2, final_estimator=LogisticRegression(), cv=5
)
# Define the committee of models
committee_models = [
    ("stacked_ensemble_1", stacked_ensamble_1),
    ("stacked_ensemble_2", stacked_ensamble_2),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(
                random_state=SEED,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
                early_stopping=True,
            ),
        ),
    ),
]

# Create the committee model
committee_model = VotingClassifier(committee_models, voting="soft")
committee_model.fit(train_x, train_y.ravel())
# print(committee_model)
# Perform randomized search
# random_search = RandomizedSearchCV(
#     committee_model,
#     param_distributions=param_distributions,
#     scoring="balanced_accuracy",
#     n_iter=1,
#     cv=5,
#     verbose=4,
#     random_state=SEED,
#     n_jobs=8,
# )

# random_search.fit(train_x, train_y.ravel())

In [60]:
y_pred = random_search.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")

Model Balanced Accuracy: 0.7892775874571778


In [61]:
print(random_search.best_params_)

{'gbc__max_depth': 40, 'gbc__min_samples_leaf': 10, 'gbc__min_samples_split': 12, 'gbc__n_estimators': 400, 'rf__max_depth': 50, 'rf__min_samples_leaf': 10, 'rf__min_samples_split': 3, 'rf__n_estimators': 600, 'stacked_ensemble_1__final_estimator__C': 1.0097491581800289, 'stacked_ensemble_1__gbc__gradientboostingclassifier__max_depth': 30, 'stacked_ensemble_1__gbc__gradientboostingclassifier__min_samples_leaf': 10, 'stacked_ensemble_1__gbc__gradientboostingclassifier__min_samples_split': 16, 'stacked_ensemble_1__gbc__gradientboostingclassifier__n_estimators': 100, 'stacked_ensemble_1__mlp__mlpclassifier__alpha': 0.020684494295802446, 'stacked_ensemble_1__mlp__mlpclassifier__hidden_layer_sizes': (70, 80, 90), 'stacked_ensemble_1__mlp__mlpclassifier__learning_rate_init': 0.07319987722668247, 'stacked_ensemble_1__rf__randomforestclassifier__max_depth': 60, 'stacked_ensemble_1__rf__randomforestclassifier__min_samples_leaf': 6, 'stacked_ensemble_1__rf__randomforestclassifier__min_samples_sp

In [57]:
output_path_proba = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_proba.txt")
output_path_model = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl")
dump_proba(random_search, test_x, output_path_proba)
dump_model(random_search, output_path_model)

### Autogluon

In [24]:
train_data = np.concatenate((train_x, train_y), axis=1)
train_data_pd = pd.DataFrame(train_data)
train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"}, inplace=True)

valid_data = np.concatenate((valid_x, valid_y), axis=1)
valid_data_pd = pd.DataFrame(data=valid_data)
valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"}, inplace=True)

In [25]:
print(train_data_pd.shape, valid_data_pd.shape)

(1600, 21) (400, 21)


In [26]:
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(
    train_data=train_data_pd,
    # tuning_data=valid_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 18000 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: output\autogluon\20240112_224713/ds_sub_fit/sub_fit_ho.
2024-01-13 00:55:03,168	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Beginning AutoGluon training ... Time limit = 4500s
AutoGluon will save models to "output\autogluon\20240112_224713/ds_sub_fit

In [27]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.856918,balanced_accuracy,0.223003,23.07676,0.0,0.716527,2,True,14
1,CatBoost_BAG_L1,0.850658,balanced_accuracy,0.018607,9.182181,0.018607,9.182181,1,True,7
2,XGBoost_BAG_L1,0.836931,balanced_accuracy,0.0469,4.043653,0.0469,4.043653,1,True,11
3,LightGBMLarge_BAG_L1,0.834436,balanced_accuracy,0.0223,6.413916,0.0223,6.413916,1,True,13
4,LightGBM_BAG_L1,0.833785,balanced_accuracy,0.053802,3.582635,0.053802,3.582635,1,True,4
5,KNeighborsUnif_BAG_L1,0.812483,balanced_accuracy,0.017794,0.015814,0.017794,0.015814,1,True,1
6,KNeighborsDist_BAG_L1,0.812483,balanced_accuracy,0.032118,0.0,0.032118,0.0,1,True,2
7,LightGBMXT_BAG_L1,0.808818,balanced_accuracy,0.078383,3.165688,0.078383,3.165688,1,True,3
8,RandomForestGini_BAG_L1,0.78701,balanced_accuracy,0.081724,0.698938,0.081724,0.698938,1,True,5
9,RandomForestEntr_BAG_L1,0.779543,balanced_accuracy,0.068326,0.733628,0.068326,0.733628,1,True,6


In [28]:
predictor.evaluate(valid_data_pd)

{'balanced_accuracy': 0.8092820884699057,
 'accuracy': 0.81,
 'mcc': 0.6219324893905216,
 'roc_auc': 0.8983771348553424,
 'f1': 0.7978723404255319,
 'precision': 0.8379888268156425,
 'recall': 0.7614213197969543}

In [29]:
output_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "autogluon_model_pred.txt")
dump_proba(predictor, test_x, output_path_proba)

### MLJar

In [30]:
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID),
)

automl.fit(train_x, train_y.ravel())

AutoML directory: output\mljar\20240112_224713
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.7125 trained in 1.89 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree f1 0.686493 trained in 3.48 seconds
2_DecisionTree f1 0.618812 trained in 3.33 seconds
3_DecisionTree f1 0.611549 train



6_Default_Xgboost f1 0.809793 trained in 10.55 seconds
7_Default_CatBoost f1 0.834586 trained in 8.15 seconds
8_Default_NeuralNetwork f1 0.694427 trained in 5.85 seconds
9_Default_RandomForest f1 0.751928 trained in 9.8 seconds
10_Default_ExtraTrees f1 0.731865 trained in 8.18 seconds
11_Default_NearestNeighbors f1 0.667473 trained in 4.58 seconds
* Step not_so_random will try to check up to 61 models
21_LightGBM f1 0.798223 trained in 9.1 seconds




12_Xgboost f1 0.788303 trained in 14.08 seconds
30_CatBoost f1 0.825357 trained in 9.53 seconds
39_RandomForest f1 0.790396 trained in 12.85 seconds
48_ExtraTrees f1 0.774566 trained in 9.82 seconds
57_NeuralNetwork f1 0.704373 trained in 6.18 seconds
66_NearestNeighbors f1 0.669476 trained in 5.27 seconds
22_LightGBM f1 0.820089 trained in 9.48 seconds




13_Xgboost f1 0.706176 trained in 12.41 seconds
31_CatBoost f1 0.826467 trained in 9.97 seconds
40_RandomForest f1 0.729646 trained in 11.75 seconds
49_ExtraTrees f1 0.664585 trained in 10.73 seconds
58_NeuralNetwork f1 0.682603 trained in 7.33 seconds
67_NearestNeighbors f1 0.639656 trained in 5.52 seconds
23_LightGBM f1 0.822002 trained in 9.21 seconds




14_Xgboost f1 0.773333 trained in 15.81 seconds
32_CatBoost f1 0.819095 trained in 17.63 seconds
41_RandomForest f1 0.72973 trained in 11.88 seconds
50_ExtraTrees f1 0.690462 trained in 9.62 seconds
59_NeuralNetwork f1 0.694905 trained in 7.79 seconds
68_NearestNeighbors f1 0.639656 trained in 5.92 seconds
24_LightGBM f1 0.823159 trained in 10.77 seconds




15_Xgboost f1 0.746667 trained in 15.56 seconds
33_CatBoost f1 0.808833 trained in 9.83 seconds
42_RandomForest f1 0.740403 trained in 11.78 seconds
51_ExtraTrees f1 0.707712 trained in 11.13 seconds
60_NeuralNetwork f1 0.700443 trained in 8.37 seconds
69_NearestNeighbors f1 0.639656 trained in 6.53 seconds
25_LightGBM f1 0.816479 trained in 9.9 seconds




16_Xgboost f1 0.65878 trained in 15.73 seconds
34_CatBoost f1 0.798762 trained in 9.83 seconds
43_RandomForest f1 0.728443 trained in 13.61 seconds
52_ExtraTrees f1 0.675937 trained in 12.12 seconds
61_NeuralNetwork f1 0.687742 trained in 8.33 seconds
70_NearestNeighbors f1 0.669476 trained in 7.4 seconds
26_LightGBM f1 0.826935 trained in 10.6 seconds




17_Xgboost f1 0.769133 trained in 14.56 seconds
35_CatBoost f1 0.805783 trained in 10.96 seconds
44_RandomForest f1 0.775353 trained in 14.28 seconds
53_ExtraTrees f1 0.734426 trained in 13.55 seconds
62_NeuralNetwork f1 0.704835 trained in 9.62 seconds
71_NearestNeighbors f1 0.639656 trained in 7.72 seconds
27_LightGBM f1 0.817555 trained in 15.03 seconds




18_Xgboost f1 0.722222 trained in 16.58 seconds
36_CatBoost f1 0.840075 trained in 12.3 seconds
45_RandomForest f1 0.787412 trained in 16.46 seconds
54_ExtraTrees f1 0.750983 trained in 14.23 seconds
63_NeuralNetwork f1 0.707886 trained in 9.75 seconds
72_NearestNeighbors f1 0.639656 trained in 8.22 seconds
28_LightGBM f1 0.831461 trained in 13.93 seconds




19_Xgboost f1 0.763987 trained in 17.2 seconds
37_CatBoost f1 0.837989 trained in 15.86 seconds
46_RandomForest f1 0.713548 trained in 15.06 seconds
55_ExtraTrees f1 0.6792 trained in 13.1 seconds
64_NeuralNetwork f1 0.691503 trained in 10.4 seconds
29_LightGBM f1 0.834171 trained in 14.13 seconds




20_Xgboost f1 0.752538 trained in 18.57 seconds
38_CatBoost f1 0.801757 trained in 11.35 seconds
47_RandomForest f1 0.739214 trained in 16.36 seconds
56_ExtraTrees f1 0.710006 trained in 13.78 seconds
65_NeuralNetwork f1 0.699054 trained in 10.82 seconds
* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: feature_18_sum_feature_2
Add Golden Feature: feature_6_multiply_feature_2
Add Golden Feature: feature_17_multiply_feature_3
Add Golden Feature: feature_5_diff_feature_17
Add Golden Feature: feature_12_ratio_feature_9
Add Golden Feature: feature_9_ratio_feature_12
Add Golden Feature: feature_4_diff_feature_14
Add Golden Feature: feature_9_sum_feature_1
Add Golden Feature: feature_19_sum_feature_10
Add Golden Feature: feature_13_multiply_feature_3
Created 10 Golden Features in 7.97 seconds.
36_CatBoost_GoldenFeatures f1 0.838226 trained in 24.7 seconds
37_CatBoost_GoldenFeatures f1 0.826683 trained in 19.22 seconds
7_Default_CatBoost_GoldenFeatures f1 0.8



36_CatBoost_KMeansFeatures f1 0.824845 trained in 21.15 seconds




37_CatBoost_KMeansFeatures f1 0.794246 trained in 21.51 seconds




7_Default_CatBoost_KMeansFeatures f1 0.79722 trained in 17.46 seconds
* Step insert_random_feature will try to check up to 1 model
36_CatBoost_RandomFeature f1 0.836862 trained in 16.23 seconds
Drop features ['feature_15', 'feature_16', 'feature_13', 'random_feature', 'feature_6', 'feature_10', 'feature_8', 'feature_17', 'feature_14']
* Step features_selection will try to check up to 6 models
36_CatBoost_SelectedFeatures f1 0.869182 trained in 14.92 seconds
29_LightGBM_SelectedFeatures f1 0.863436 trained in 13.88 seconds




6_Default_Xgboost_SelectedFeatures f1 0.819775 trained in 16.93 seconds
39_RandomForest_SelectedFeatures f1 0.796129 trained in 16.3 seconds
48_ExtraTrees_SelectedFeatures f1 0.780392 trained in 15.53 seconds
63_NeuralNetwork_SelectedFeatures f1 0.800247 trained in 11.75 seconds
* Step hill_climbing_1 will try to check up to 24 models
73_CatBoost_SelectedFeatures f1 0.860902 trained in 15.38 seconds
74_LightGBM_SelectedFeatures f1 0.863436 trained in 14.36 seconds
75_LightGBM_SelectedFeatures f1 0.863436 trained in 14.65 seconds
76_CatBoost f1 0.838951 trained in 15.65 seconds
77_CatBoost_GoldenFeatures f1 0.822278 trained in 17.15 seconds
78_LightGBM f1 0.834171 trained in 15.98 seconds
79_LightGBM f1 0.834171 trained in 16.01 seconds
80_LightGBM f1 0.831461 trained in 16.51 seconds




81_Xgboost_SelectedFeatures f1 0.822642 trained in 17.53 seconds




82_Xgboost f1 0.803571 trained in 17.63 seconds
83_NeuralNetwork_SelectedFeatures f1 0.785222 trained in 13.06 seconds
84_RandomForest_SelectedFeatures f1 0.79771 trained in 17.91 seconds
85_RandomForest f1 0.802597 trained in 18.1 seconds




86_Xgboost f1 0.782774 trained in 20.23 seconds
87_RandomForest f1 0.786458 trained in 19.48 seconds
88_ExtraTrees_SelectedFeatures f1 0.780365 trained in 16.5 seconds
89_ExtraTrees f1 0.774903 trained in 16.96 seconds
90_ExtraTrees f1 0.754717 trained in 16.99 seconds
91_DecisionTree f1 0.641318 trained in 11.3 seconds
92_NearestNeighbors f1 0.669476 trained in 12.05 seconds
93_NearestNeighbors f1 0.669476 trained in 12.05 seconds
94_NearestNeighbors f1 0.667473 trained in 12.2 seconds
95_DecisionTree f1 0.641318 trained in 11.55 seconds
96_DecisionTree f1 0.675887 trained in 11.67 seconds
* Step hill_climbing_2 will try to check up to 28 models
97_CatBoost_SelectedFeatures f1 0.859473 trained in 15.08 seconds
98_CatBoost_SelectedFeatures f1 0.865709 trained in 17.78 seconds
99_LightGBM_SelectedFeatures f1 0.866958 trained in 16.76 seconds
100_LightGBM_SelectedFeatures f1 0.846058 trained in 17.02 seconds
101_LightGBM_SelectedFeatures f1 0.866958 trained in 16.96 seconds
102_LightGBM_



109_Xgboost_SelectedFeatures f1 0.82197 trained in 21.3 seconds




110_Xgboost_SelectedFeatures f1 0.809886 trained in 19.18 seconds




111_Xgboost f1 0.810777 trained in 19.16 seconds
112_RandomForest f1 0.782383 trained in 21.95 seconds
113_NeuralNetwork_SelectedFeatures f1 0.69024 trained in 14.83 seconds
114_NeuralNetwork_SelectedFeatures f1 0.785759 trained in 15.38 seconds
115_RandomForest_SelectedFeatures f1 0.792092 trained in 19.32 seconds
116_RandomForest_SelectedFeatures f1 0.792574 trained in 20.26 seconds
117_NeuralNetwork_SelectedFeatures f1 0.802947 trained in 15.36 seconds
118_NeuralNetwork_SelectedFeatures f1 0.785441 trained in 16.0 seconds
119_ExtraTrees_SelectedFeatures f1 0.764783 trained in 18.53 seconds
120_ExtraTrees_SelectedFeatures f1 0.767846 trained in 19.03 seconds
121_ExtraTrees f1 0.765049 trained in 19.78 seconds
122_NeuralNetwork f1 0.689345 trained in 16.18 seconds
123_NeuralNetwork f1 0.705658 trained in 16.33 seconds
124_DecisionTree f1 0.688946 trained in 13.98 seconds
* Step boost_on_errors will try to check up to 1 model
36_CatBoost_SelectedFeatures_BoostOnErrors f1 0.859848 train



81_Xgboost_SelectedFeatures_Stacked f1 0.874222 trained in 23.81 seconds
117_NeuralNetwork_SelectedFeatures_Stacked f1 0.855549 trained in 19.7 seconds
85_RandomForest_Stacked f1 0.882462 trained in 43.26 seconds
48_ExtraTrees_SelectedFeatures_Stacked f1 0.883375 trained in 23.46 seconds
98_CatBoost_SelectedFeatures_Stacked f1 0.887654 trained in 41.61 seconds
101_LightGBM_SelectedFeatures_Stacked f1 0.883085 trained in 20.83 seconds




109_Xgboost_SelectedFeatures_Stacked f1 0.874378 trained in 23.12 seconds
63_NeuralNetwork_SelectedFeatures_Stacked f1 0.857669 trained in 17.04 seconds
84_RandomForest_SelectedFeatures_Stacked f1 0.882462 trained in 43.86 seconds
88_ExtraTrees_SelectedFeatures_Stacked f1 0.882426 trained in 22.26 seconds
105_CatBoost_SelectedFeatures_Stacked f1 0.890259 trained in 22.84 seconds
99_LightGBM_SelectedFeatures_Stacked f1 0.883085 trained in 21.4 seconds




6_Default_Xgboost_SelectedFeatures_Stacked f1 0.872274 trained in 24.86 seconds
114_NeuralNetwork_SelectedFeatures_Stacked f1 0.849231 trained in 17.8 seconds
39_RandomForest_SelectedFeatures_Stacked f1 0.883261 trained in 40.83 seconds
89_ExtraTrees_Stacked f1 0.883692 trained in 23.43 seconds
73_CatBoost_SelectedFeatures_Stacked f1 0.883807 trained in 28.2 seconds
75_LightGBM_SelectedFeatures_Stacked f1 0.892746 trained in 22.62 seconds




111_Xgboost_Stacked f1 0.875309 trained in 25.78 seconds
118_NeuralNetwork_SelectedFeatures_Stacked f1 0.859278 trained in 18.75 seconds
116_RandomForest_SelectedFeatures_Stacked f1 0.883519 trained in 41.63 seconds
48_ExtraTrees_Stacked f1 0.885873 trained in 23.2 seconds
97_CatBoost_SelectedFeatures_Stacked f1 0.890267 trained in 25.2 seconds
74_LightGBM_SelectedFeatures_Stacked f1 0.892746 trained in 23.2 seconds




110_Xgboost_SelectedFeatures_Stacked f1 0.878863 trained in 25.23 seconds
83_NeuralNetwork_SelectedFeatures_Stacked f1 0.8554 trained in 18.93 seconds
115_RandomForest_SelectedFeatures_Stacked f1 0.884804 trained in 37.76 seconds
120_ExtraTrees_SelectedFeatures_Stacked f1 0.882426 trained in 23.01 seconds
106_CatBoost_SelectedFeatures_Stacked f1 0.889851 trained in 40.0 seconds
29_LightGBM_SelectedFeatures_Stacked f1 0.892746 trained in 23.43 seconds




6_Default_Xgboost_Stacked f1 0.874845 trained in 25.63 seconds
63_NeuralNetwork_Stacked f1 0.851968 trained in 18.91 seconds
39_RandomForest_Stacked f1 0.884497 trained in 42.76 seconds
121_ExtraTrees_Stacked f1 0.88228 trained in 23.8 seconds
107_CatBoost_Stacked f1 0.891224 trained in 24.83 seconds
100_LightGBM_SelectedFeatures_Stacked f1 0.893723 trained in 22.71 seconds




82_Xgboost_Stacked f1 0.880745 trained in 26.06 seconds
123_NeuralNetwork_Stacked f1 0.854489 trained in 19.75 seconds
45_RandomForest_Stacked f1 0.891358 trained in 42.13 seconds
119_ExtraTrees_SelectedFeatures_Stacked f1 0.885185 trained in 23.65 seconds
36_CatBoost_Stacked f1 0.892902 trained in 30.2 seconds
104_LightGBM_SelectedFeatures_Stacked f1 0.893723 trained in 23.36 seconds




12_Xgboost_Stacked f1 0.878412 trained in 24.7 seconds
62_NeuralNetwork_Stacked f1 0.855906 trained in 21.98 seconds
87_RandomForest_Stacked f1 0.88642 trained in 41.83 seconds
90_ExtraTrees_Stacked f1 0.883951 trained in 25.15 seconds
76_CatBoost_Stacked f1 0.887787 trained in 30.36 seconds
102_LightGBM_SelectedFeatures_Stacked f1 0.893723 trained in 23.86 seconds




86_Xgboost_Stacked f1 0.878261 trained in 25.04 seconds
57_NeuralNetwork_Stacked f1 0.85838 trained in 20.63 seconds
112_RandomForest_Stacked f1 0.880588 trained in 40.45 seconds
54_ExtraTrees_Stacked f1 0.880347 trained in 26.32 seconds
79_LightGBM_Stacked f1 0.893407 trained in 24.48 seconds




14_Xgboost_Stacked f1 0.876084 trained in 25.75 seconds
60_NeuralNetwork_Stacked f1 0.860308 trained in 20.78 seconds
44_RandomForest_Stacked f1 0.884354 trained in 44.12 seconds
53_ExtraTrees_Stacked f1 0.88642 trained in 26.8 seconds
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked f1 0.894147 trained in 67.51 seconds
AutoML fit time: 3843.84 seconds
AutoML best model: Ensemble_Stacked


In [31]:
print(valid_x.shape, valid_y.shape)
print(train_x.shape, train_y.shape)
predictions = automl.predict(valid_x)
score = balanced_accuracy_score(valid_y, predictions)
print(f"Model Balanced Accuracy: {score}")

(400, 20) (400, 1)
(1600, 20) (1600, 1)
Model Balanced Accuracy: 0.837200870195794


In [32]:
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
dump_proba(automl, test_x, output_path_proba)

In [34]:
def ensemble_predict(X, model1, model2):
    pred1 = model1.predict_proba(pd.DataFrame(X)).values[:, 1]
    pred2 = model2.predict_proba(X)[:, 1]
    print(pred1, pred2)
    avg_pred = (pred1 + pred2) / 2

    return avg_pred


final_predictions = ensemble_predict(test_x, predictor, automl)

[0.19823167 0.12950306 0.36527926 0.84787917 0.35678351 0.94533753
 0.24744089 0.11513202 0.18806101 0.97940791 0.76719964 0.92781764
 0.15279761 0.4274509  0.67261863 0.04377132 0.17906018 0.98433673
 0.49650881 0.04347825 0.31233412 0.137513   0.06588629 0.65558058
 0.78581023 0.96418273 0.26022434 0.46065176 0.63711911 0.04180147
 0.09258254 0.74501765 0.83167541 0.71207303 0.87524205 0.18559894
 0.74777424 0.97198254 0.7338838  0.78832066 0.8278321  0.91062081
 0.69085479 0.87862921 0.46174797 0.95589107 0.132443   0.75506824
 0.23379146 0.34170339 0.8642723  0.41976583 0.39354694 0.80079597
 0.57761717 0.09371804 0.93512791 0.24425934 0.33750027 0.16853186
 0.89413518 0.35173431 0.4335978  0.63558245 0.77356994 0.28098822
 0.39471498 0.65327847 0.51277983 0.78379697 0.83189362 0.19509181
 0.84669989 0.95740187 0.88411015 0.21349069 0.76758844 0.53296971
 0.88658172 0.41792718 0.22615737 0.70556444 0.46914053 0.97699976
 0.91040391 0.72218162 0.47050691 0.63883471 0.65635705 0.7752

In [35]:
os.makedirs(path.join("ensamble", UNIQUE_ID), exist_ok=True)

np.savetxt(
    path.join("ensamble", UNIQUE_ID, "123manual_model_pred.txt"),
    final_predictions,
    delimiter="\n",
    comments="",
    header='"313201"',
)

[0.16060101 0.11675699 0.47165692 0.85701416 0.24282749 0.92251028
 0.19031214 0.11236925 0.15568195 0.93380879 0.82581031 0.90862745
 0.14247695 0.61476481 0.76963495 0.08026247 0.15153862 0.94243135
 0.60576912 0.08297015 0.27831707 0.12345862 0.07902624 0.76905202
 0.82770592 0.92287652 0.18852246 0.31780032 0.75122329 0.07091373
 0.10564501 0.80290649 0.84186128 0.78580802 0.8586484  0.16199179
 0.81018406 0.93460239 0.80191058 0.83523301 0.84779614 0.90170043
 0.77665165 0.87132314 0.32550262 0.91735104 0.11685607 0.81563887
 0.17217613 0.23751473 0.86851574 0.29118838 0.36789962 0.83830197
 0.69406734 0.11050372 0.91022092 0.17669649 0.59594377 0.14134135
 0.8756909  0.23588966 0.27986014 0.70835393 0.8317875  0.20892941
 0.30903274 0.76800095 0.68457345 0.81238893 0.82819176 0.16244649
 0.87209202 0.87950575 0.82600939 0.1660666  0.81863084 0.68636407
 0.88086648 0.26743077 0.18121714 0.7857726  0.33824038 0.94217517
 0.85885816 0.80069713 0.5311321  0.75476745 0.76460955 0.8220

### Auto SKLearn TODO OUT

In [None]:
# !pip install auto-sklearn
# !pip install ydata-profiling
# from autosklearn.classification import AutoSklearnClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.metrics import balanced_accuracy

In [None]:
settings = {
    "time_left_for_this_task": TRAIN_TIME_LIMIT_AUTO_SKLEARN,
    "seed": SEED,
    "metric": balanced_accuracy,
    "n_jobs": -1,
}

In [None]:
askl2 = AutoSklearn2Classifier(**settings)
askl2.fit(train_x, train_y)

In [None]:
leaderboard = askl2.leaderboard(sort_by="model_id", ensemble_only=True)
print(leaderboard)

In [None]:
predictions = askl2.predict(valid_x)
balanced_accuracy_score(valid_y, predictions)

In [None]:
proba = askl2.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.txt")
np.savetxt(output_path, proba, delimiter="\n")
askl2.save(path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.pkl"))