In [None]:
# !pip install mljar-supervised

In [1]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, mutual_info_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML  # mljar-supervised

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constants

SEED = 42
N_JOBS = -1
RANDOM_SEARCH_N_ITER = 50
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 30
TRAIN_TIME_LIMIT_MLJAR = 60 * 30
TRAIN_TIME_LIMIT_AUTO_SKLEARN = 60 * 30
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
OUTPUT_DIR_AUTO_SKLEARN = path.join("output", "auto_sklearn")
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")
APPLY_REMOVE_LOW_VARIANCE_FEATURES = True
APPLY_REMOVE_CORRELATED_FEATURES = True
APPLY_REMOVE_RANDOM_FEATURES = True
APPLY_ANOVA = True
ANOVE_FEATURES = 10

In [3]:
# prepare output directories
for output_dir in [
    OUTPUT_DIR_MANUAL,
    OUTPUT_DIR_AUTOGLUON,
    OUTPUT_DIR_MLJAR,
    OUTPUT_DIR_AUTO_SKLEARN,
]:
    if not path.exists(path.join(output_dir, UNIQUE_ID)):
        print(f"Creating output directory {path.join(output_dir, UNIQUE_ID)}")
        os.makedirs(path.join(output_dir, UNIQUE_ID))

Creating output directory output\manual\20240111_191521
Creating output directory output\autogluon\20240111_191521
Creating output directory output\mljar\20240111_191521
Creating output directory output\auto_sklearn\20240111_191521


In [4]:
# Remove Highly Correlated Columns
# def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
#     corr_matrix = np.corrcoef(train_x, rowvar=False)
#     upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
#     to_drop = np.where(np.abs(corr_matrix[upper]) > threshold)[0]
#     print(to_drop)
#     train_x = np.delete(train_x, to_drop, axis=1)
#     valid_x = np.delete(valid_x, to_drop, axis=1)
#     test_x = np.delete(test_x, to_drop, axis=1)
#     return train_x, valid_x, test_x


def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    # Calculate correlation matrix
    corr_matrix = np.corrcoef(train_x, rowvar=False)
    # Select upper triangle of correlation matrix
    upper = np.triu(corr_matrix, k=1)
    # Find indices of feature columns with correlation greater than threshold
    to_drop = [i for i in range(upper.shape[1]) if any(upper[:, i] > threshold)]

    # Drop features from train, validation, and test set
    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x


# pandas
# # Remove Highly Correlated Columns
# def remove_highly_correlated_features(train_x, valid_x, text_x, threshold=0.95):
#     corr_matrix = train_x.corr().abs()
#     # Select upper triangle of correlation matrix
#     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#     # Find index of feature columns with correlation greater than threshold
#     to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
#     train_x = train_x.drop(to_drop, axis=1)
#     valid_x = valid_x.drop(to_drop, axis=1)
#     text_x = text_x.drop(to_drop, axis=1)
#     return train_x, valid_x, text_x

In [5]:
# Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    sel = VarianceThreshold(threshold=threshold)
    sel.fit(train_x)
    train_x = train_x[:, sel.get_support(indices=True)]
    valid_x = valid_x[:, sel.get_support(indices=True)]
    test_x = test_x[:, sel.get_support(indices=True)]
    return train_x, valid_x, test_x

In [6]:
# Remove Random Columns
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    tree: DecisionTreeClassifier = DecisionTreeClassifier(random_state=0)
    tree.fit(train_x, train_y)
    importances = tree.feature_importances_

    # Assume columns with very low importance are "random"
    # This threshold can be adjusted based on domain knowledge
    important_indices = [i for i, imp in enumerate(importances) if imp > importance]
    train_x = train_x[:, important_indices]
    valid_x = valid_x[:, important_indices]
    test_x = test_x[:, important_indices]
    return train_x, valid_x, test_x

In [7]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    # Using ANOVA F-test to select features
    selector = SelectKBest(
        f_classif, k=k
    )  # Change k to select the number of features you want
    selector.fit(train_x, train_y)

    # Get F-values and p-values for each feature
    # f_values = selector.scores_
    # p_values = selector.pvalues_

    # Selecting features (you can use a threshold or select top k features)
    # selected_features = train_x.columns[selector.get_support()]

    # Transforming train_x to include only the selected features
    train_x = selector.transform(train_x)
    valid_x = selector.transform(valid_x)
    test_x = selector.transform(test_x)
    return train_x, valid_x, test_x

In [43]:
prefix = ""

_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x.drop(_test_x.columns[500], axis=1, inplace=True)
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)
_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x.drop(_train_x.columns[500], axis=1, inplace=True)

In [44]:
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=42)

In [45]:
def get_train_and_validation_data():
    split = 400
    train_x, valid_x = _train_x[split:].values, _train_x[:split].values
    train_y, valid_y = _train_y[split:].values, _train_y[:split].values
    return train_x, train_y, valid_x, valid_y

In [46]:
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
train_y = train_y.reshape(-1, 1)
valid_y = valid_y.reshape(-1, 1)
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


In [47]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, _test_x
    )
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 490)


In [48]:
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 490)


In [49]:
if APPLY_REMOVE_RANDOM_FEATURES:
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 40)


In [50]:
if APPLY_ANOVA:
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 10)




In [51]:
# convert to dataframe
# train_x = pd.DataFrame(train_x)
# valid_x = pd.DataFrame(valid_x)

In [52]:
print("train_x.shape: ", train_x.shape)
print("train_y.shape: ", train_y.shape)

train_x.shape:  (1600, 10)
train_y.shape:  (1600, 1)


In [53]:
# label = "class"
# train_y = train_y.rename(columns={0: label})
# valid_y = valid_y.rename(columns={0: label})
# train_data = pd.concat([train_x, train_y[label]], axis=1)

In [40]:
# sanity check
(
    original_train_x,
    original_train_y,
    original_valid_x,
    original_valid_y,
) = get_train_and_validation_data()
for y, original_y in zip([train_y, valid_y], [original_train_y, original_valid_y]):
    assert y.shape == original_y.shape

### manual model

In [41]:
# train_y = train_y.ravel()
# valid_y = valid_y.ravel()

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

base_classifiers_1 = [
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(n_estimators=100, random_state=SEED),
        ),
    ),
    ("svc", make_pipeline(StandardScaler(), SVC(random_state=SEED))),
    ("dt", make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=SEED))),
    (
        "elasticnet",
        make_pipeline(
            StandardScaler(),
            ElasticNet(
                alpha=0.0001, l1_ratio=0.15, max_iter=1000, tol=1e-3, random_state=SEED
            ),
        ),
    ),
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                hidden_layer_sizes=(100, 300, 200, 100),
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=SEED)),
    ),
]

# First Stacking Layer
first_layer = StackingClassifier(
    estimators=base_classifiers_1, final_estimator=LogisticRegression(), cv=5
)

base_classifiers_2 = [
    ("first_layer", first_layer),
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                hidden_layer_sizes=(100, 300, 200, 100),
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=SEED)),
    ),
]


param_distributions = {
    # Parameters for the First Stacking Layer
    "first_layer__rf__randomforestclassifier__n_estimators": randint(50, 200),
    "first_layer__rf__randomforestclassifier__max_depth": randint(15, 50),
    "first_layer__rf__randomforestclassifier__min_samples_split": randint(4, 20),
    "first_layer__rf__randomforestclassifier__min_samples_leaf": randint(4, 20),
    "first_layer__svc__svc__C": uniform(0.1, 10),
    "first_layer__svc__svc__gamma": ["scale", "auto"],
    "first_layer__dt__decisiontreeclassifier__max_depth": randint(15, 50),
    "first_layer__dt__decisiontreeclassifier__min_samples_split": randint(5, 20),
    "first_layer__dt__decisiontreeclassifier__min_samples_leaf": randint(5, 20),
    "first_layer__elasticnet__elasticnet__alpha": uniform(0.0001, 1),
    "first_layer__elasticnet__elasticnet__l1_ratio": uniform(0, 1),
    "first_layer__mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "first_layer__mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "first_layer__mlp__mlpclassifier__hidden_layer_sizes": [
        (50, 40),
        (50, 100, 50),
        (50, 150, 100, 50),
    ],
    # Parameters for the Second Stacking Layer
    "mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "mlp__mlpclassifier__hidden_layer_sizes": [
        (50, 40),
        (50, 100, 50),
        (50, 150, 100, 50),
    ],
    "gbc__gradientboostingclassifier__n_estimators": randint(50, 200),
    "gbc__gradientboostingclassifier__max_depth": randint(15, 50),
    "gbc__gradientboostingclassifier__min_samples_split": randint(4, 20),
    "gbc__gradientboostingclassifier__min_samples_leaf": randint(4, 20),
    # Parameters for the Final Estimator
    "final_estimator__C": uniform(0.01, 10),
}


# Second Stacking Layer
stacked_ensemble_model = StackingClassifier(
    estimators=base_classifiers_2, final_estimator=LogisticRegression(), cv=5
)
# Define the committee of models
committee_models = [
    ("stacked_ensemble", stacked_ensemble_model),
    ("gbc", GradientBoostingClassifier(random_state=SEED)),
    ("rf", RandomForestClassifier(random_state=SEED))
]

# Create the committee model
committee_model = VotingClassifier(committee_models)

# Perform randomized search
random_search = RandomizedSearchCV(
    committee_model,
    param_distributions=param_distributions,
    scoring="balanced_accuracy",
    n_iter=RANDOM_SEARCH_N_ITER,
    cv=5,
    verbose=4,
    random_state=SEED,
    n_jobs=8,
)

random_search.fit(train_x, train_y)

y_pred = random_search.predict(valid_x)

balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")


# random_search = RandomizedSearchCV(
#     stacked_ensemble_model,
#     param_distributions=param_distributions,
#     scoring="balanced_accuracy",
#     n_iter=RANDOM_SEARCH_N_ITER,
#     cv=5,
#     verbose=4,
#     random_state=SEED,
#     n_jobs=8,
# )

# random_search.fit(train_x, train_y)

# y_pred = random_search.predict(valid_x)

# balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

# print(f"Model Balanced Accuracy: {balanced_accuracy}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Model Balanced Accuracy: 0.8572803880873197


In [22]:
print(random_search.best_params_)

{'final_estimator__C': 1.962429877980445, 'first_layer__dt__decisiontreeclassifier__max_depth': 33, 'first_layer__dt__decisiontreeclassifier__min_samples_leaf': 8, 'first_layer__dt__decisiontreeclassifier__min_samples_split': 7, 'first_layer__elasticnet__elasticnet__alpha': 0.768654014306309, 'first_layer__elasticnet__elasticnet__l1_ratio': 0.04360377175443375, 'first_layer__mlp__mlpclassifier__alpha': 0.994650510797341, 'first_layer__mlp__mlpclassifier__hidden_layer_sizes': (50, 100, 50), 'first_layer__mlp__mlpclassifier__learning_rate_init': 0.09639285770025874, 'first_layer__rf__randomforestclassifier__max_depth': 20, 'first_layer__rf__randomforestclassifier__min_samples_leaf': 6, 'first_layer__rf__randomforestclassifier__min_samples_split': 15, 'first_layer__rf__randomforestclassifier__n_estimators': 86, 'first_layer__svc__svc__C': 9.383185625877253, 'first_layer__svc__svc__gamma': 'scale', 'gbc__gradientboostingclassifier__max_depth': 45, 'gbc__gradientboostingclassifier__min_samp

In [23]:
proba = random_search.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.txt")
np.savetxt(output_path, proba[:, 1], delimiter="\n")
joblib.dump(
    random_search, path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_pred.pkl")
)
# random_search.save(path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl"))

['output\\manual\\20240111_191521\\manual_model_pred.pkl']

### Autogluon

In [54]:

print(train_x.shape, train_y.shape)
print(valid_x.shape, valid_y.shape)

(1600, 10) (1600, 1)
(400, 10) (400, 1)


In [55]:
train_data = np.concatenate((train_x, train_y), axis=1)
train_data_pd = pd.DataFrame(train_data)
train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"}, inplace=True)

valid_data = np.concatenate((valid_x, valid_y), axis=1)
valid_data_pd = pd.DataFrame(valid_data)
valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"}, inplace=True)

In [56]:
print(train_data_pd.shape, valid_data_pd.shape)

(1600, 11) (400, 11)


In [57]:
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(
    train_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 1800 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: output\autogluon\20240111_191521/ds_sub_fit/sub_fit_ho.
2024-01-11 20:44:47,880	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Beginning AutoGluon training ... Time limit = 450s
AutoGluon will save models to "output\autogluon\20240111_191521/ds_sub_fit/s

In [58]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.891881,balanced_accuracy,1.303733,93.181199,0.002497,1.990012,3,True,26
1,CatBoost_BAG_L2,0.891248,balanced_accuracy,1.281687,88.950525,0.018195,14.548193,2,True,19
2,LightGBMLarge_BAG_L2,0.890607,balanced_accuracy,1.323258,83.322735,0.059767,8.920403,2,True,25
3,XGBoost_BAG_L2,0.889975,balanced_accuracy,1.30817,79.942836,0.044678,5.540503,2,True,23
4,LightGBM_BAG_L2,0.889366,balanced_accuracy,1.287584,80.495197,0.024093,6.092865,2,True,16
5,LightGBMXT_BAG_L2,0.888107,balanced_accuracy,1.283041,76.642994,0.019549,2.240662,2,True,15
6,WeightedEnsemble_L2,0.887503,balanced_accuracy,1.07643,59.366956,0.006011,1.38451,2,True,14
7,NeuralNetTorch_BAG_L2,0.884334,balanced_accuracy,1.323828,85.430761,0.060336,11.028428,2,True,24
8,NeuralNetFastAI_BAG_L2,0.875031,balanced_accuracy,1.403416,84.852895,0.139925,10.450563,2,True,22
9,ExtraTreesGini_BAG_L2,0.874998,balanced_accuracy,1.436731,75.409857,0.173239,1.007524,2,True,20


In [59]:
predictor.evaluate(valid_data_pd)

{'balanced_accuracy': 0.8722087469680678,
 'accuracy': 0.8725,
 'mcc': 0.7452659076092105,
 'roc_auc': 0.9420869695681529,
 'f1': 0.8682170542635659,
 'precision': 0.8842105263157894,
 'recall': 0.8527918781725888}

In [60]:
proba = predictor.predict_proba(pd.DataFrame(test_x))
output_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "manual_model_pred.txt")
np.savetxt(output_path, proba.values[:, 1], delimiter="\n")
# predictor.save(path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "manual_model.pkl"))

### MLJar

In [89]:
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID),
)

automl.fit(train_x, train_y.ravel())

AutoML directory: output\mljar\20240111_191521
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.742515 trained in 6.15 seconds
Disable stacking for split validation
* Step simple_algorithms will try to check up to 3 models
2_DecisionTree f1 0.618182 trained in 5.17 seconds
3_DecisionTree f1 0.618182 trained in 5.29 seconds
4_Linear f1 0.634146 trained in 6.69 seconds
* Step default_algorithms will try to 



6_Default_Xgboost f1 0.819876 trained in 5.88 seconds
7_Default_CatBoost f1 0.860606 trained in 7.91 seconds
8_Default_NeuralNetwork f1 0.813953 trained in 6.04 seconds
9_Default_RandomForest f1 0.742515 trained in 8.51 seconds
10_Default_ExtraTrees f1 0.722892 trained in 6.19 seconds
11_Default_NearestNeighbors f1 0.783626 trained in 5.71 seconds
* Step not_so_random will try to check up to 61 models
21_LightGBM f1 0.795031 trained in 5.57 seconds




12_Xgboost f1 0.825 trained in 9.71 seconds
30_CatBoost f1 0.831325 trained in 6.02 seconds
39_RandomForest f1 0.773006 trained in 6.47 seconds
48_ExtraTrees f1 0.809816 trained in 6.31 seconds
57_NeuralNetwork f1 0.737968 trained in 5.19 seconds
66_NearestNeighbors f1 0.75 trained in 5.51 seconds
22_LightGBM f1 0.829268 trained in 8.45 seconds




13_Xgboost f1 0.730539 trained in 8.53 seconds
31_CatBoost f1 0.812121 trained in 6.19 seconds
40_RandomForest f1 0.701754 trained in 6.56 seconds
49_ExtraTrees f1 0.708861 trained in 7.71 seconds
58_NeuralNetwork f1 0.745098 trained in 7.16 seconds
67_NearestNeighbors f1 0.773006 trained in 7.06 seconds
23_LightGBM f1 0.840764 trained in 8.25 seconds




14_Xgboost f1 0.807453 trained in 7.96 seconds
32_CatBoost f1 0.822086 trained in 8.27 seconds
41_RandomForest f1 0.686747 trained in 6.25 seconds
50_ExtraTrees f1 0.716763 trained in 7.68 seconds
59_NeuralNetwork f1 0.823529 trained in 6.51 seconds
68_NearestNeighbors f1 0.773006 trained in 5.6 seconds
24_LightGBM f1 0.840764 trained in 7.94 seconds




15_Xgboost f1 0.745342 trained in 7.93 seconds
33_CatBoost f1 0.824242 trained in 7.99 seconds
42_RandomForest f1 0.754717 trained in 5.8 seconds
51_ExtraTrees f1 0.729412 trained in 6.32 seconds
60_NeuralNetwork f1 0.835443 trained in 5.4 seconds
69_NearestNeighbors f1 0.773006 trained in 5.12 seconds
25_LightGBM f1 0.834356 trained in 5.86 seconds




16_Xgboost f1 0.678363 trained in 6.82 seconds
34_CatBoost f1 0.817073 trained in 5.56 seconds
43_RandomForest f1 0.733728 trained in 5.66 seconds
52_ExtraTrees f1 0.72 trained in 5.54 seconds
61_NeuralNetwork f1 0.829268 trained in 5.34 seconds
70_NearestNeighbors f1 0.75 trained in 5.14 seconds
26_LightGBM f1 0.85 trained in 5.91 seconds




17_Xgboost f1 0.772152 trained in 5.64 seconds
35_CatBoost f1 0.824242 trained in 5.69 seconds
44_RandomForest f1 0.792453 trained in 5.92 seconds
53_ExtraTrees f1 0.746988 trained in 6.89 seconds
62_NeuralNetwork f1 0.8125 trained in 5.62 seconds
71_NearestNeighbors f1 0.773006 trained in 5.28 seconds
27_LightGBM f1 0.833333 trained in 6.62 seconds




18_Xgboost f1 0.760736 trained in 6.22 seconds
36_CatBoost f1 0.860606 trained in 5.77 seconds
45_RandomForest f1 0.792453 trained in 5.97 seconds
54_ExtraTrees f1 0.754491 trained in 5.71 seconds
63_NeuralNetwork f1 0.820513 trained in 5.37 seconds
72_NearestNeighbors f1 0.773006 trained in 5.5 seconds
28_LightGBM f1 0.832298 trained in 5.96 seconds




19_Xgboost f1 0.836364 trained in 6.96 seconds
37_CatBoost f1 0.831325 trained in 6.64 seconds
46_RandomForest f1 0.721519 trained in 5.82 seconds
55_ExtraTrees f1 0.690476 trained in 5.86 seconds
64_NeuralNetwork f1 0.825 trained in 6.01 seconds
29_LightGBM f1 0.843373 trained in 5.72 seconds




20_Xgboost f1 0.770186 trained in 6.68 seconds
38_CatBoost f1 0.814371 trained in 5.66 seconds
47_RandomForest f1 0.77707 trained in 5.86 seconds
56_ExtraTrees f1 0.722892 trained in 5.77 seconds
65_NeuralNetwork f1 0.8125 trained in 5.44 seconds
* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: feature_9_multiply_feature_8
Add Golden Feature: feature_4_ratio_feature_1
Add Golden Feature: feature_1_ratio_feature_4
Add Golden Feature: feature_9_sum_feature_5
Add Golden Feature: feature_7_multiply_feature_4
Add Golden Feature: feature_5_sum_feature_4
Add Golden Feature: feature_4_ratio_feature_2
Add Golden Feature: feature_2_ratio_feature_4
Add Golden Feature: feature_10_multiply_feature_9
Add Golden Feature: feature_4_diff_feature_8
Created 10 Golden Features in 13.82 seconds.
36_CatBoost_GoldenFeatures f1 0.86747 trained in 20.09 seconds
7_Default_CatBoost_GoldenFeatures f1 0.826347 trained in 5.67 seconds
26_LightGBM_GoldenFeatures f1 0.853659 trained



36_CatBoost_KMeansFeatures f1 0.802469 trained in 6.21 seconds




7_Default_CatBoost_KMeansFeatures f1 0.807229 trained in 5.79 seconds




26_LightGBM_KMeansFeatures f1 0.817073 trained in 6.07 seconds
* Step insert_random_feature will try to check up to 1 model
36_CatBoost_GoldenFeatures_RandomFeature f1 0.845238 trained in 6.04 seconds
Drop features ['random_feature', 'feature_2_ratio_feature_4']
* Step features_selection will try to check up to 6 models
36_CatBoost_GoldenFeatures_SelectedFeatures f1 0.860606 trained in 6.22 seconds
26_LightGBM_GoldenFeatures_SelectedFeatures f1 0.848101 trained in 6.61 seconds




19_Xgboost_SelectedFeatures f1 0.836364 trained in 7.64 seconds
60_NeuralNetwork_SelectedFeatures f1 0.835443 trained in 6.05 seconds
48_ExtraTrees_SelectedFeatures f1 0.809816 trained in 6.72 seconds
44_RandomForest_SelectedFeatures f1 0.792453 trained in 6.28 seconds
* Step hill_climbing_1 will try to check up to 28 models
73_CatBoost_GoldenFeatures f1 0.864198 trained in 6.74 seconds
74_CatBoost f1 0.8 trained in 5.69 seconds
75_CatBoost f1 0.840237 trained in 7.02 seconds
76_CatBoost_GoldenFeatures_SelectedFeatures f1 0.872727 trained in 6.72 seconds
77_LightGBM_GoldenFeatures f1 0.853659 trained in 6.42 seconds
78_LightGBM_GoldenFeatures f1 0.853659 trained in 6.91 seconds
79_LightGBM f1 0.85 trained in 7.41 seconds
80_LightGBM f1 0.85 trained in 7.29 seconds
81_LightGBM_GoldenFeatures_SelectedFeatures f1 0.848101 trained in 7.87 seconds
82_LightGBM_GoldenFeatures_SelectedFeatures f1 0.848101 trained in 7.87 seconds




83_Xgboost_SelectedFeatures f1 0.792453 trained in 7.09 seconds




84_Xgboost_SelectedFeatures f1 0.834356 trained in 7.64 seconds




85_Xgboost f1 0.792453 trained in 7.35 seconds




86_Xgboost f1 0.834356 trained in 7.83 seconds
87_NeuralNetwork f1 0.825581 trained in 6.71 seconds
88_NeuralNetwork_SelectedFeatures f1 0.825581 trained in 6.71 seconds
89_NeuralNetwork f1 0.768116 trained in 7.02 seconds
* Step hill_climbing_2 will try to check up to 21 models
90_CatBoost_GoldenFeatures_SelectedFeatures f1 0.814815 trained in 7.44 seconds
91_CatBoost_GoldenFeatures f1 0.832298 trained in 7.57 seconds
92_LightGBM_GoldenFeatures f1 0.843373 trained in 6.92 seconds
93_LightGBM_GoldenFeatures f1 0.8625 trained in 8.44 seconds
94_LightGBM_GoldenFeatures f1 0.843373 trained in 7.14 seconds
95_LightGBM_GoldenFeatures f1 0.8625 trained in 8.29 seconds
96_LightGBM_GoldenFeatures f1 0.843373 trained in 7.27 seconds
97_LightGBM_GoldenFeatures f1 0.8625 trained in 8.31 seconds




98_Xgboost f1 0.792899 trained in 8.25 seconds
* Step ensemble will try to check up to 1 model
Ensemble f1 0.89441 trained in 30.66 seconds
AutoML fit time: 943.06 seconds
AutoML best model: Ensemble


In [90]:
print (valid_x.shape, valid_y.shape)
print(train_x.shape, train_y.shape)
predictions = automl.predict(valid_x)
score = balanced_accuracy_score(valid_y, predictions)
print(f"Model Balanced Accuracy: {score}")

(400, 10) (400, 1)
(1600, 10) (1600, 1)
Model Balanced Accuracy: 0.8721337300892701


In [91]:
proba = automl.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
np.savetxt(output_path, proba[:, 1], delimiter="\n")

In [None]:
print(proba)

In [None]:
def ensemble_predict(X, model1, model2):
    # Get predictions from each model
    pred1 = model1.predict_proba(pd.DataFrame(X)).values[:, 1]
    pred2 = model2.predict_proba(X)[:, 1]
    # pred3 = model3.predict_proba(X)[:,1]
    print(pred1, pred2)
    # Average the probabilities for the positive class
    avg_pred = (pred1 + pred2) / 2

    # Convert to binary predictions (you might adjust the threshold as needed)
    # final_pred = [1 if p >= 0.5 else 0 for p in avg_pred]
    return avg_pred


# Example of using the ensemble
final_predictions = ensemble_predict(test_x, predictor, automl)

In [None]:
print(final_predictions)
os.makedirs(path.join("ensamble", UNIQUE_ID), exist_ok=True)
np.savetxt(
    path.join("ensamble", UNIQUE_ID, "123manual_model_pred.txt"), final_predictions, delimiter="\n"
)

### Auto SKLearn

In [None]:
# !pip install auto-sklearn
# !pip install ydata-profiling
# from autosklearn.classification import AutoSklearnClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.metrics import balanced_accuracy

In [None]:
settings = {
    "time_left_for_this_task": TRAIN_TIME_LIMIT_AUTO_SKLEARN,
    "seed": SEED,
    "metric": balanced_accuracy,
    "n_jobs": -1,
}

In [None]:
askl2 = AutoSklearn2Classifier(**settings)
askl2.fit(train_x, train_y)

In [None]:
leaderboard = askl2.leaderboard(sort_by="model_id", ensemble_only=True)
print(leaderboard)

In [None]:
predictions = askl2.predict(valid_x)
balanced_accuracy_score(valid_y, predictions)

In [None]:
proba = askl2.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.txt")
np.savetxt(output_path, proba, delimiter="\n")
askl2.save(path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.pkl"))