In [1]:
# !pip install mljar-supervised

In [2]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, mutual_info_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML  # mljar-supervised

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Constants

SEED = 42
N_JOBS = -1
RANDOM_SEARCH_N_ITER = 80
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 60 * 1
TRAIN_TIME_LIMIT_MLJAR = 60 * 60 * 4
TRAIN_TIME_LIMIT_AUTO_SKLEARN = 60 * 30
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
OUTPUT_DIR_AUTO_SKLEARN = path.join("output", "auto_sklearn")
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")
APPLY_REMOVE_LOW_VARIANCE_FEATURES = True
APPLY_REMOVE_CORRELATED_FEATURES = True
APPLY_REMOVE_RANDOM_FEATURES = False
APPLY_ANOVA = True
ANOVE_FEATURES = 20

In [4]:
# prepare output directories
for output_dir in [
    OUTPUT_DIR_MANUAL,
    OUTPUT_DIR_AUTOGLUON,
    OUTPUT_DIR_MLJAR,
    OUTPUT_DIR_AUTO_SKLEARN,
]:
    if not path.exists(path.join(output_dir, UNIQUE_ID)):
        print(f"Creating output directory {path.join(output_dir, UNIQUE_ID)}")
        os.makedirs(path.join(output_dir, UNIQUE_ID))

Creating output directory output\manual\20240114_135717
Creating output directory output\autogluon\20240114_135717
Creating output directory output\mljar\20240114_135717
Creating output directory output\auto_sklearn\20240114_135717


In [5]:
def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    # Calculate correlation matrix
    corr_matrix = np.corrcoef(train_x, rowvar=False)
    # Select upper triangle of correlation matrix
    upper = np.triu(corr_matrix, k=1)
    # Find indices of feature columns with correlation greater than threshold
    to_drop = [i for i in range(upper.shape[1]) if any(upper[:, i] > threshold)]

    # Drop features from train, validation, and test set
    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x

In [6]:
# Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    sel = VarianceThreshold(threshold=threshold)
    sel.fit(train_x)
    train_x = train_x[:, sel.get_support(indices=True)]
    valid_x = valid_x[:, sel.get_support(indices=True)]
    test_x = test_x[:, sel.get_support(indices=True)]
    return train_x, valid_x, test_x

In [7]:
# Remove Random Columns
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    tree: DecisionTreeClassifier = DecisionTreeClassifier(random_state=0)
    tree.fit(train_x, train_y)
    importances = tree.feature_importances_

    # Assume columns with very low importance are "random"
    # This threshold can be adjusted based on domain knowledge
    important_indices = [i for i, imp in enumerate(importances) if imp > importance]
    train_x = train_x[:, important_indices]
    valid_x = valid_x[:, important_indices]
    test_x = test_x[:, important_indices]
    return train_x, valid_x, test_x

In [8]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(train_x, train_y)

    train_x = selector.transform(train_x)
    valid_x = selector.transform(valid_x)
    test_x = selector.transform(test_x)
    return train_x, valid_x, test_x

In [9]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV


def perform_feature_selection(
    train_x, train_y, valid_x, test_x, n_features_to_select=None
):
    estimator_et = ExtraTreesClassifier()
    rfe_et = RFE(estimator=estimator_et, n_features_to_select=n_features_to_select)
    rfe_et.fit(train_x, train_y)
    train_x = train_x[:,rfe_et.support_]
    valid_x = valid_x[:,rfe_et.support_]
    test_x = test_x[:,rfe_et.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    estimator_rf = RandomForestClassifier()
    rfe_rf = RFE(estimator=estimator_rf, n_features_to_select=n_features_to_select)
    rfe_rf.fit(train_x, train_y)
    train_x = train_x[:,rfe_rf.support_]
    valid_x = valid_x[:,rfe_rf.support_]
    test_x = test_x[:,rfe_rf.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    rfecv_et = RFECV(estimator=estimator_et)
    rfecv_et.fit(train_x, train_y)
    train_x = train_x[:,rfecv_et.support_]
    valid_x = valid_x[:,rfecv_et.support_]
    test_x = test_x[:,rfecv_et.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    rfecv_rf = RFECV(estimator=estimator_rf)
    rfecv_rf.fit(train_x, train_y)
    train_x = train_x[:,rfecv_rf.support_]
    valid_x = valid_x[:,rfecv_rf.support_]
    test_x = test_x[:,rfecv_rf.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    return train_x, train_y, valid_x, test_x

In [10]:
def dump_proba(model, test_x, output_path_proba):
    proba = model.predict_proba(test_x)

    if isinstance(proba, pd.DataFrame):
        proba = proba.values

    np.savetxt(
        output_path_proba,
        proba[:, 1],
        delimiter="\n",
        header='"313201_313212"',
        comments="",
        # fmt="%.19f",
    )

In [11]:
def dump_model(model, output_path_model):
    joblib.dump(model, output_path_model)

In [12]:
prefix = ""

_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x.drop(_test_x.columns[500], axis=1, inplace=True)
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)
_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x.drop(_train_x.columns[500], axis=1, inplace=True)

In [13]:
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=42)

In [14]:
def get_train_and_validation_data():
    split = 400
    train_x, valid_x = _train_x[split:].values, _train_x[:split].values
    train_y, valid_y = _train_y[split:].values, _train_y[:split].values
    return train_x, train_y, valid_x, valid_y

In [15]:
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
train_y = train_y.reshape(-1, 1)
valid_y = valid_y.reshape(-1, 1)
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


In [16]:
train_x, valid_x, test_x = perform_feature_selection(
    train_x, train_y.copy().ravel(), valid_x, _test_x, n_features_to_select=480
)
print(train_x.shape, valid_x.shape, test_x.shape)

InvalidIndexError: (slice(None, None, None), array([ True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True]))

In [None]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, _test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [None]:
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [None]:
if APPLY_REMOVE_RANDOM_FEATURES:
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [None]:
if APPLY_ANOVA:
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [None]:
print("train_x.shape: ", train_x.shape)
print("train_y.shape: ", train_y.shape)
print("valid_x.shape: ", valid_x.shape)
print("valid_y.shape: ", valid_y.shape)
print("test_x.shape: ", test_x.shape)

In [None]:
# sanity check
(
    original_train_x,
    original_train_y,
    original_valid_x,
    original_valid_y,
) = get_train_and_validation_data()
for y, original_y in zip([train_y, valid_y], [original_train_y, original_valid_y]):
    assert y.shape == original_y.shape

### manual model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import RandomizedSearchCV

base_classifiers_1 = [
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
                early_stopping=True,
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(
                random_state=SEED,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
]

base_classifiers_2 = [
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
                early_stopping=True,
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
]

param_distributions = {
    # Parameters for the first Stacking Layer
    "stacked_ensemble_1__mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "stacked_ensemble_1__mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "stacked_ensemble_1__mlp__mlpclassifier__hidden_layer_sizes": [
        (70, 90),
        (70, 80, 90),
        (70, 100, 80, 90),
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__n_estimators": [
        32,
        64,
        100,
        200,
        400,
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__max_depth": [
        10,
        20,
        30,
        40,
        50,
        60,
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__min_samples_split": [
        4,
        8,
        12,
        16,
        20,
    ],
    "stacked_ensemble_1__gbc__gradientboostingclassifier__min_samples_leaf": [
        2,
        4,
        6,
        8,
        10,
    ],
    "stacked_ensemble_1__rf__randomforestclassifier__n_estimators": [
        200,
        400,
        600,
        800,
        1000,
        1200,
    ],
    "stacked_ensemble_1__rf__randomforestclassifier__max_depth": [
        10,
        20,
        30,
        40,
        50,
        60,
        70,
        80,
        90,
        100,
    ],
    "stacked_ensemble_1__rf__randomforestclassifier__min_samples_split": [2, 3, 4, 5],
    "stacked_ensemble_1__rf__randomforestclassifier__min_samples_leaf": [4, 6, 10],
    # Parameters for the Second Stacking Layer
    "stacked_ensemble_2__mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "stacked_ensemble_2__mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "stacked_ensemble_2__mlp__mlpclassifier__hidden_layer_sizes": [
        (50, 100),
        (50, 100, 100),
        (50, 150, 100, 100),
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__n_estimators": [
        32,
        64,
        100,
        200,
        400,
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__max_depth": [
        10,
        20,
        30,
        40,
        50,
        60,
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__min_samples_split": [
        4,
        8,
        12,
        16,
        20,
    ],
    "stacked_ensemble_2__gbc__gradientboostingclassifier__min_samples_leaf": [
        2,
        4,
        6,
        8,
        10,
    ],
    # Parameters for the Final Estimator
    "stacked_ensemble_1__final_estimator__C": uniform(0.01, 10),
    "stacked_ensemble_2__final_estimator__C": uniform(0.01, 10),
    # Parameters for the Committee
    "gbc__n_estimators": [32, 64, 100, 200, 400],
    "gbc__max_depth": [10, 20, 30, 40, 50, 60],
    "gbc__min_samples_split": [4, 8, 12, 16, 20],
    "gbc__min_samples_leaf": [2, 4, 6, 8, 10],
    "rf__n_estimators": [200, 400, 600, 800, 1000, 1200],
    "rf__max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    "rf__min_samples_split": [2, 3, 4, 5],
    "rf__min_samples_leaf": [4, 6, 10],
}

# First Stacking Layer
stacked_ensamble_1 = StackingClassifier(
    estimators=base_classifiers_1, final_estimator=LogisticRegression(), cv=5
)

# Second Stacking Layer
stacked_ensamble_2 = StackingClassifier(
    estimators=base_classifiers_2, final_estimator=LogisticRegression(), cv=5
)
# Define the committee of models
committee_models = [
    ("stacked_ensemble_1", stacked_ensamble_1),
    ("stacked_ensemble_2", stacked_ensamble_2),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(
                random_state=SEED,
                n_estimators=1000,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
                early_stopping=True,
            ),
        ),
    ),
]

# Create the committee model
committee_model = VotingClassifier(committee_models, voting="soft")
committee_model.fit(train_x, train_y.ravel())
# print(committee_model)
# Perform randomized search
# random_search = RandomizedSearchCV(
#     committee_model,
#     param_distributions=param_distributions,
#     scoring="balanced_accuracy",
#     n_iter=1,
#     cv=5,
#     verbose=4,
#     random_state=SEED,
#     n_jobs=8,
# )

# random_search.fit(train_x, train_y.ravel())

In [None]:
y_pred = committee_model.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")

In [None]:
print(committee_model.best_params_)

In [None]:
output_path_proba = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_proba.txt")
output_path_model = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl")
dump_proba(committee_model, test_x, output_path_proba)
dump_model(committee_model, output_path_model)

### Autogluon

In [None]:
train_data = np.concatenate((train_x, train_y), axis=1)
train_data_pd = pd.DataFrame(train_data)
train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"}, inplace=True)

valid_data = np.concatenate((valid_x, valid_y), axis=1)
valid_data_pd = pd.DataFrame(data=valid_data)
valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"}, inplace=True)

In [None]:
print(train_data_pd.shape, valid_data_pd.shape)

In [None]:
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(
    train_data=train_data_pd,
    # tuning_data=valid_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    fit_weighted_ensemble=True,
    fit_full_last_level_weighted_ensemble=True,
    full_weighted_ensemble_additionally=True,
    num_bag_folds=4,
    num_bag_sets=20,
    num_stack_levels=2,
    hyperparameter_tune_kwargs={"num_trials": 1, "searcher": "auto"},
)

In [None]:
predictor.leaderboard()

In [None]:
predictor.evaluate(valid_data_pd)

In [None]:
output_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "autogluon_model_pred.txt")
dump_proba(predictor, pd.DataFrame(test_x_pd), output_path_proba)

### MLJar

In [None]:
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID),
)

automl.fit(train_x, train_y.ravel())

In [None]:
print(valid_x.shape, valid_y.shape)
print(train_x.shape, train_y.shape)
predictions = automl.predict(valid_x)
score = balanced_accuracy_score(valid_y, predictions)
print(f"Model Balanced Accuracy: {score}")

In [None]:
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
dump_proba(automl, test_x, output_path_proba)

In [None]:
def ensemble_predict(X, model1, model2):
    pred1 = model1.predict_proba(pd.DataFrame(X)).values[:, 1]
    pred2 = model2.predict_proba(X)[:, 1]
    print(pred1, pred2)
    avg_pred = (pred1 + pred2) / 2

    return avg_pred


final_predictions = ensemble_predict(test_x, predictor, automl)

In [None]:
os.makedirs(path.join("ensamble", UNIQUE_ID), exist_ok=True)

np.savetxt(
    path.join("ensamble", UNIQUE_ID, "123manual_model_pred.txt"),
    final_predictions,
    delimiter="\n",
    comments="",
    header='"313201"',
)

### Auto SKLearn TODO OUT

In [None]:
# !pip install auto-sklearn
# !pip install ydata-profiling
# from autosklearn.classification import AutoSklearnClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.metrics import balanced_accuracy

In [None]:
settings = {
    "time_left_for_this_task": TRAIN_TIME_LIMIT_AUTO_SKLEARN,
    "seed": SEED,
    "metric": balanced_accuracy,
    "n_jobs": -1,
}

In [None]:
askl2 = AutoSklearn2Classifier(**settings)
askl2.fit(train_x, train_y)

In [None]:
leaderboard = askl2.leaderboard(sort_by="model_id", ensemble_only=True)
print(leaderboard)

In [None]:
predictions = askl2.predict(valid_x)
balanced_accuracy_score(valid_y, predictions)

In [None]:
proba = askl2.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.txt")
np.savetxt(output_path, proba, delimiter="\n")
askl2.save(path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.pkl"))