### Imports

In [1]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from catboost import CatBoostClassifier
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import (
    RFE,
    RFECV,
    SelectKBest,
    VarianceThreshold,
    f_classif,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML
from xgboost import XGBClassifier

  from .autonotebook import tqdm as notebook_tqdm


### Constants

In [2]:
# Seed passed as random_state / random_seed to the estimators and to the MLJAR run below.
SEED = 42
# NOTE(review): not referenced in the visible cells — presumably intended as an n_jobs value; confirm.
N_JOBS = -1
# Time budgets handed to the AutoGluon and MLJAR fits (the expressions evaluate to 60).
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 1 * 1
TRAIN_TIME_LIMIT_MLJAR = 60 * 1 * 1
# Root output folders, one per modelling approach.
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
# Timestamp captured once per kernel session; used as the per-run sub-folder name.
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")
# Switches for the optional feature-selection steps run after the data is loaded.
APPLY_REMOVE_LOW_VARIANCE_FEATURES = False
APPLY_REMOVE_CORRELATED_FEATURES = False
APPLY_REMOVE_RANDOM_FEATURES = False
# NOTE: "RECURSEIVE" is a misspelling of "RECURSIVE"; the name is kept because later cells reference it.
APPLY_RECURSEIVE_FEATURE_ELIMINATION = True
APPLY_ANOVA = False
# Value passed as k to the ANOVA filter ("ANOVE" is a misspelling kept because a later cell references it).
ANOVE_FEATURES = 25

### Make sure the output directories exist

In [3]:
# Create the timestamped run folder under every output root that does not have one yet.
for output_dir in [OUTPUT_DIR_MANUAL, OUTPUT_DIR_AUTOGLUON, OUTPUT_DIR_MLJAR]:
    run_dir = path.join(output_dir, UNIQUE_ID)
    if path.exists(run_dir):
        continue
    print(f"Creating output directory {run_dir}")
    os.makedirs(run_dir)

Creating output directory output\manual\20240115_225836
Creating output directory output\autogluon\20240115_225836
Creating output directory output\mljar\20240115_225836


### Define utility functions

In [4]:
def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    """Drop features that are near-duplicates of an earlier feature.

    Pearson correlations are computed on ``train_x`` only. A column is dropped
    when the *absolute* correlation with any lower-indexed column exceeds
    ``threshold``, so strongly negatively correlated features are treated as
    redundant too. The same columns are removed from all three matrices.

    Args:
        train_x: 2-D array (samples x features) used to compute correlations.
        valid_x: 2-D array with the same columns as ``train_x``.
        test_x: 2-D array with the same columns as ``train_x``.
        threshold: Absolute correlation above which the later column is dropped.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` with the redundant columns removed.
    """
    # np.corrcoef collapses to a 0-d value for a single feature; keep it 2-D
    # so np.triu and the column-wise reduction below still work.
    corr_matrix = np.atleast_2d(np.corrcoef(train_x, rowvar=False))

    # Strict upper triangle: each column is only compared with earlier columns,
    # which guarantees the first member of every correlated group survives.
    upper = np.triu(np.abs(corr_matrix), k=1)
    to_drop = np.flatnonzero((upper > threshold).any(axis=0))

    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x

In [5]:
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    """Keep only the columns whose training variance passes ``VarianceThreshold``.

    The selector is fitted on ``train_x`` alone; the resulting column indices
    are then applied unchanged to the validation and test matrices.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` restricted to the kept columns.
    """
    selector = VarianceThreshold(threshold=threshold).fit(train_x)
    kept_columns = selector.get_support(indices=True)
    return tuple(split[:, kept_columns] for split in (train_x, valid_x, test_x))

In [6]:
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    """Keep the features a single decision tree rates above ``importance``.

    A ``DecisionTreeClassifier(random_state=0)`` is fitted on the training
    split; columns whose ``feature_importances_`` value is strictly greater
    than ``importance`` are retained in all three matrices.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` restricted to the retained columns.
    """
    fitted_tree = DecisionTreeClassifier(random_state=0).fit(train_x, train_y)
    keep = np.flatnonzero(fitted_tree.feature_importances_ > importance)
    return train_x[:, keep], valid_x[:, keep], test_x[:, keep]

In [7]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    """Select ``k`` features with ``SelectKBest`` scored by ``f_classif``.

    The selector is fitted on the training split only and then used to
    transform the training, validation and test matrices.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` after the transform.
    """
    k_best = SelectKBest(f_classif, k=k).fit(train_x, train_y)
    reduced_train = k_best.transform(train_x)
    reduced_valid = k_best.transform(valid_x)
    reduced_test = k_best.transform(test_x)
    return reduced_train, reduced_valid, reduced_test

In [8]:
def dump_proba(model, test_x, output_path_proba):
    """Write the second ``predict_proba`` column for ``test_x`` to a text file.

    The file starts with the header line ``"313201_313212"`` (quotes included)
    followed by one value per line.

    Args:
        model: Any object exposing ``predict_proba``.
        test_x: Features forwarded to ``model.predict_proba``.
        output_path_proba: Destination path of the text file.
    """
    predicted = model.predict_proba(test_x)
    # predict_proba may hand back a DataFrame; positional slicing needs the raw array.
    scores = predicted.values if isinstance(predicted, pd.DataFrame) else predicted
    second_column = scores[:, 1]

    np.savetxt(
        output_path_proba,
        second_column,
        delimiter="\n",
        header='"313201_313212"',
        comments="",
    )

In [9]:
def dump_model(model, output_path_model):
    """Persist ``model`` to ``output_path_model`` using ``joblib.dump``."""
    joblib.dump(value=model, filename=output_path_model)

In [10]:
def perform_feature_selection(train_x, train_y, valid_x, test_x):
    """Shrink the feature set with four successive recursive-elimination passes.

    The passes run in this order, each fitted on the (already reduced)
    training split only:

    1. ``RFE`` with extra trees, keeping 250 features.
    2. ``RFE`` with a random forest, keeping 125 features.
    3. ``RFECV`` with extra trees (``cv=3``, at least 25 features).
    4. ``RFECV`` with a random forest (``cv=3``, at least 15 features).

    Side effects:
        Prints the three matrix shapes after every pass and writes the
        surviving column indices to ``output/<UNIQUE_ID>/selected_features.txt``.
        The indices refer to columns of the ``train_x`` passed in, so the file
        can be used to re-select the same features from the raw data.

    Args:
        train_x: 2-D training features.
        train_y: 1-D training labels.
        valid_x: 2-D validation features with the same columns as ``train_x``.
        test_x: 2-D test features with the same columns as ``train_x``.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` restricted to the selected columns.
    """
    estimator_et = ExtraTreesClassifier(random_state=0)
    estimator_rf = RandomForestClassifier(random_state=0)
    selectors = [
        RFE(estimator=estimator_et, n_features_to_select=250),
        RFE(estimator=estimator_rf, n_features_to_select=125),
        RFECV(estimator=estimator_et, cv=3, min_features_to_select=25),
        RFECV(estimator=estimator_rf, cv=3, min_features_to_select=15),
    ]

    # Each support_ mask is relative to the matrix that pass was fitted on, so
    # the original column positions have to be carried through every pass.
    selected_columns = np.arange(train_x.shape[1])

    for selector in selectors:
        selector.fit(train_x, train_y)
        keep = selector.support_
        train_x = train_x[:, keep]
        valid_x = valid_x[:, keep]
        test_x = test_x[:, keep]
        selected_columns = selected_columns[keep]
        print(train_x.shape, valid_x.shape, test_x.shape)

    os.makedirs(path.join("output", UNIQUE_ID), exist_ok=True)

    np.savetxt(
        path.join("output", UNIQUE_ID, "selected_features.txt"),
        selected_columns,
        fmt="%d",
    )

    return train_x, valid_x, test_x

### Load data

In [11]:
prefix = ""

# Read the labels and both feature tables; the column at position 500 is
# discarded from each feature table.
# NOTE(review): presumably that column is an empty artefact of a trailing separator — confirm.
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)

_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x = _train_x.drop(columns=_train_x.columns[500])

_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x = _test_x.drop(columns=_test_x.columns[500])

In [12]:
# Convert the three loaded tables into independent float arrays.
_test_x, _train_x, _train_y = (
    np.array(table, dtype=float, copy=True)
    for table in (_test_x, _train_x, _train_y)
)

In [13]:
# Shuffle features and labels together; the notebook-wide SEED (42) replaces the
# duplicated literal so the seed is configured in a single place.
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=SEED)

In [14]:
def get_train_and_validation_data(split: int = 400):
    """Split the module-level ``_train_x`` / ``_train_y`` into train and validation parts.

    The first ``split`` rows become the validation set and the remaining rows
    the training set. Copies are returned, so later in-place changes do not
    touch the module-level arrays.

    Args:
        split: Number of leading rows reserved for validation (default 400,
            the value previously hard-coded here).

    Returns:
        Tuple ``(train_x, train_y, valid_x, valid_y)``.
    """
    valid_x, train_x = _train_x[:split].copy(), _train_x[split:].copy()
    valid_y, train_y = _train_y[:split].copy(), _train_y[split:].copy()
    return train_x, train_y, valid_x, valid_y

In [15]:
# Materialise the train/validation split as notebook-level variables and show the shapes.
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


### Perform feature selection

In [16]:
if APPLY_RECURSEIVE_FEATURE_ELIMINATION:
    # Labels are flattened to 1-D for the scikit-learn selectors.
    train_x, valid_x, test_x = perform_feature_selection(
        train_x, train_y.copy().ravel(), valid_x, _test_x
    )
    print(train_x.shape, valid_x.shape, test_x.shape)
else:
    # Later cells read `test_x` unconditionally; without this branch they raise
    # NameError on a fresh kernel whenever recursive elimination is switched off.
    test_x = _test_x.copy()

(1600, 250) (400, 250) (600, 250)
(1600, 125) (400, 125) (600, 125)
(1600, 25) (400, 25) (600, 25)
(1600, 20) (400, 20) (600, 20)
(1600, 20) (400, 20) (600, 20)


In [17]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    # When recursive elimination ran first, train_x/valid_x are already reduced
    # and only `test_x` has matching columns; passing the raw `_test_x` would
    # delete misaligned columns from the test matrix.
    test_features = test_x if APPLY_RECURSEIVE_FEATURE_ELIMINATION else _test_x
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, test_features
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [18]:
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    # Optional step: variance filter fitted on the training split.
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    for _label, _split in (
        ("train_x", train_x),
        ("valid_x", valid_x),
        ("test_x", test_x),
    ):
        print(f"{_label}.shape: ", _split.shape)

In [19]:
if APPLY_REMOVE_RANDOM_FEATURES:
    # Optional step: keep only features a decision tree finds important.
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    for _label, _split in (
        ("train_x", train_x),
        ("valid_x", valid_x),
        ("test_x", test_x),
    ):
        print(f"{_label}.shape: ", _split.shape)

In [20]:
if APPLY_ANOVA:
    # train_y is kept as an (n, 1) column in this notebook; flatten it here, as
    # the other fit calls do, so f_classif receives the 1-D labels it expects.
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y.ravel(),
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [21]:
# Final shapes after all enabled feature-selection steps.
for _label, _split in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("valid_x", valid_x),
    ("valid_y", valid_y),
    ("test_x", test_x),
):
    print(f"{_label}.shape: ", _split.shape)

train_x.shape:  (1600, 20)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 20)
valid_y.shape:  (400, 1)
test_x.shape:  (600, 20)


### Train manual model

In [22]:
def _scaled(estimator):
    """Return a pipeline that standardizes the features before ``estimator``."""
    return make_pipeline(StandardScaler(), estimator)


def _make_mlp():
    """Build a fresh scaled MLP pipeline (used by both stacks and the committee)."""
    # NOTE(review): scikit-learn documents early_stopping as effective only for
    # the "sgd"/"adam" solvers, so it is presumably a no-op with "lbfgs" — confirm.
    return _scaled(
        MLPClassifier(
            random_state=SEED,
            max_iter=1000,
            early_stopping=True,
            tol=1e-3,
            solver="lbfgs",
            hidden_layer_sizes=(100, 300, 200, 100),
            alpha=0.001,
        )
    )


def _make_rf():
    """Build a fresh scaled random-forest pipeline (used by stack 1 and the committee)."""
    return _scaled(
        RandomForestClassifier(
            random_state=SEED,
            n_estimators=500,
            max_depth=30,
            min_samples_leaf=4,
            min_samples_split=2,
        )
    )


def _make_gbc():
    """Build a fresh scaled gradient-boosting pipeline (used by stack 2 and the committee)."""
    return _scaled(
        GradientBoostingClassifier(
            random_state=SEED,
            max_features=None,
            n_estimators=500,
            max_depth=30,
            min_samples_leaf=2,
            min_samples_split=5,
        )
    )


# Every list entry gets its own estimator instance, exactly as when the
# definitions were written out one by one.
base_classifiers_1 = [("mlp", _make_mlp()), ("rf", _make_rf())]
base_classifiers_2 = [("mlp", _make_mlp()), ("gbc", _make_gbc())]

stacked_ensamble_1 = StackingClassifier(
    estimators=base_classifiers_1, final_estimator=LogisticRegression(), cv=5
)

stacked_ensamble_2 = StackingClassifier(
    estimators=base_classifiers_2, final_estimator=LogisticRegression(), cv=5
)

committee_models = [
    ("stacked_ensemble_1", stacked_ensamble_1),
    ("stacked_ensemble_2", stacked_ensamble_2),
    ("gbc", _make_gbc()),
    (
        "et",
        _scaled(
            ExtraTreesClassifier(
                random_state=SEED,
                n_estimators=500,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            )
        ),
    ),
    ("rf", _make_rf()),
    ("mlp", _make_mlp()),
    (
        "cb",
        _scaled(
            CatBoostClassifier(
                iterations=500,
                learning_rate=0.03,
                depth=6,
                l2_leaf_reg=3,
                border_count=32,
                cat_features=None,
                loss_function="Logloss",
                eval_metric="Accuracy",
                random_seed=SEED,
                early_stopping_rounds=50,
                verbose=100,
            )
        ),
    ),
    (
        "xgb",
        _scaled(
            XGBClassifier(
                random_state=SEED,
                use_label_encoder=False,
                eval_metric=balanced_accuracy_score,
                n_estimators=500,
                learning_rate=0.02,
                max_depth=6,
                min_child_weight=1,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0,
                reg_alpha=0.1,
                reg_lambda=1.0,
                scale_pos_weight=1,
            )
        ),
    ),
]

# Soft voting averages the members' predicted class probabilities.
committee_model = VotingClassifier(committee_models, voting="soft")
committee_model.fit(train_x.copy(), train_y.copy().ravel())

0:	learn: 0.7775000	total: 141ms	remaining: 1m 10s
100:	learn: 0.9162500	total: 336ms	remaining: 1.33s
200:	learn: 0.9418750	total: 582ms	remaining: 866ms
300:	learn: 0.9631250	total: 837ms	remaining: 553ms
400:	learn: 0.9775000	total: 1.11s	remaining: 274ms
499:	learn: 0.9862500	total: 1.37s	remaining: 0us


In [25]:
# Score on the same data the committee was fitted on, so this figure is optimistic
# and only useful as a sanity check.
y_pred = committee_model.predict(train_x)
balanced_accuracy = balanced_accuracy_score(train_y, y_pred)
print("Committee Model Score:", balanced_accuracy)


Committee Model Score: 1.0


In [26]:
# Balanced accuracy on the validation split, which was not passed to committee_model.fit.
y_pred = committee_model.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")

Model Balanced Accuracy: 0.8999774949363607


In [None]:
# Persist the committee's test-set probabilities and the fitted model itself
# inside this run's folder under the manual output root.
output_path_proba = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_proba.txt")
output_path_model = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl")
dump_proba(committee_model, test_x, output_path_proba)
dump_model(committee_model, output_path_model)

### Train model with AutoGluon

In [None]:
# Append the label column to the features and expose it under the name "class",
# which is the label AutoGluon is told to predict.
train_data = np.concatenate((train_x.copy(), train_y.copy()), axis=1)
valid_data = np.concatenate((valid_x.copy(), valid_y.copy()), axis=1)

train_data_pd = pd.DataFrame(train_data, copy=True)
train_data_pd = train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"})

valid_data_pd = pd.DataFrame(data=valid_data, copy=True)
valid_data_pd = valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"})

print(train_data_pd.shape, valid_data_pd.shape)

In [None]:
# Fit an AutoGluon predictor on the training frame only; artefacts are written
# to this run's folder under the AutoGluon output root.
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(
    train_data=train_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
    fit_weighted_ensemble=True,
    fit_full_last_level_weighted_ensemble=True,
    full_weighted_ensemble_additionally=True,
    # num_bag_folds=15,
    # num_bag_sets=25,
    # NOTE(review): num_stack_levels is set together with auto_stack and
    # dynamic_stacking — confirm in the AutoGluon docs which of them takes precedence.
    num_stack_levels=3,
    auto_stack=True,
    dynamic_stacking=True,
    feature_generator="auto",
    # hyperparameter_tune_kwargs={
    #     "scheduler": "local",
    #     "searcher": "auto",
    #     "time_out": 1200,
    #     "num_trials": 30,
    # },
)

In [None]:
# Display the predictor's leaderboard (no data is passed to the call).
predictor.leaderboard()

In [None]:
# Evaluate the fitted predictor on the validation frame, which was not passed to fit().
predictor.evaluate(valid_data_pd)

In [None]:
# Write the AutoGluon test-set probabilities; the test matrix is wrapped in a
# DataFrame so its default integer column labels line up with the training frame.
output_path_proba = path.join(
    OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "autogluon_model_proba.txt"
)
dump_proba(predictor, pd.DataFrame(test_x), output_path_proba)

### Train model with MLJAR

In [None]:
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "tmp"),
)
# Flatten the labels under a separate name: rebinding the global `train_y` to a
# 1-D array made the AutoGluon data-preparation cell (which concatenates
# train_y column-wise) fail when re-run after this cell.
train_y_flat = train_y.copy().reshape(-1)
print(train_y_flat)
automl.fit(train_x.copy(), train_y_flat)

In [None]:
print(valid_x.shape, valid_y.shape)
print(train_x.shape, train_y.shape)
# Predict class labels from the validation *features*. The earlier version
# passed the labels (valid_y) to predict_proba and then handed probabilities to
# balanced_accuracy_score, which requires class labels.
predictions = automl.predict(valid_x.copy())

score = balanced_accuracy_score(valid_y.ravel(), predictions)

print(f"Model Balanced Accuracy: {score}")

In [None]:
# Write the MLJAR test-set probabilities into this run's folder under the MLJAR output root.
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
dump_proba(automl, test_x, output_path)