In [1]:
# !pip install mljar-supervised

In [2]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, mutual_info_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML  # mljar-supervised

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Constants

# --- Reproducibility / search settings ---
SEED = 42
N_JOBS = -1
RANDOM_SEARCH_N_ITER = 10

# --- Training budget per AutoML framework, in seconds ---
TRAIN_TIME_LIMIT_AUTOGLUON = 1 * 60
TRAIN_TIME_LIMIT_MLJAR = 1 * 60
TRAIN_TIME_LIMIT_AUTO_SKLEARN = 1 * 60

# --- One output folder per approach; each run writes to a UNIQUE_ID sub-folder ---
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
OUTPUT_DIR_AUTO_SKLEARN = path.join("output", "auto_sklearn")
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")  # timestamp taken when this cell runs

# --- Feature-selection steps to apply before modelling ---
APPLY_REMOVE_LOW_VARIANCE_FEATURES = True
APPLY_REMOVE_CORRELATED_FEATURES = True
APPLY_REMOVE_RANDOM_FEATURES = True
APPLY_ANOVA = True

In [4]:
# prepare output directories
for output_dir in (
    OUTPUT_DIR_MANUAL,
    OUTPUT_DIR_AUTOGLUON,
    OUTPUT_DIR_MLJAR,
    OUTPUT_DIR_AUTO_SKLEARN,
):
    # Each run gets its own timestamped sub-folder below the approach folder.
    run_dir = path.join(output_dir, UNIQUE_ID)
    if path.exists(run_dir):
        continue
    print(f"Creating output directory {run_dir}")
    os.makedirs(run_dir)

Creating output directory output\manual\20240110_170735
Creating output directory output\autogluon\20240110_170735
Creating output directory output\mljar\20240110_170735
Creating output directory output\auto_sklearn\20240110_170735


In [5]:
# Remove Highly Correlated Columns
# def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
#     corr_matrix = np.corrcoef(train_x, rowvar=False)
#     upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
#     to_drop = np.where(np.abs(corr_matrix[upper]) > threshold)[0]
#     print(to_drop)
#     train_x = np.delete(train_x, to_drop, axis=1)
#     valid_x = np.delete(valid_x, to_drop, axis=1)
#     test_x = np.delete(test_x, to_drop, axis=1)
#     return train_x, valid_x, test_x

def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    """Drop every feature that is almost collinear with an earlier feature.

    The Pearson correlation matrix is estimated on ``train_x`` only. A column is
    dropped when the absolute correlation with any column to its left exceeds
    ``threshold``, so strongly negatively correlated features are removed too.
    The same column indices are then removed from all three matrices.

    Args:
        train_x: 2-D array-like (samples x features) used to estimate correlations.
        valid_x: 2-D array-like with the same columns as ``train_x``.
        test_x: 2-D array-like with the same columns as ``train_x``.
        threshold: Absolute correlation above which the later column is dropped.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` of arrays without the dropped columns.
    """
    # np.corrcoef collapses to a scalar for a single feature; keep it 2-D so
    # the triangle extraction below also works in that case.
    corr_matrix = np.atleast_2d(np.corrcoef(train_x, rowvar=False))
    # Strict upper triangle of |corr|: entry (i, j) with i < j compares column j
    # against the earlier column i. NaN entries (constant columns) compare False.
    upper = np.triu(np.abs(corr_matrix), k=1)
    to_drop = np.flatnonzero((upper > threshold).any(axis=0))

    # Remove the same columns from train, validation and test sets.
    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x
# pandas
# # Remove Highly Correlated Columns
# def remove_highly_correlated_features(train_x, valid_x, text_x, threshold=0.95):
#     corr_matrix = train_x.corr().abs()
#     # Select upper triangle of correlation matrix
#     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#     # Find index of feature columns with correlation greater than threshold
#     to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
#     train_x = train_x.drop(to_drop, axis=1)
#     valid_x = valid_x.drop(to_drop, axis=1)
#     text_x = text_x.drop(to_drop, axis=1)
#     return train_x, valid_x, text_x


In [6]:
# Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    """Keep only the columns that pass a VarianceThreshold fitted on ``train_x``.

    Args:
        train_x: 2-D array the selector is fitted on.
        valid_x: 2-D array with the same columns as ``train_x``.
        test_x: 2-D array with the same columns as ``train_x``.
        threshold: Value handed to ``VarianceThreshold(threshold=...)``.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` restricted to the retained columns.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(train_x)
    # Resolve the retained column indices once and reuse them for every split.
    kept = selector.get_support(indices=True)
    return train_x[:, kept], valid_x[:, kept], test_x[:, kept]

In [7]:
# Remove Random Columns
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    """Keep the columns a single decision tree considers important.

    A ``DecisionTreeClassifier(random_state=0)`` is fitted on the training data
    and every column whose feature importance is not strictly greater than
    ``importance`` is treated as noise and removed from all three matrices.

    Args:
        train_x: Training features the tree is fitted on.
        train_y: Training labels.
        valid_x: Validation features with the same columns as ``train_x``.
        test_x: Test features with the same columns as ``train_x``.
        importance: Minimum (exclusive) feature importance for a column to stay.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` restricted to the retained columns.
    """
    probe = DecisionTreeClassifier(random_state=0)
    probe.fit(train_x, train_y)

    # Column positions whose importance clears the cut-off.
    keep = np.flatnonzero(probe.feature_importances_ > importance)
    return train_x[:, keep], valid_x[:, keep], test_x[:, keep]

In [8]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    """Select the ``k`` best columns according to the ANOVA F-test.

    ``SelectKBest(f_classif, k=k)`` is fitted on the training data only and the
    fitted selector is then applied to the training, validation and test
    matrices.

    Args:
        train_x: Training features the selector is fitted on.
        train_y: Training labels.
        valid_x: Validation features with the same columns as ``train_x``.
        test_x: Test features with the same columns as ``train_x``.
        k: Number of features to keep.

    Returns:
        Tuple ``(train_x, valid_x, test_x)`` containing only the selected columns.
    """
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(train_x, train_y)
    # Apply the selection learned on the training data to every split.
    return tuple(selector.transform(part) for part in (train_x, valid_x, test_x))

In [9]:
prefix = ""

# The column at position 500 is discarded from both feature tables
# (presumably an artefact of a trailing separator in the .data files -- verify).
_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x = _test_x.drop(columns=_test_x.columns[500])

_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)

_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x = _train_x.drop(columns=_train_x.columns[500])

In [10]:
# Shuffle features and labels together, seeded with the notebook-wide SEED.
# NOTE: this rebinds _train_x/_train_y, so re-running the cell without
# reloading the data shuffles the already shuffled rows again.
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=SEED)

In [11]:
def get_train_and_validation_data(split: int = 400):
    """Split the shuffled training tables into a training and a validation part.

    The first ``split`` rows of ``_train_x`` / ``_train_y`` become the
    validation set; the remaining rows become the training set.

    Args:
        split: Number of leading rows reserved for validation.

    Returns:
        Tuple ``(train_x, train_y, valid_x, valid_y)`` of NumPy arrays.
    """
    # .iloc makes the positional slicing explicit: after shuffling, the index
    # labels no longer match the row positions.
    train_x, valid_x = _train_x.iloc[split:].values, _train_x.iloc[:split].values
    train_y, valid_y = _train_y.iloc[split:].values, _train_y.iloc[:split].values
    return train_x, train_y, valid_x, valid_y

In [12]:
# Materialise the train/validation split and show the four array shapes.
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


In [13]:
# Bind test_x unconditionally: every later cell reads it, so it must exist
# (as a NumPy array, like train_x/valid_x) even when this step is switched off.
test_x = _test_x.values
if APPLY_REMOVE_CORRELATED_FEATURES:
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, test_x
    )
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 490)


In [14]:
# Optional step: filter all three splits through remove_low_variance_features
# (selector fitted on train_x, default threshold) and report the new shape.
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 490)


In [15]:
# Optional step: keep only the columns a decision tree fitted on the training
# data rates above the helper's default importance cut-off.
if APPLY_REMOVE_RANDOM_FEATURES:
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 40)


In [16]:
if APPLY_ANOVA:
    # Never ask for more features than survived the earlier filters: depending
    # on the scikit-learn version, SelectKBest raises or warns when k exceeds
    # the number of available columns.
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
        k=min(40, train_x.shape[1]),
    )
    print("train_x.shape: ", train_x.shape)

train_x.shape:  (1600, 40)




In [17]:
# convert to dataframe
# train_x = pd.DataFrame(train_x)
# valid_x = pd.DataFrame(valid_x)

In [18]:
# Shapes of the training features and labels after the feature-selection cells.
print("train_x.shape: ", train_x.shape)
print("train_y.shape: ", train_y.shape)

train_x.shape:  (1600, 40)
train_y.shape:  (1600, 1)


In [19]:
# label = "class"
# train_y = train_y.rename(columns={0: label})
# valid_y = valid_y.rename(columns={0: label})
# train_data = pd.concat([train_x, train_y[label]], axis=1)

In [20]:
# sanity check
# Re-create the untouched split and confirm that the label arrays still have
# the shapes they had right after splitting.
(
    original_train_x,
    original_train_y,
    original_valid_x,
    original_valid_y,
) = get_train_and_validation_data()
label_pairs = ((train_y, original_train_y), (valid_y, original_valid_y))
for y, original_y in label_pairs:
    assert y.shape == original_y.shape

### Manual model (stacked ensemble)

In [None]:
# train_y = train_y.ravel()
# valid_y = valid_y.ravel()

In [None]:
# Base learners of the stack. Every member standardises its inputs first.
base_classifiers = [
    (
        "rf",
        make_pipeline(
            StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=SEED)
        ),
    ),
    ("svc", make_pipeline(StandardScaler(), SVC(random_state=SEED))),
    ("dt", make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=SEED))),
    (
        # StackingClassifier only accepts classifiers, so the elastic-net member
        # is an elastic-net-penalised logistic regression, not the ElasticNet
        # regressor.
        "elasticnet",
        make_pipeline(
            StandardScaler(),
            LogisticRegression(
                penalty="elasticnet",
                solver="saga",  # the solver that supports the elastic-net penalty
                l1_ratio=0.15,
                max_iter=1000,
                tol=1e-3,
                random_state=SEED,
            ),
        ),
    ),
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                tol=1e-3,
                hidden_layer_sizes=(100, 300, 200, 100),
            ),
        ),
    ),
]

# Search space. Keys follow the nesting
# <outer pipeline step>__<stack member>__<inner pipeline step>__<parameter>.
param_distributions = {
    "stackingclassifier__rf__randomforestclassifier__n_estimators": randint(50, 200),
    "stackingclassifier__rf__randomforestclassifier__max_depth": randint(3, 50),
    "stackingclassifier__rf__randomforestclassifier__min_samples_split": randint(2, 20),
    "stackingclassifier__rf__randomforestclassifier__min_samples_leaf": randint(1, 20),
    "stackingclassifier__svc__svc__C": uniform(0.1, 10),
    "stackingclassifier__svc__svc__gamma": ["scale", "auto"],
    "stackingclassifier__dt__decisiontreeclassifier__max_depth": randint(3, 50),
    "stackingclassifier__dt__decisiontreeclassifier__min_samples_split": randint(2, 20),
    "stackingclassifier__dt__decisiontreeclassifier__min_samples_leaf": randint(1, 20),
    "stackingclassifier__elasticnet__logisticregression__C": uniform(0.01, 10),
    "stackingclassifier__elasticnet__logisticregression__l1_ratio": uniform(0, 1),
    "stackingclassifier__mlp__mlpclassifier__alpha": uniform(0.0001, 1),
    "stackingclassifier__mlp__mlpclassifier__learning_rate_init": uniform(0.001, 0.1),
    "stackingclassifier__mlp__mlpclassifier__hidden_layer_sizes": [
        (100, 300, 200, 100),
        (100, 300, 200, 100, 50),
        (100, 300, 200, 100, 50, 25),
    ],
    "stackingclassifier__final_estimator__C": uniform(0.01, 10),
}


# The single-step pipeline gives the stack the "stackingclassifier" step name
# that the parameter keys above rely on.
stacked_ensemble_model = make_pipeline(
    StackingClassifier(
        estimators=base_classifiers,
        final_estimator=LogisticRegression(),
        cv=5,
    )
)

# The search budget and parallelism come from the notebook-level constants.
random_search = RandomizedSearchCV(
    stacked_ensemble_model,
    param_distributions=param_distributions,
    scoring="balanced_accuracy",
    n_iter=RANDOM_SEARCH_N_ITER,
    cv=5,
    verbose=4,
    random_state=SEED,
    n_jobs=N_JOBS,
)
# Labels are stored as (n, 1) column vectors; scikit-learn expects 1-D targets.
random_search.fit(train_x, train_y.ravel())
y_pred = random_search.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y.ravel(), y_pred)
print(f"Model Balanced Accuracy: {balanced_accuracy}")

In [None]:
# Write the test-set probabilities to a text file and pickle the fitted search
# next to it. The two file names were swapped before: the probabilities went to
# "manual_model.txt" and the model to "manual_model_pred.pkl".
proba = random_search.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_pred.txt")
# NOTE(review): proba presumably has one column per class, so delimiter="\n"
# puts every single value on its own line -- confirm this is the wanted format.
np.savetxt(output_path, proba, delimiter="\n")
joblib.dump(random_search, path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl"))
# random_search.save(path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl"))

### AutoGluon

In [21]:
# Shapes of the feature/label arrays that are packed into DataFrames below.
print(train_x.shape, train_y.shape)
print(valid_x.shape, valid_y.shape )


(1600, 40) (1600, 1)
(400, 40) (400, 1)


In [24]:
# TabularPredictor is given one table with the label in a column named
# "class", so append the labels as the last column of each feature matrix.
train_data = np.hstack((train_x, train_y))
train_data_pd = pd.DataFrame(train_data)
train_data_pd = train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"})

valid_data = np.hstack((valid_x, valid_y))
valid_data_pd = pd.DataFrame(valid_data)
valid_data_pd = valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"})

In [25]:
# Each frame should have one more column than its feature matrix (the label).
print(train_data_pd.shape, valid_data_pd.shape)

(1600, 41) (400, 41)


In [27]:
# Models are written to this run's AutoGluon output folder.
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
)
# fit() returns the predictor itself, so `predictor` stays the fitted object.
predictor = predictor.fit(
    train_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 60 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: autogluon_save/ds_sub_fit/sub_fit_ho.
2024-01-10 17:08:54,168	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Beginning AutoGluon training ... Time limit = 15s
AutoGluon will save models to "autogluon_save/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Py

In [28]:
# Display the leaderboard of the models trained by the predictor.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.859984,balanced_accuracy,0.000551,14.284969,0.000551,14.284969,1,True,7
1,WeightedEnsemble_L2,0.859984,balanced_accuracy,0.016125,14.967341,0.015574,0.682373,2,True,8
2,LightGBM_BAG_L1,0.851847,balanced_accuracy,0.037408,5.016441,0.037408,5.016441,1,True,4
3,KNeighborsDist_BAG_L1,0.815672,balanced_accuracy,0.032716,0.006662,0.032716,0.006662,1,True,2
4,KNeighborsUnif_BAG_L1,0.815672,balanced_accuracy,0.033736,0.009012,0.033736,0.009012,1,True,1
5,RandomForestEntr_BAG_L1,0.803855,balanced_accuracy,0.185975,0.9904,0.185975,0.9904,1,True,6
6,LightGBMXT_BAG_L1,0.79569,balanced_accuracy,0.01999,3.819994,0.01999,3.819994,1,True,3
7,RandomForestGini_BAG_L1,0.792605,balanced_accuracy,0.180362,1.098191,0.180362,1.098191,1,True,5


In [31]:
# Score the fitted predictor on the held-out validation frame.
predictor.evaluate(valid_data_pd)

{'balanced_accuracy': 0.8550423845365207,
 'accuracy': 0.855,
 'mcc': 0.7100137552699213,
 'roc_auc': 0.9167062589082544,
 'f1': 0.8535353535353536,
 'precision': 0.8492462311557789,
 'recall': 0.8578680203045685}

In [34]:
# The predictor was fitted on a DataFrame, so wrap the test matrix the same way.
test_frame = pd.DataFrame(test_x)
proba = predictor.predict_proba(test_frame)
output_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "manual_model_pred.txt")
np.savetxt(output_path, proba, delimiter="\n")
# predictor.save(path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "manual_model.pkl"))

### MLJAR AutoML

In [36]:
# NOTE(review): this run optimises "f1", while the other approaches in this
# notebook are tuned and scored on balanced accuracy -- confirm this is intended.
mljar_results_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID)
automl = AutoML(
    results_path=mljar_results_path,
    mode="Compete",
    ml_task="binary_classification",
    eval_metric="f1",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    random_state=SEED,
)

# Labels are flattened from an (n, 1) column to a 1-D vector before fitting.
automl.fit(train_x, train_y.ravel())

AutoML directory: output\mljar\20240110_170735
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.757764 trained in 4.76 seconds
Disable stacking for split validation
* Step simple_algorithms will try to check up to 3 models
2_DecisionTree f1 0.618182 trained in 3.4 seconds
3_DecisionTree f1 0.618182 trained in 3.15 seconds
4_Linear f1 0.591195 trained in 4.4 seconds
Skip default_algorithms because of the t



5_Xgboost f1 0.797546 trained in 4.86 seconds
23_CatBoost f1 0.851852 trained in 4.44 seconds
Skip golden_features because of the time limit.
* Step kmeans_features will try to check up to 3 models




14_LightGBM_KMeansFeatures f1 0.817073 trained in 4.69 seconds
Not enough time to perform features selection. Skip
Time needed for features selection ~ 10.0 seconds
Please increase total_time_limit to at least (156 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 10 models
24_CatBoost f1 0.829268 trained in 4.45 seconds
* Step hill_climbing_2 will try to check up to 13 models
25_CatBoost f1 0.851852 trained in 4.07 seconds
* Step ensemble will try to check up to 1 model
Ensemble f1 0.851852 trained in 2.88 seconds
AutoML fit time: 67.57 seconds
AutoML best model: 23_CatBoost


In [37]:
# Balanced accuracy of the MLJAR model on the validation split (cell output).
predictions = automl.predict(valid_x)
balanced_accuracy_score(valid_y, predictions)

0.8248855992598334

In [39]:
# Store the MLJAR test-set probabilities in this run's output folder.
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "manual_model_proba.txt")
proba = automl.predict_proba(test_x)
np.savetxt(output_path, proba, delimiter="\n")

### Auto-sklearn

In [None]:
# !pip install auto-sklearn
# !pip install ydata-profiling
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.metrics import balanced_accuracy

In [None]:
# Keyword arguments for the auto-sklearn estimator. Parallelism now comes from
# the notebook-level N_JOBS constant instead of a second hard-coded -1.
settings = {
    "time_left_for_this_task": TRAIN_TIME_LIMIT_AUTO_SKLEARN,
    "seed": SEED,
    "metric": balanced_accuracy,
    "n_jobs": N_JOBS,
}

In [None]:
askl2 = AutoSklearn2Classifier(**settings)
# Pass 1-D labels, as the MLJAR cell does; train_y is an (n, 1) column vector.
askl2.fit(train_x, train_y.ravel())

In [None]:
# Leaderboard restricted to the models in the final ensemble, ordered by model id.
leaderboard = askl2.leaderboard(sort_by="model_id", ensemble_only=True)
print(leaderboard)

In [None]:
# Balanced accuracy of the auto-sklearn model on the validation split (cell output).
predictions = askl2.predict(valid_x)
balanced_accuracy_score(valid_y, predictions)

In [None]:
proba = askl2.predict_proba(test_x)
output_path = path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.txt")
np.savetxt(output_path, proba, delimiter="\n")
# auto-sklearn estimators expose no save() method; persist the fitted object
# with joblib, the same way the manual model is stored.
joblib.dump(askl2, path.join(OUTPUT_DIR_AUTO_SKLEARN, UNIQUE_ID, "manual_model.pkl"))