In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import *


# Seed handed to train_test_split and to the model factories in the tuning cells.
SEED = 662
# NOTE(review): DATA_DIR is not referenced by any cell visible in this notebook.
DATA_DIR = "processed"

# NOTE(review): `data` is rebuilt in the tuning cells (with transform_all_house
# applied) before it is read, so this first assignment looks redundant.
data = get_divided_edu(remove_boring_columns(combined_transformed_train_with_num_pov))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised while fitting the
# tuned estimators are hidden as well -- consider narrowing it by category.
warnings.filterwarnings("ignore")

In [3]:
# Read the header-less CSV and flatten all of its values into a 1-D array.
# The tuning cells apply it as `data[1][~outliers_mask]`.
# NOTE(review): `~` is a logical NOT only for a boolean array -- confirm the CSV
# parses to True/False rather than 0/1 integers.
outliers_mask = pd.read_csv("outliers.csv", header=None).values.flatten()

In [5]:
import traceback

import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline

from models import *


def load_objective(
    X: pd.DataFrame,
    y: pd.Series,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    """Create an Optuna objective that scores one model family on a hold-out split.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame. The value -999 is treated as missing; the caller's
        frame is left untouched.
    y : pd.Series
        Class labels. They are used as-is for chi2 feature selection and
        shifted down by one (``y - 1``) before being handed to the model.
    y_binarized : pd.DataFrame
        One column per class; passed to ``log_loss`` as ``y_true``.
    get_trained_model : callable
        Called as ``get_trained_model(trial, seed=SEED, X_train=..., y_train=...)``
        and expected to return a fitted estimator exposing ``predict_proba``
        and ``classes_``.

    Returns
    -------
    callable
        ``objective(trial)`` returning the validation log loss per sample.
    """
    # Non-mutating replace: the caller's frame keeps its -999 sentinels.
    X = X.replace(-999, np.nan)

    def objective(trial: "optuna.Trial"):
        chi2_threshold = trial.suggest_int("chi2_threshold", 2, 100)
        cv = trial.suggest_int("cv", 3, 5)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        # A single hold-out split sized like one fold of `cv`-fold CV
        # (no actual cross-validation is performed).
        X_train, X_valid, y_train, _, _, y_valid_binarized = train_test_split(
            X, y, y_binarized, test_size=1 / cv, random_state=SEED
        )

        preprocessor = get_preprocessor(
            ordinal_transformer=Pipeline([("imputer", "passthrough")]),
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        # Fit preprocessing and feature selection on the training part only,
        # then apply the fitted transforms to the validation part.
        X_train_processed = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        X_train = selector.fit_transform(X_train_processed, y_train)
        X_valid = selector.transform(preprocessor.transform(X_valid))

        model = get_trained_model(
            trial, seed=SEED, X_train=X_train, y_train=y_train - 1
        )

        y_pred = model.predict_proba(X_valid)
        # predict_proba has one column per entry of model.classes_. The model
        # was trained on `y_train - 1`, so class c belongs in column c. Insert
        # a zero column for every class missing from the training split;
        # iterating in ascending order keeps later insert positions valid.
        n_classes = y_valid_binarized.shape[1]
        missing_classes = [
            label for label in range(n_classes) if label not in model.classes_
        ]
        for label in missing_classes:
            y_pred = np.insert(y_pred, label, 0, axis=1)

        # Summed log loss divided by the sample count, i.e. loss per sample.
        valid_loss = log_loss(y_valid_binarized, y_pred, normalize=False) / len(
            y_valid_binarized
        )

        trial.set_user_attr(
            "selected_columns", X_train_processed.columns[selector.get_support()]
        )
        return float(valid_loss)

    return objective


def get_lr(trial, seed, X_train, y_train):
    """Fit the logistic-regression candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_logistic_regression(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_logistic_regression(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_linear_svc(trial, seed, X_train, y_train):
    """Fit the linear-SVC candidate for ``trial`` and return its calibrated wrapper.

    ``suggest_linear_svc(trial, seed)`` yields a pair of estimators. Both are
    fitted on ``(X_train, y_train)``, the first before the second, and only the
    second (calibrated) one is returned.
    """
    base_svc, calibrated_svc = suggest_linear_svc(trial, seed)
    for estimator in (base_svc, calibrated_svc):
        estimator.fit(X_train, y_train)
    return calibrated_svc


def get_kernel_svc(trial, seed, X_train, y_train):
    """Fit the kernel-SVC candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_kernel_svc(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_kernel_svc(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_naive_bayes(trial, seed, X_train, y_train):
    """Fit the naive-Bayes candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_naive_bayes(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_naive_bayes(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_knn(trial, seed, X_train, y_train):
    """Fit the k-nearest-neighbours candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_knn_classifier(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_knn_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_random_forest(trial, seed, X_train, y_train):
    """Fit the random-forest candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_random_forest(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_random_forest(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_gradient_boosting(trial, seed, X_train, y_train):
    """Fit the gradient-boosting candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_gradient_boosting(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_gradient_boosting(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_mlp(trial, seed, X_train, y_train):
    """Fit the MLP candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_mlp_classifier(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_mlp_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_lightgbm(trial, seed, X_train, y_train):
    """Fit the LightGBM candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_lightgbm(trial, seed)``, fitted on
    ``(X_train, y_train)`` with a ``log_evaluation(period=0)`` callback, and
    returned.
    """
    callbacks = [lgb.log_evaluation(period=0)]
    # Honour the caller-supplied seed, like every sibling factory, instead of
    # silently reading the module-level SEED.
    model = suggest_lightgbm(trial, seed)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


# Model families to tune, keyed by display name. Each value is a pair
# (trainer function, trial budget): the tuning loop unpacks the second element
# as `rounds` and passes it to `study.optimize` as `n_trials`.
models = {
    "naive_bayes": (get_naive_bayes, 500),
    "knn": (get_knn, 500),
    "lr": (get_lr, 1000),
    "linear_svc": (get_linear_svc, 1000),
    "kernel_svc": (get_kernel_svc, 300),
    "random_forest": (get_random_forest, 800),
    "gradient_boosting": (get_gradient_boosting, 200),
    "lightgbm": (get_lightgbm, 200),
    "mlp": (get_mlp, 200),
}


# Only surface Optuna errors; per-trial progress comes from the progress bar.
optuna.logging.set_verbosity(optuna.logging.ERROR)
data = get_divided_edu(
    remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
)
# Keep the rows not flagged by outliers_mask, then drop columns that hold a
# single distinct value.
poi = data[1][~outliers_mask]
nunique = poi.nunique()
cols_to_drop = nunique[nunique == 1].index
poi = poi.drop(cols_to_drop, axis=1)
print(poi.shape)

# Run one independent Optuna study per model family.
for model_name, (get_trained_model, rounds) in models.items():
    best_values, best_params = [], []
    print(model_name)
    try:
        X, y, y_binarized = (
            poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
            poi["num_pov"],
            poi[Y_COLUMNS],
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(optuna.importance.get_param_importances(study))
        print(study.best_trial.user_attrs)
    except Exception:
        # Best-effort sweep: report the failure and move on to the next model.
        print(traceback.format_exc())
    # Both lists stay empty when the study for this model failed.
    print(best_values)
    print(best_params)
    print("=====================================")
    print()

(4881, 35)
naive_bayes


Best trial: 32. Best value: 2.00161: 100%|██████████| 500/500 [00:20<00:00, 24.88it/s]


{'chi2_threshold': 0.9932872258071062, 'cv': 0.0058879059372613255, 'imputer_strategy': 0.0008248682556324379}
{'selected_columns': Index(['ordinal__edu_q06'], dtype='object')}
[2.001609325083451]
[{'chi2_threshold': 2, 'cv': 3, 'imputer_strategy': 'mean'}]

knn


Best trial: 19. Best value: 7.06509: 100%|██████████| 500/500 [00:26<00:00, 18.63it/s]


{'n_neighbors': 0.9601252818213456, 'weights': 0.017820394299680607, 'chi2_threshold': 0.00978547034503867, 'leaf_size': 0.005802063717589262, 'cv': 0.0029339622777829273, 'algorithm': 0.0018940323346240495, 'p': 0.0011884592333968314, 'imputer_strategy': 0.00045033597054207683}
{'selected_columns': Index(['ordinal__edu_q06'], dtype='object')}
[7.0650941276079475]
[{'chi2_threshold': 2, 'cv': 3, 'imputer_strategy': 'median', 'n_neighbors': 10, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 35, 'p': 1}]

lr


Best trial: 931. Best value: 1.91006: 100%|██████████| 1000/1000 [03:58<00:00,  4.20it/s]


{'chi2_threshold': 0.580650318064757, 'cv': 0.4009988122270501, 'l1_ratio': 0.011671258285288595, 'imputer_strategy': 0.006218404036645019, 'C': 0.00046120738625931765}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q05y', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__house_q21', 'numerical__house_q22',
       'numerical__edu_q07', 'numerical__edu_q18', 'ordinal__house_q13',
       'ordinal__house_q19', 'ordinal__edu_q01', 'ordinal__edu_q02',
       'ordinal__edu_q04', 'ordina

Best trial: 941. Best value: 1.91087: 100%|██████████| 1000/1000 [01:08<00:00, 14.54it/s]


{'calibration_method': 0.7282543793361159, 'chi2_threshold': 0.18339652681752805, 'penalty': 0.06641990463183205, 'C': 0.00809171041515526, 'cv': 0.006819975508905582, 'imputer_strategy': 0.004945318224525804, 'intercept_scaling': 0.002072185065937584}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q05y', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__house_q21', 'numerical__house_q22',
       'numerical__edu_q07', 'numerical__edu_q18', 'ordinal__house_q13',
       'ordinal__ho

Best trial: 214. Best value: 1.92147: 100%|██████████| 300/300 [01:10<00:00,  4.28it/s]


{'kernel': 0.5887223375363113, 'chi2_threshold': 0.21013175599572342, 'cv': 0.06946624655220296, 'imputer_strategy': 0.06624575205516607, 'C': 0.06543390786059625}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q05y', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__house_q21', 'numerical__house_q22',
       'numerical__edu_q07', 'numerical__edu_q18', 'ordinal__house_q13',
       'ordinal__house_q19', 'ordinal__edu_q01', 'ordinal__edu_q02',
       'ordinal__edu_q04', 'ordinal__ed

Best trial: 789. Best value: 1.90746: 100%|██████████| 800/800 [05:17<00:00,  2.52it/s]


{'max_features': 0.4062941914556629, 'max_depth': 0.2513593736360131, 'chi2_threshold': 0.23424485631647965, 'min_samples_leaf': 0.03534648145560918, 'n_estimators': 0.03229248437885589, 'imputer_strategy': 0.016356684002626676, 'min_samples_split': 0.012173091444852582, 'cv': 0.009793909130409329, 'criterion': 0.0021389281794907946}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q05y', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__house_q21', 'numerical__house_q22',
       'n

Best trial: 142. Best value: 1.91005: 100%|██████████| 200/200 [43:19<00:00, 13.00s/it]


{'learning_rate': 0.9115849081212969, 'chi2_threshold': 0.04022404404217753, 'min_samples_split': 0.01849138381949921, 'imputer_strategy': 0.015024008341846544, 'subsample': 0.012930941127553356, 'max_features': 0.0006247481179534219, 'max_depth': 0.0005668769929542068, 'criterion': 0.00022660802733016078, 'n_estimators': 0.00018098571119581628, 'min_samples_leaf': 0.00012023086847185143, 'cv': 2.526482972100971e-05}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'ord

Best trial: 186. Best value: 1.91586: 100%|██████████| 200/200 [53:38<00:00, 16.09s/it]


{'learning_rate': 0.6220992277836801, 'max_depth': 0.23720888175838936, 'chi2_threshold': 0.037797146921662264, 'lambda_l2': 0.03691063174820478, 'lambda_l1': 0.030972285962625086, 'n_estimators': 0.020786308114274933, 'imputer_strategy': 0.014128009367571168, 'cv': 9.750834359234798e-05}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'ordinal__house_q13', 'ordinal__house_q19', 'ordinal__edu_q01',
       'ordinal__edu_q02', 'ordinal__edu_q04', 'ordinal__edu_q05',
    

Best trial: 193. Best value: 1.91685: 100%|██████████| 200/200 [06:47<00:00,  2.04s/it]


{'hidden_layer_sizes': 0.403701063446967, 'chi2_threshold': 0.23577351342977612, 'activation': 0.11573885639773458, 'learning_rate': 0.10043238362462671, 'alpha': 0.05915791344643287, 'imputer_strategy': 0.05895846569074226, 'cv': 0.026237803963720472}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q21', 'numerical__house_q22', 'numerical__edu_q07',
       'numerical__edu_q18', 'ordinal__house_q13', 'ordinal__house_q19',
       'ordinal__edu_q01', 'ordinal__edu_q02', 'ordinal__edu_q04',
       'ordi

In [4]:
import traceback

import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline

from models import *


def load_objective(
    X: pd.DataFrame,
    y: pd.Series,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    """Create an Optuna objective with an extra KNN label-mean feature.

    The returned objective preprocesses a hold-out split, selects features with
    chi2, appends one column holding the mean training label of each sample's
    nearest training neighbours, trains the model and returns the validation
    log loss per sample.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame. The value -999 is treated as missing; the caller's
        frame is left untouched.
    y : pd.Series
        Class labels. They are used as-is for chi2 feature selection and for
        the neighbour label means, and shifted down by one (``y - 1``) before
        being handed to the model.
    y_binarized : pd.DataFrame
        One column per class; passed to ``log_loss`` as ``y_true``.
    get_trained_model : callable
        Called as ``get_trained_model(trial, seed=SEED, X_train=..., y_train=...)``
        and expected to return a fitted estimator exposing ``predict_proba``
        and ``classes_``.

    Returns
    -------
    callable
        ``objective(trial)`` returning the validation log loss per sample.
    """
    # Non-mutating replace: the caller's frame keeps its -999 sentinels.
    X = X.replace(-999, np.nan)

    def objective(trial: "optuna.Trial"):
        chi2_threshold = trial.suggest_int("chi2_threshold", 2, 100)
        cv = trial.suggest_int("cv", 3, 5)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        # A single hold-out split sized like one fold of `cv`-fold CV
        # (no actual cross-validation is performed).
        X_train, X_valid, y_train, _, _, y_valid_binarized = train_test_split(
            X, y, y_binarized, test_size=1 / cv, random_state=SEED
        )

        preprocessor = get_preprocessor(
            ordinal_transformer=Pipeline([("imputer", "passthrough")]),
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        # Fit preprocessing and feature selection on the training part only,
        # then apply the fitted transforms to the validation part.
        X_train_processed = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        X_train = selector.fit_transform(X_train_processed, y_train)
        X_valid = selector.transform(preprocessor.transform(X_valid))

        # NOTE(review): the knn study output of the sibling tuning cell also
        # records a parameter named "p", so suggest_knn_classifier appears to
        # reuse this name and would share this value -- confirm that is intended.
        p = trial.suggest_int("p", 1, 2)
        k_neighbors = trial.suggest_int("k_neighbors", 2, 15)
        transformer = KNeighborsTransformer(
            mode="connectivity", n_neighbors=k_neighbors, p=p
        )
        train_labels = np.asarray(y_train)

        # Training rows: zero the diagonal so a sample's own label is excluded,
        # then average over the neighbours that remain (hence the division by
        # the row sum rather than by k_neighbors).
        X_dist_graph = transformer.fit_transform(X_train)
        X_dist_graph.setdiag(0)
        neighbour_counts = np.asarray(X_dist_graph.sum(axis=1)).flatten()
        knn_train_means = np.divide(X_dist_graph @ train_labels, neighbour_counts)
        X_train = np.hstack([X_train, knn_train_means.reshape(-1, 1)])

        # Validation rows: neighbours are looked up among the training samples
        # and the label sum is divided by k_neighbors.
        knn_valid_means = (
            (transformer.kneighbors_graph(X_valid) @ train_labels) / k_neighbors
        ).reshape(-1, 1)
        X_valid = np.hstack([X_valid, knn_valid_means])

        model = get_trained_model(
            trial, seed=SEED, X_train=X_train, y_train=y_train - 1
        )

        y_pred = model.predict_proba(X_valid)
        # predict_proba has one column per entry of model.classes_. The model
        # was trained on `y_train - 1`, so class c belongs in column c. Insert
        # a zero column for every class missing from the training split;
        # iterating in ascending order keeps later insert positions valid.
        n_classes = y_valid_binarized.shape[1]
        missing_classes = [
            label for label in range(n_classes) if label not in model.classes_
        ]
        for label in missing_classes:
            y_pred = np.insert(y_pred, label, 0, axis=1)

        # Summed log loss divided by the sample count, i.e. loss per sample.
        valid_loss = log_loss(y_valid_binarized, y_pred, normalize=False) / len(
            y_valid_binarized
        )

        trial.set_user_attr(
            "selected_columns", X_train_processed.columns[selector.get_support()]
        )
        return float(valid_loss)

    return objective


def get_lr(trial, seed, X_train, y_train):
    """Fit the logistic-regression candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_logistic_regression(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_logistic_regression(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_linear_svc(trial, seed, X_train, y_train):
    """Fit the linear-SVC candidate for ``trial`` and return its calibrated wrapper.

    ``suggest_linear_svc(trial, seed)`` yields a pair of estimators. Both are
    fitted on ``(X_train, y_train)``, the first before the second, and only the
    second (calibrated) one is returned.
    """
    base_svc, calibrated_svc = suggest_linear_svc(trial, seed)
    for estimator in (base_svc, calibrated_svc):
        estimator.fit(X_train, y_train)
    return calibrated_svc


def get_kernel_svc(trial, seed, X_train, y_train):
    """Fit the kernel-SVC candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_kernel_svc(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_kernel_svc(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_naive_bayes(trial, seed, X_train, y_train):
    """Fit the naive-Bayes candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_naive_bayes(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_naive_bayes(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_knn(trial, seed, X_train, y_train):
    """Fit the k-nearest-neighbours candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_knn_classifier(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_knn_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_random_forest(trial, seed, X_train, y_train):
    """Fit the random-forest candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_random_forest(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_random_forest(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_gradient_boosting(trial, seed, X_train, y_train):
    """Fit the gradient-boosting candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_gradient_boosting(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_gradient_boosting(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_mlp(trial, seed, X_train, y_train):
    """Fit the MLP candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_mlp_classifier(trial, seed)``,
    fitted on ``(X_train, y_train)`` and returned.
    """
    classifier = suggest_mlp_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_lightgbm(trial, seed, X_train, y_train):
    """Fit the LightGBM candidate suggested for ``trial``.

    The estimator is obtained from ``suggest_lightgbm(trial, seed)``, fitted on
    ``(X_train, y_train)`` with a ``log_evaluation(period=0)`` callback, and
    returned.
    """
    callbacks = [lgb.log_evaluation(period=0)]
    # Honour the caller-supplied seed, like every sibling factory, instead of
    # silently reading the module-level SEED.
    model = suggest_lightgbm(trial, seed)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


# Model families to tune, keyed by display name. Each value is a pair
# (trainer function, trial budget): the tuning loop unpacks the second element
# as `rounds` and passes it to `study.optimize` as `n_trials`.
models = {
    "naive_bayes": (get_naive_bayes, 500),
    "knn": (get_knn, 500),
    "lr": (get_lr, 1000),
    "linear_svc": (get_linear_svc, 1000),
    "kernel_svc": (get_kernel_svc, 300),
    "random_forest": (get_random_forest, 800),
    "gradient_boosting": (get_gradient_boosting, 200),
    "lightgbm": (get_lightgbm, 200),
    "mlp": (get_mlp, 200),
}


# Only surface Optuna errors; per-trial progress comes from the progress bar.
optuna.logging.set_verbosity(optuna.logging.ERROR)
data = get_divided_edu(
    remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
)
# Keep the rows not flagged by outliers_mask, then drop columns that hold a
# single distinct value.
poi = data[1][~outliers_mask]
nunique = poi.nunique()
cols_to_drop = nunique[nunique == 1].index
poi = poi.drop(cols_to_drop, axis=1)
print(poi.shape)

# Run one independent Optuna study per model family.
for model_name, (get_trained_model, rounds) in models.items():
    best_values, best_params = [], []
    print(model_name)
    try:
        X, y, y_binarized = (
            poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
            poi["num_pov"],
            poi[Y_COLUMNS],
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(optuna.importance.get_param_importances(study))
        print(study.best_trial.user_attrs)
    except Exception:
        # Best-effort sweep: report the failure and move on to the next model.
        print(traceback.format_exc())
    # Both lists stay empty when the study for this model failed.
    print(best_values)
    print(best_params)
    print("=====================================")
    print()

(4881, 35)
naive_bayes


Best trial: 270. Best value: 2.06937: 100%|██████████| 500/500 [00:30<00:00, 16.19it/s]


{'chi2_threshold': 0.9960224559945181, 'k_neighbors': 0.0020644029968294036, 'imputer_strategy': 0.0017041860162010179, 'cv': 0.00017418407966506537, 'p': 3.477091278652121e-05}
{'selected_columns': Index(['ordinal__edu_q06'], dtype='object')}
[2.0693729741716074]
[{'chi2_threshold': 3, 'cv': 3, 'imputer_strategy': 'median', 'p': 2, 'k_neighbors': 4}]

knn


Best trial: 469. Best value: 5.497: 100%|██████████| 500/500 [00:38<00:00, 12.88it/s]  


{'n_neighbors': 0.9944246518585331, 'chi2_threshold': 0.0024672006628011322, 'k_neighbors': 0.0008611714045004871, 'weights': 0.0007168660898764568, 'leaf_size': 0.0004951561319713854, 'imputer_strategy': 0.00036599101403049375, 'algorithm': 0.00031739397694676726, 'cv': 0.0002527281169552073, 'p': 9.884074438485276e-05}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[5.496996406955654]
[{'chi2_threshold': 6, 'cv': 5, 'imputer_strategy': 'mean', 'p': 2, 'k_neighbors': 15, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 42}]

lr


Best trial: 550. Best value: 1.90799: 100%|██████████| 1000/1000 [04:48<00:00,  3.46it/s]


{'chi2_threshold': 0.7368160123697628, 'cv': 0.18253338758634782, 'k_neighbors': 0.02733725790431029, 'l1_ratio': 0.024019348539190893, 'imputer_strategy': 0.014603207649002943, 'C': 0.007769576943488697, 'p': 0.0069212090078965526}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'ordinal__house_q13', 'ordinal__house_q19', 'ordinal__edu_q01',
       'ordinal__edu_q02', 'ordinal__edu_q04', 'ordinal__edu_q05',
       'ordinal__edu_q06'],
      dt

Best trial: 833. Best value: 1.90714: 100%|██████████| 1000/1000 [02:37<00:00,  6.34it/s]


{'calibration_method': 0.731821376854197, 'chi2_threshold': 0.11832728482660587, 'penalty': 0.06010507454148468, 'C': 0.025238486287375128, 'imputer_strategy': 0.01918208907954812, 'p': 0.019081600562065617, 'k_neighbors': 0.01729341841632802, 'intercept_scaling': 0.005982085301996335, 'cv': 0.0029685841303993695}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'ordinal__house_q13', 'ordinal__house_q19', 'ordinal__edu_q01',
       'ordinal__edu

Best trial: 130. Best value: 1.91933: 100%|██████████| 300/300 [01:21<00:00,  3.67it/s]


{'kernel': 0.47493315733535685, 'chi2_threshold': 0.2653132007733747, 'k_neighbors': 0.10099101696288429, 'imputer_strategy': 0.06562296733222502, 'C': 0.038394747876915915, 'cv': 0.03450069470086483, 'p': 0.020244215018378572}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'ordinal__house_q13', 'ordinal__house_q19', 'ordinal__edu_q01',
       'ordinal__edu_q02', 'ordinal__edu_q04', 'ordinal__edu_q05',
       'ordinal__edu_q06'],
      dtype='object')}
[1.919333709284

Best trial: 474. Best value: 1.9079: 100%|██████████| 800/800 [04:51<00:00,  2.74it/s] 


{'max_depth': 0.4910717827391361, 'max_features': 0.2666882964975534, 'chi2_threshold': 0.20684884965906214, 'k_neighbors': 0.009040496918730524, 'min_samples_leaf': 0.008737542797882915, 'n_estimators': 0.007891569404624349, 'imputer_strategy': 0.0042182888621476785, 'criterion': 0.00276474776289042, 'min_samples_split': 0.0014836847247861604, 'p': 0.0009395872411858123, 'cv': 0.00031515339200084523}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q05y', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__h

Best trial: 199. Best value: 1.92246: 100%|██████████| 200/200 [34:38<00:00, 10.39s/it]


{'learning_rate': 0.5335780747201692, 'chi2_threshold': 0.14116868079088868, 'max_depth': 0.12058587192766933, 'k_neighbors': 0.06543181739249893, 'imputer_strategy': 0.05020674593205407, 'criterion': 0.03429593520796182, 'subsample': 0.020632805842138665, 'min_samples_split': 0.0202523709156975, 'n_estimators': 0.007291964436566221, 'min_samples_leaf': 0.0029422508888445006, 'cv': 0.0019815576925523186, 'max_features': 0.0015763030679308327, 'p': 5.562118502808573e-05}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[1.922461080280412]
[{'chi2_threshold': 7, 'cv': 4, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 11, 'learning_rate': 0.0026857575457760598, 'n_estimators': 852, 'max_depth': 6, 'subsample': 0.8833181169091867, 'criterion': 'friedman_mse', 'max_features': 'log2', 'min_samples_split': 5, 'min_samples_leaf': 3}]

lightgbm


Best trial: 191. Best value: 1.92226: 100%|██████████| 200/200 [33:58<00:00, 10.19s/it]


{'learning_rate': 0.4248154992176534, 'lambda_l2': 0.2628155583603399, 'chi2_threshold': 0.2526324421269619, 'lambda_l1': 0.019277806612835652, 'cv': 0.008510719991738235, 'imputer_strategy': 0.00834134046232405, 'k_neighbors': 0.008260924536376075, 'n_estimators': 0.007888541360275517, 'max_depth': 0.007321660448373528, 'p': 0.0001355068831217998}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[1.922261698616725]
[{'chi2_threshold': 7, 'cv': 4, 'imputer_strategy': 'most_frequent', 'p': 2, 'k_neighbors': 4, 'max_depth': 1, 'learning_rate': 0.010794732121145626, 'n_estimators': 644, 'lambda_l1': 0.9568037591985643, 'lambda_l2': 0.6068394145766958}]

mlp


Best trial: 86. Best value: 1.91285: 100%|██████████| 200/200 [12:18<00:00,  3.69s/it]


{'chi2_threshold': 0.5698138678691916, 'activation': 0.13147296237075345, 'hidden_layer_sizes': 0.10688705439924923, 'cv': 0.059164091485620705, 'alpha': 0.04696858488857412, 'learning_rate': 0.03916906032929913, 'imputer_strategy': 0.028153263056770326, 'k_neighbors': 0.01088142367565963, 'p': 0.007489691924881903}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'numerical__house_q05y', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__house_q21', 'numerical__house_q22',
       'numerical__edu_q07', 'numerical__edu_q18', '