In [None]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import *


SEED = 662
DATA_DIR = "processed"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import traceback

import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline

from models import *


def load_objective(
    X: pd.DataFrame,
    y: pd.DataFrame,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    X.replace(-999, np.nan, inplace=True)

    def objective(trial: optuna.Trial):
        chi2_threshold = trial.suggest_int("chi2_threshold", 2, 100)
        # cv = trial.suggest_int("cv", 3, 5)
        cv = 3
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        train_losses, valid_losses, valid_loss_shift, n_iters = [], [], [], []
        X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
            train_test_split(X, y, y_binarized, test_size=1 / cv, random_state=SEED)
        )  # lazy, just do 1 round of cv

        # compute pov means aggregated by psu
        aggregated_pov_train_data = pd.concat(
            [X_train["psu_hh_idcode"], y_train], axis=1
        )
        aggregated_pov_train_data[["psu", "hh", "idcode"]] = aggregated_pov_train_data[
            "psu_hh_idcode"
        ].str.split("_", expand=True)
        df_mean = aggregated_pov_train_data.groupby("psu")["num_pov"].mean()
        X_train_means = pd.merge(
            aggregated_pov_train_data,
            df_mean,
            on="psu",
            how="left",
            suffixes=("", "_mean"),
        )["num_pov_mean"]
        X_train_means.replace(np.nan, X_train_means.mean(), inplace=True)

        X_valid[["psu", "hh", "idcode"]] = X_valid["psu_hh_idcode"].str.split(
            "_", expand=True
        )
        X_valid_means = pd.merge(
            X_valid,
            df_mean,
            on="psu",
            how="left",
            suffixes=("", "_mean"),
        )["num_pov"]
        X_valid_means.replace(np.nan, X_valid_means.mean(), inplace=True)
        X_valid_means.rename("num_pov_mean", inplace=True)

        preprocessor = get_preprocessor(
            # ordinal_transformer=Pipeline([("imputer", "passthrough")]),
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        X_train_processed = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        X_train = selector.fit_transform(X_train_processed, y_train)

        X_valid = preprocessor.transform(X_valid)
        X_valid = selector.transform(X_valid)

        # add pov means to the data
        X_train = np.column_stack([X_train, X_train_means])
        X_valid = np.column_stack([X_valid, X_valid_means])

        model = get_trained_model(
            trial, seed=SEED, X_train=X_train, y_train=y_train - 1
        )
        # train_losses.append(
        #     log_loss(y_train_binarized, model.predict_proba(X_train), normalize=False)
        #     / len(y_train_binarized)
        # )

        y_pred = model.predict_proba(X_valid)
        for column_index in [
            column for column in range(10) if column not in model.classes_
        ]:
            y_pred = np.insert(y_pred, column_index - 1, 0, axis=1)

        valid_losses.append(
            log_loss(y_valid_binarized, y_pred, normalize=False)
            / len(y_valid_binarized)
        )
        # valid_loss_shift.append(valid_losses[-1] - train_losses[-1])
        # n_iters.append(model.n_iter_[0])

        mean_valid_loss = np.mean(valid_losses)
        trial.set_user_attr(
            "selected_columns", X_train_processed.columns[selector.get_support()]
        )
        # trial.set_user_attr("n_iter", np.mean(n_iters))
        # trial.set_user_attr("train_loss", np.mean(train_losses))
        # trial.set_user_attr("valid_loss_shift", np.mean(valid_loss_shift))
        return mean_valid_loss

    return objective


def get_lr(trial, seed, X_train, y_train):
    model = suggest_logistic_regression(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_linear_svc(trial, seed, X_train, y_train):
    model, calibrated_model = suggest_linear_svc(trial, seed)
    model.fit(X_train, y_train)
    calibrated_model.fit(X_train, y_train)
    return calibrated_model


def get_kernel_svc(trial, seed, X_train, y_train):
    model = suggest_kernel_svc(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_naive_bayes(trial, seed, X_train, y_train):
    model = suggest_naive_bayes(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_knn(trial, seed, X_train, y_train):
    model = suggest_knn_classifier(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_random_forest(trial, seed, X_train, y_train):
    model = suggest_random_forest(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_gradient_boosting(trial, seed, X_train, y_train):
    model = suggest_gradient_boosting(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_mlp(trial, seed, X_train, y_train):
    model = suggest_mlp_classifier(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_lightgbm(trial, seed, X_train, y_train):
    callbacks = [lgb.log_evaluation(period=0)]
    model = suggest_lightgbm(trial, SEED)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


models = {
    "naive_bayes": (get_naive_bayes, 500),
    "knn": (get_knn, 500),
    "lr": (get_lr, 1000),
    "linear_svc": (get_linear_svc, 1000),
    "kernel_svc": (get_kernel_svc, 300),
    "random_forest": (get_random_forest, 800),
    # "gradient_boosting": (get_gradient_boosting, 200),
    # "lightgbm": (get_lightgbm, 200),
    # "mlp": (get_mlp, 200),
}


optuna.logging.set_verbosity(optuna.logging.ERROR)
# data = get_divided_edu(
#     remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
# )
# poi = data[1]
data = remove_boring_columns(
    transform_all_house(combined_transformed_train_with_num_pov)
)

poi = data
nunique = poi.nunique()
cols_to_drop = nunique[nunique == 1].index
poi = poi.drop(cols_to_drop, axis=1)
poi = poi.drop(columns=["edu_q09", "edu_q10", "edu_q11", "edu_q12", "edu_q13"])
print(poi.shape)
for index, (model_name, (get_trained_model, rounds)) in enumerate(models.items()):
    best_values, best_params = [], []
    print(model_name)
    # if True:
    try:
        X, y, y_binarized = (
            poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
            poi["num_pov"],
            poi[Y_COLUMNS],
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(optuna.importance.get_param_importances(study))
        print(study.best_trial.user_attrs)
    except Exception as e:
        print(traceback.format_exc())
    print(best_values), print(best_params)
    print("=====================================")
    print()
    # break

(5337, 83)
naive_bayes


Best trial: 22. Best value: 1.93201: 100%|██████████| 500/500 [00:36<00:00, 13.74it/s]


{'chi2_threshold': 0.9991505848962349, 'imputer_strategy': 0.0008494151037650758}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[1.9320051055024683]
[{'chi2_threshold': 2, 'imputer_strategy': 'mean'}]

knn


Best trial: 247. Best value: 7.25762: 100%|██████████| 500/500 [00:46<00:00, 10.76it/s]


{'n_neighbors': 0.9531234872851849, 'chi2_threshold': 0.0266168868352012, 'algorithm': 0.01059748386448167, 'leaf_size': 0.005357597789615384, 'weights': 0.0028324768142661922, 'p': 0.0008766817843957751, 'imputer_strategy': 0.0005953856268548951}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0', 'binary__edu_q03_2.0',
       'binary__edu_q43_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'nu

Best trial: 838. Best value: 1.76884: 100%|██████████| 1000/1000 [02:05<00:00,  7.94it/s]


{'chi2_threshold': 0.8587907619702322, 'C': 0.11743991772907768, 'imputer_strategy': 0.017288621169248248, 'l1_ratio': 0.006480699131441807}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__edu_q03_2.0', 'binary__edu_q08_2.0',
       'binary__edu_q14_2.0', 'binary__edu_q43_2.0', 'binary__edu_q45_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'n

Best trial: 923. Best value: 1.75378: 100%|██████████| 1000/1000 [01:45<00:00,  9.44it/s]


{'calibration_method': 0.8291994021796736, 'penalty': 0.12235362371919643, 'chi2_threshold': 0.04211673903617082, 'imputer_strategy': 0.005401199592882798, 'intercept_scaling': 0.0008161561618986984, 'C': 0.00011287931017772441}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__edu_q03_2.0', 'binary__edu_q08_2.0',
       'binary__edu_q14_2.0', 'binary__edu_q19_2.0', 'binary__edu_q25_2.0',
       'binary__edu_q32_2.0', 'binary__edu_q43_2.0', 'binary__edu_q45_2.0',
       'binary__edu_q46_2.0', 'binary__edu_q50_2.0', 'binary__edu_q57_2.0',
       'binary__edu_q61_2.0', 'binary__edu_q64_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical_

Best trial: 267. Best value: 1.73853: 100%|██████████| 300/300 [01:03<00:00,  4.71it/s]


{'kernel': 0.8157565926889628, 'imputer_strategy': 0.08592760934639258, 'chi2_threshold': 0.062359321053445015, 'C': 0.03595647691119966}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__edu_q03_2.0', 'binary__edu_q08_2.0',
       'binary__edu_q14_2.0', 'binary__edu_q25_2.0', 'binary__edu_q43_2.0',
       'binary__edu_q45_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__h

Best trial: 746. Best value: 1.73363: 100%|██████████| 800/800 [06:37<00:00,  2.01it/s]


{'max_features': 0.6485598501720181, 'max_depth': 0.272951765045096, 'chi2_threshold': 0.043377108897899475, 'imputer_strategy': 0.012195088782843722, 'min_samples_leaf': 0.008428365179603395, 'n_estimators': 0.006833983328256834, 'criterion': 0.0058738865299653295, 'min_samples_split': 0.0017799520643170174}
{'selected_columns': Index(['binary__edu_q03_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0', 'categorical__house_q06_4.0',
       'categorical__edu_q28_infrequent_sklearn', 'ordinal__edu_q01',
       'ordinal__edu_q02', 'ordinal__edu_q04', 'ordinal__edu_q06'],
      dtype='object')}
[1.733626632579007]
[{'chi2_threshold': 13, 'imputer_strategy': 'median', 'n_estimators': 172, 'criterion': 'gini', 'max_features': 'sqrt', 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 6}]



## with knn means too


In [7]:
import traceback

import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline

from models import *


def load_objective(
    X: pd.DataFrame,
    y: pd.DataFrame,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    X.replace(-999, np.nan, inplace=True)

    def objective(trial: optuna.Trial):
        chi2_threshold = trial.suggest_int("chi2_threshold", 2, 100)
        # cv = trial.suggest_int("cv", 3, 5)
        cv = 3
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        train_losses, valid_losses, valid_loss_shift, n_iters = [], [], [], []
        X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
            train_test_split(X, y, y_binarized, test_size=1 / cv, random_state=SEED)
        )  # lazy, just do 1 round of cv

        # add pov means aggregated by psu
        aggregated_pov_train_data = pd.concat(
            [X_train["psu_hh_idcode"], y_train], axis=1
        )
        aggregated_pov_train_data[["psu", "hh", "idcode"]] = aggregated_pov_train_data[
            "psu_hh_idcode"
        ].str.split("_", expand=True)
        df_mean = aggregated_pov_train_data.groupby("psu")["num_pov"].mean()
        X_train_means = pd.merge(
            aggregated_pov_train_data,
            df_mean,
            on="psu",
            how="left",
            suffixes=("", "_mean"),
        )["num_pov_mean"]
        X_train_means.replace(np.nan, X_train_means.mean(), inplace=True)

        X_valid[["psu", "hh", "idcode"]] = X_valid["psu_hh_idcode"].str.split(
            "_", expand=True
        )
        X_valid_means = pd.merge(
            X_valid,
            df_mean,
            on="psu",
            how="left",
            suffixes=("", "_mean"),
        )["num_pov"]
        X_valid_means.replace(np.nan, X_valid_means.mean(), inplace=True)
        X_valid_means.rename("num_pov_mean", inplace=True)

        preprocessor = get_preprocessor(
            # ordinal_transformer=Pipeline([("imputer", "passthrough")]),
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        X_train_processed = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        X_train = selector.fit_transform(X_train_processed, y_train)
        X_train = np.column_stack([X_train, X_train_means])

        X_valid = preprocessor.transform(X_valid)
        X_valid = selector.transform(X_valid)
        X_valid = np.column_stack([X_valid, X_valid_means])

        # add knn means too
        p = trial.suggest_int("p", 1, 3)
        k_neighbors = trial.suggest_int("k_neighbors", 2, 15)
        transformer = KNeighborsTransformer(
            mode="connectivity", n_neighbors=k_neighbors, p=p
        )
        X_dist_graph = transformer.fit_transform(X_train)
        X_dist_graph.setdiag(0)

        knn_train_means = np.divide(
            (X_dist_graph @ y_train.T), np.asarray(X_dist_graph.sum(axis=1)).flatten()
        )
        np.nan_to_num(knn_train_means, nan=np.nanmean(knn_train_means), copy=False)
        # print(knn_train_means.shape)
        X_train = np.hstack([X_train, knn_train_means.reshape(-1, 1)])
        knn_valid_means = (
            (transformer.kneighbors_graph(X_valid) @ y_train.T) / k_neighbors
        ).reshape(-1, 1)
        np.nan_to_num(knn_valid_means, nan=np.nanmean(knn_train_means), copy=False)
        X_valid = np.hstack([X_valid, knn_valid_means])

        model = get_trained_model(
            trial, seed=SEED, X_train=X_train, y_train=y_train - 1
        )

        y_pred = model.predict_proba(X_valid)
        for column_index in [
            column for column in range(10) if column not in model.classes_
        ]:
            y_pred = np.insert(y_pred, column_index - 1, 0, axis=1)

        valid_losses.append(
            log_loss(y_valid_binarized, y_pred, normalize=False)
            / len(y_valid_binarized)
        )

        mean_valid_loss = np.mean(valid_losses)
        trial.set_user_attr(
            "selected_columns", X_train_processed.columns[selector.get_support()]
        )
        return mean_valid_loss

    return objective


def get_lr(trial, seed, X_train, y_train):
    model = suggest_logistic_regression(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_linear_svc(trial, seed, X_train, y_train):
    model, calibrated_model = suggest_linear_svc(trial, seed)
    model.fit(X_train, y_train)
    calibrated_model.fit(X_train, y_train)
    return calibrated_model


def get_kernel_svc(trial, seed, X_train, y_train):
    model = suggest_kernel_svc(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_naive_bayes(trial, seed, X_train, y_train):
    model = suggest_naive_bayes(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_knn(trial, seed, X_train, y_train):
    model = suggest_knn_classifier(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_random_forest(trial, seed, X_train, y_train):
    model = suggest_random_forest(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_gradient_boosting(trial, seed, X_train, y_train):
    model = suggest_gradient_boosting(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_mlp(trial, seed, X_train, y_train):
    model = suggest_mlp_classifier(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_lightgbm(trial, seed, X_train, y_train):
    callbacks = [lgb.log_evaluation(period=0)]
    model = suggest_lightgbm(trial, SEED)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


models = {
    "naive_bayes": (get_naive_bayes, 500),
    "knn": (get_knn, 500),
    "lr": (get_lr, 1000),
    "linear_svc": (get_linear_svc, 1000),
    "kernel_svc": (get_kernel_svc, 300),
    "random_forest": (get_random_forest, 800),
    # "gradient_boosting": (get_gradient_boosting, 200),
    # "lightgbm": (get_lightgbm, 200),
    # "mlp": (get_mlp, 200),
}


optuna.logging.set_verbosity(optuna.logging.ERROR)
# data = get_divided_edu(
#     remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
# )
# poi = data[1]
data = remove_boring_columns(
    transform_all_house(combined_transformed_train_with_num_pov)
)

poi = data
nunique = poi.nunique()
cols_to_drop = nunique[nunique == 1].index
poi = poi.drop(cols_to_drop, axis=1)
poi = poi.drop(columns=["edu_q09", "edu_q10", "edu_q11", "edu_q12", "edu_q13"])
print(poi.shape)
for index, (model_name, (get_trained_model, rounds)) in enumerate(models.items()):
    best_values, best_params = [], []
    print(model_name)
    # if True:
    try:
        X, y, y_binarized = (
            poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
            poi["num_pov"],
            poi[Y_COLUMNS],
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(optuna.importance.get_param_importances(study))
        print(study.best_trial.user_attrs)
    except Exception as e:
        print(traceback.format_exc())
    print(best_values), print(best_params)
    print("=====================================")
    print()
    # break

(5337, 83)
naive_bayes


Best trial: 156. Best value: 2.09946: 100%|██████████| 500/500 [00:56<00:00,  8.92it/s]


{'chi2_threshold': 0.9996236078221455, 'imputer_strategy': 0.0002345250421877872, 'k_neighbors': 8.721594454349048e-05, 'p': 5.465119112322476e-05}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[2.0994553384821413]
[{'chi2_threshold': 2, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 2}]

knn


Best trial: 334. Best value: 7.35327: 100%|██████████| 500/500 [01:13<00:00,  6.83it/s]


{'n_neighbors': 0.97844292804431, 'chi2_threshold': 0.011745912032447037, 'algorithm': 0.004847616110622205, 'imputer_strategy': 0.0013110732205409254, 'k_neighbors': 0.001104469916936566, 'leaf_size': 0.0010600028423814103, 'weights': 0.0007461858676045925, 'p': 0.0007418119651572472}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0', 'binary__edu_q03_2.0',
       'binary__edu_q43_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q

Best trial: 804. Best value: 1.75081: 100%|██████████| 1000/1000 [04:34<00:00,  3.64it/s]


{'chi2_threshold': 0.9230693489010938, 'k_neighbors': 0.019635624858270626, 'l1_ratio': 0.01755270762442616, 'C': 0.017161184841551748, 'imputer_strategy': 0.013278520836059538, 'p': 0.009302612938598192}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__edu_q03_2.0', 'binary__edu_q08_2.0',
       'binary__edu_q14_2.0', 'binary__edu_q25_2.0', 'binary__edu_q32_2.0',
       'binary__edu_q43_2.0', 'binary__edu_q45_2.0', 'binary__edu_q46_2.0',
       'binary__edu_q50_2.0', 'binary__edu_q57_2.0', 'binary__edu_q61_2.0',
       'binary__edu_q64_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu

Best trial: 412. Best value: 1.75562: 100%|██████████| 1000/1000 [02:12<00:00,  7.57it/s]


{'calibration_method': 0.8526345345911466, 'chi2_threshold': 0.08090643164962882, 'penalty': 0.05495571390120113, 'imputer_strategy': 0.0034032380878866947, 'C': 0.002778835702248165, 'p': 0.002746358100206632, 'k_neighbors': 0.0023588626223099655, 'intercept_scaling': 0.00021602534537158092}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[1.755621355719366]
[{'chi2_threshold': 2, 'imputer_strategy': 'median', 'p': 2, 'k_neighbors': 11, 'C': 5.577851931233571, 'penalty': 'l1', 'intercept_scaling': 48310.95303535649, 'calibration_method': 'sigmoid'}]

kernel_svc


Best trial: 243. Best value: 1.73927: 100%|██████████| 300/300 [01:32<00:00,  3.26it/s]


{'kernel': 0.5346789772065295, 'chi2_threshold': 0.15484680512222626, 'k_neighbors': 0.11364359920191082, 'C': 0.0992309100385792, 'imputer_strategy': 0.058477690000040744, 'p': 0.03912201843071353}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__edu_q03_2.0', 'binary__edu_q08_2.0',
       'binary__edu_q14_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q15'

Best trial: 723. Best value: 1.72492: 100%|██████████| 800/800 [09:09<00:00,  1.46it/s]


{'max_features': 0.6987719368337719, 'chi2_threshold': 0.13524326609431667, 'max_depth': 0.08141169774879732, 'n_estimators': 0.06805990887310893, 'min_samples_leaf': 0.007803078388777225, 'criterion': 0.002885648737811641, 'imputer_strategy': 0.0020365524529168035, 'p': 0.002035112565770145, 'min_samples_split': 0.0011954709156548901, 'k_neighbors': 0.0005573273890743986}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__edu_q03_2.0', 'binary__edu_q08_2.0',
       'binary__edu_q14_2.0', 'binary__edu_q43_2.0', 'binary__edu_q45_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28