In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import *


# Seed passed to the train/validation split and to every model factory below.
SEED = 662
# Name of the processed-data directory (not referenced in the visible cells).
DATA_DIR = "processed"

# Initial working frame. NOTE: the tuning cell further down rebinds `data`
# to a differently transformed frame before it is used there.
data = get_divided_edu(remove_boring_columns(combined_transformed_train_with_num_pov))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Blanket filter: every warning category is ignored from this cell onwards.
warnings.filterwarnings("ignore")

In [4]:
import traceback

import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline

from models import *


def _psu_of(ids: pd.Series) -> pd.Series:
    """Return the text before the first "_" of each ``psu_hh_idcode`` value."""
    return ids.str.split("_").str[0]


def _pad_missing_class_columns(y_pred: np.ndarray, classes, n_classes: int = 10):
    """Insert all-zero columns for labels of ``range(n_classes)`` absent from ``classes``.

    Labels are visited in ascending order and each missing label is inserted at
    its own position, so column ``k`` of the result belongs to label ``k``.
    (Inserting at ``label - 1`` shifts the column one place to the left and,
    for label 0, uses index ``-1``, which ``np.insert`` counts from the end.)

    Assumes ``classes`` is sorted and contained in ``range(n_classes)``.
    """
    for label in range(n_classes):
        if label not in classes:
            y_pred = np.insert(y_pred, label, 0, axis=1)
    return y_pred


def load_objective(
    X: pd.DataFrame,
    y: pd.Series,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    """Build an Optuna objective that scores one model family on a hold-out split.

    Parameters
    ----------
    X : feature frame; must contain a ``psu_hh_idcode`` column. The sentinel
        ``-999`` is treated as missing. The caller's frame is left untouched.
    y : target labels. The model is trained on ``y - 1``.
        NOTE(review): this presumes labels 1..10 so that the shifted classes
        are 0..9 - confirm against the data.
    y_binarized : per-class target columns used as ``y_true`` for the log loss.
    get_trained_model : called as
        ``get_trained_model(trial, seed=..., X_train=..., y_train=...)``; must
        return a fitted estimator exposing ``predict_proba`` and ``classes_``.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean validation log loss.
    """
    # Work on a copy so the sentinel replacement does not leak to the caller.
    X = X.replace(-999, np.nan)

    def objective(trial: "optuna.Trial"):
        chi2_threshold = trial.suggest_int("chi2_threshold", 2, 100)
        cv = trial.suggest_int("cv", 3, 5)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        # A single hold-out split sized like one fold of `cv`-fold CV.
        X_train, X_valid, y_train, _, _, y_valid_binarized = train_test_split(
            X, y, y_binarized, test_size=1 / cv, random_state=SEED
        )

        # Extra feature: mean target per PSU, learned on the training rows only.
        psu_train = _psu_of(X_train["psu_hh_idcode"])
        psu_mean = y_train.groupby(psu_train.to_numpy()).mean()
        X_train_means = psu_train.map(psu_mean)
        X_train_means = X_train_means.fillna(X_train_means.mean())

        # PSUs unseen during training have no mean; fall back to the average of
        # the validation rows that do have one.
        # NOTE(review): a training-side fallback (e.g. y_train.mean()) would not
        # depend on the validation set - left unchanged to keep scores comparable.
        X_valid_means = _psu_of(X_valid["psu_hh_idcode"]).map(psu_mean)
        X_valid_means = X_valid_means.fillna(X_valid_means.mean())

        preprocessor = get_preprocessor(
            # presumably one strategy per column group - confirm the order
            # against get_preprocessor
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        X_train_processed = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        train_features = np.column_stack(
            [selector.fit_transform(X_train_processed, y_train), X_train_means]
        )
        valid_features = np.column_stack(
            [selector.transform(preprocessor.transform(X_valid)), X_valid_means]
        )

        model = get_trained_model(
            trial, seed=SEED, X_train=train_features, y_train=y_train - 1
        )

        # A split may miss some labels entirely; give them probability 0 so the
        # prediction matrix has one column per label.
        y_pred = _pad_missing_class_columns(
            model.predict_proba(valid_features), model.classes_
        )

        valid_loss = log_loss(y_valid_binarized, y_pred, normalize=False) / len(
            y_valid_binarized
        )
        trial.set_user_attr(
            "selected_columns", X_train_processed.columns[selector.get_support()]
        )
        return valid_loss

    return objective


def get_lr(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_logistic_regression``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_logistic_regression(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_linear_svc(trial, seed, X_train, y_train):
    """Fit the pair returned by ``suggest_linear_svc`` and return the calibrated one.

    The suggester yields ``(model, calibrated_model)``. Both are fitted on the
    same data, the plain model first, and only the calibrated model is returned.
    """
    base_svc, calibrated_svc = suggest_linear_svc(trial, seed)
    # Order kept as-is: the base estimator is fitted before the calibrated one.
    base_svc.fit(X_train, y_train)
    calibrated_svc.fit(X_train, y_train)
    return calibrated_svc


def get_kernel_svc(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_kernel_svc``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_kernel_svc(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_naive_bayes(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_naive_bayes``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_naive_bayes(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_knn(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_knn_classifier``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_knn_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_random_forest(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_random_forest``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_random_forest(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_gradient_boosting(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_gradient_boosting``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_gradient_boosting(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_mlp(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_mlp_classifier``.

    ``trial`` and ``seed`` are forwarded to the suggester; the resulting
    estimator is fitted on ``X_train`` / ``y_train``.
    """
    classifier = suggest_mlp_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_lightgbm(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_lightgbm``.

    ``trial`` and ``seed`` are forwarded to the suggester, so the caller's
    seed is honoured (the module-level ``SEED`` is no longer used here, in
    line with the other model factories). The estimator is fitted on
    ``X_train`` / ``y_train``.
    """
    # presumably period=0 turns off per-iteration evaluation logging - confirm
    callbacks = [lgb.log_evaluation(period=0)]
    model = suggest_lightgbm(trial, seed)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


# Registry of model families to tune: name -> (trainer function, number of
# Optuna trials). The tuning loop iterates this dict in insertion order and
# passes the second tuple element as `n_trials`. Commented-out entries are
# currently excluded from the run.
models = {
    "naive_bayes": (get_naive_bayes, 500),
    "knn": (get_knn, 500),
    "lr": (get_lr, 1000),
    "linear_svc": (get_linear_svc, 1000),
    "kernel_svc": (get_kernel_svc, 300),
    "random_forest": (get_random_forest, 800),
    # "gradient_boosting": (get_gradient_boosting, 200),
    # "lightgbm": (get_lightgbm, 200),
    # "mlp": (get_mlp, 200),
}


# Only let Optuna report errors; progress is shown by the progress bar.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Working frame for this run. (A disabled alternative wrapped the same call in
# get_divided_edu(...) and tuned on element [1] of its result.)
data = remove_boring_columns(
    transform_all_house(combined_transformed_train_with_num_pov)
)

# Remove constant columns, then a fixed list of education questions.
poi = data
nunique = poi.nunique()
cols_to_drop = nunique[nunique == 1].index
poi = poi.drop(columns=cols_to_drop)
poi = poi.drop(columns=["edu_q09", "edu_q10", "edu_q11", "edu_q12", "edu_q13"])
print(poi.shape)

# One independent Optuna study per registered model family.
for index, (model_name, (get_trained_model, rounds)) in enumerate(models.items()):
    # Result holders are reset for every model, so each printout covers
    # only the model that has just been tuned.
    best_values = []
    best_params = []
    print(model_name)
    try:
        X = poi.drop(Y_COLUMNS + ["num_pov"], axis=1)
        y = poi["num_pov"]
        y_binarized = poi[Y_COLUMNS]

        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(optuna.importance.get_param_importances(study))
        print(study.best_trial.user_attrs)
    except Exception:
        # Best effort: report the failure and move on to the next model.
        print(traceback.format_exc())
    print(best_values)
    print(best_params)
    print("=====================================")
    print()

(5337, 83)
naive_bayes


Best trial: 23. Best value: 1.93201: 100%|██████████| 500/500 [00:39<00:00, 12.77it/s]


{'chi2_threshold': 0.9987151036124883, 'imputer_strategy': 0.001156294915934847, 'cv': 0.00012860147157684536}
{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[1.9320051055024683]
[{'chi2_threshold': 2, 'cv': 3, 'imputer_strategy': 'mean'}]

knn


Best trial: 438. Best value: 6.60999: 100%|██████████| 500/500 [00:47<00:00, 10.55it/s]


{'n_neighbors': 0.9632191943416601, 'chi2_threshold': 0.0198963418257379, 'weights': 0.008279110807380778, 'imputer_strategy': 0.0028273873025130634, 'cv': 0.002792949903561912, 'algorithm': 0.001346904871225611, 'leaf_size': 0.0012256287616631482, 'p': 0.0004124821862575216}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q14_2.0', 'binary__edu_q25_2.0',
       'binary__edu_q32_2.0', 'binary__edu_q43_2.0', 'binary__edu_q45_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',

Best trial: 691. Best value: 1.73115: 100%|██████████| 1000/1000 [04:12<00:00,  3.95it/s]


{'cv': 0.8829551923085388, 'l1_ratio': 0.04717648762723656, 'chi2_threshold': 0.029593817628813274, 'imputer_strategy': 0.028865778912325706, 'C': 0.011408723523085612}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q43_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q15', 'numerical__house_q16',
       'numerical__house_q21', '

Best trial: 927. Best value: 1.71354: 100%|██████████| 1000/1000 [02:10<00:00,  7.69it/s]


{'calibration_method': 0.7849374706037204, 'penalty': 0.10665538847134362, 'chi2_threshold': 0.0699204402709655, 'cv': 0.02171641258987899, 'imputer_strategy': 0.009243743621157277, 'intercept_scaling': 0.00638994590700167, 'C': 0.0011365985359323596}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y'

Best trial: 110. Best value: 1.68631: 100%|██████████| 300/300 [01:38<00:00,  3.05it/s]


{'kernel': 0.7144784966199781, 'chi2_threshold': 0.11414701525085245, 'C': 0.06454216435149247, 'cv': 0.054900639934860315, 'imputer_strategy': 0.05193168384281665}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q14_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q09', 'numerical__house_q15', 'numeri

Best trial: 512. Best value: 1.70666: 100%|██████████| 800/800 [07:45<00:00,  1.72it/s]


{'max_features': 0.5861675193817554, 'max_depth': 0.30052321908895047, 'chi2_threshold': 0.048603905278758845, 'min_samples_leaf': 0.023491352972643844, 'n_estimators': 0.023055176765097125, 'imputer_strategy': 0.006073980134060792, 'min_samples_split': 0.0054806336058715365, 'cv': 0.003978049135522743, 'criterion': 0.0026261636373394333}
{'selected_columns': Index(['binary__house_q02_2.0', 'binary__edu_q03_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__edu_q34',
       'ordinal__edu_q01', 'ordinal__edu_q02', 'ordinal__edu_q04',
       'ordinal__edu_q05', 'ordinal__edu_q06'],
      dtype='object')}
[1.7066610966291977]
[{'chi2_threshold': 22, 'cv': 5, 'imputer_strategy': 'median