In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import *


# Seed used for the train/validation split and handed to the model factories.
SEED = 662
# Name of the processed-data directory; not referenced in the cells visible here.
DATA_DIR = "processed"

# NOTE(review): `data` is rebuilt (with transform_all_house applied) in the tuning
# cell before `data[1]` is read, so this first value appears to be unused there.
data = get_divided_edu(remove_boring_columns(combined_transformed_train_with_num_pov))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is global, so it also hides numerical warnings raised during
# the search (e.g. from numpy or the estimators) — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer

from models import *


def pad_missing_class_columns(y_pred, classes, n_classes=10):
    """Expand a ``predict_proba`` matrix to one column per label in ``range(n_classes)``.

    An estimator only emits probability columns for the labels it saw during
    ``fit``. For every label in ``range(n_classes)`` that is absent from
    ``classes`` a zero-probability column is inserted, so that column ``c`` of
    the result corresponds to label ``c``.

    Args:
        y_pred: 2-D array of probabilities whose columns are ordered like
            ``classes``.
        classes: Labels the estimator was fitted on (e.g. ``model.classes_``);
            expected to be a sorted subset of ``range(n_classes)``.
        n_classes: Total number of labels.

    Returns:
        A ``numpy.ndarray`` with ``n_classes`` columns (given the expectation
        on ``classes`` above).
    """
    padded = np.asarray(y_pred)
    # Labels are visited in ascending order, so when label ``c`` is missing the
    # columns for 0..c-1 are already in place and ``c`` is the insertion index.
    # (The previous code inserted at ``c - 1``, which shifted columns by one and
    # turned a missing label 0 into an insertion before the *last* column.)
    for label in range(n_classes):
        if label not in classes:
            padded = np.insert(padded, label, 0, axis=1)
    return padded


def load_objective(
    X: pd.DataFrame,
    y: pd.DataFrame,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    """Build an Optuna objective that scores one model family on a hold-out split.

    Args:
        X: Feature frame. The sentinel ``-999`` is treated as missing; the
            caller's frame is left untouched (a replaced copy is used).
        y: Class labels. They are shifted by ``-1`` before being handed to
            ``get_trained_model``.
        y_binarized: Target matrix passed to ``log_loss`` as ``y_true``.
        get_trained_model: Callable invoked as
            ``get_trained_model(trial, seed=..., X_train=..., y_train=...)``
            that returns a fitted estimator exposing ``predict_proba`` and
            ``classes_``.

    Returns:
        A function ``objective(trial)`` returning the validation log-loss
        divided by the number of validation rows.
    """
    # Non-mutating replacement: the original used inplace=True and therefore
    # silently modified the DataFrame owned by the caller.
    X = X.replace(-999, np.nan)

    def objective(trial: optuna.Trial):
        """Sample preprocessing + model parameters, fit, and return the validation loss."""
        chi2_threshold = trial.suggest_int("chi2_threshold", 1, 100)
        cv = trial.suggest_int("cv", 3, 5)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        # Single hold-out split instead of a full cross-validation loop.
        # NOTE(review): ``cv`` only controls the hold-out fraction (1 / cv), so
        # trials with different ``cv`` are scored on different validation sets.
        X_train, X_valid, y_train, _y_valid, _y_train_binarized, y_valid_binarized = (
            train_test_split(X, y, y_binarized, test_size=1 / cv, random_state=SEED)
        )

        preprocessor = get_preprocessor(
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        # Preprocessor and selector are fitted on the training part only and
        # merely applied to the validation part.
        X_train = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        X_train = selector.fit_transform(X_train, y_train)
        X_valid = preprocessor.transform(X_valid)
        X_valid = selector.transform(X_valid)

        p = trial.suggest_int("p", 1, 3)
        k_neighbors = trial.suggest_int("k_neighbors", 2, 15)
        transformer = KNeighborsTransformer(
            mode="connectivity", n_neighbors=k_neighbors, p=p
        )

        # Extra feature: mean training label over each row's nearest neighbours.
        # For training rows the identity is subtracted and the divisor is
        # k - 1 — presumably because every training row is returned as its own
        # neighbour; confirm against the KNeighborsTransformer documentation.
        X_dist_graph = transformer.fit_transform(X_train)
        knn_train_means = (
            (X_dist_graph - np.identity(X_dist_graph.shape[0]))
            @ y_train.T
            / (k_neighbors - 1)
        ).reshape(-1, 1)
        X_train = np.hstack([X_train, knn_train_means])
        knn_valid_means = (
            transformer.kneighbors_graph(X_valid) @ y_train.T / k_neighbors
        ).reshape(-1, 1)
        X_valid = np.hstack([X_valid, knn_valid_means])

        model = get_trained_model(
            trial, seed=SEED, X_train=X_train, y_train=y_train - 1
        )

        # Align the probability columns with the full label set before scoring.
        y_pred = pad_missing_class_columns(model.predict_proba(X_valid), model.classes_)

        valid_loss = log_loss(y_valid_binarized, y_pred, normalize=False) / len(
            y_valid_binarized
        )

        # These diagnostics are not collected in this single-split version. The
        # keys are still written as NaN (the value the previous code produced
        # via the mean of an empty list) so existing study readers keep working.
        for attr_name in ("n_iter", "train_loss", "valid_loss_shift"):
            trial.set_user_attr(attr_name, np.nan)
        return valid_loss

    return objective


def get_lr(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_logistic_regression``.

    ``trial`` and ``seed`` are forwarded to the suggestion helper; the
    resulting estimator is fitted on ``(X_train, y_train)`` and handed back.
    """
    classifier = suggest_logistic_regression(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_linear_svc(trial, seed, X_train, y_train):
    """Fit the pair returned by ``suggest_linear_svc`` and return the calibrated one.

    The helper yields ``(model, calibrated_model)``. Both are fitted on the
    training data, the plain model first, and only the calibrated estimator is
    returned to the caller.
    """
    base_estimator, calibrated_estimator = suggest_linear_svc(trial, seed)
    for estimator in (base_estimator, calibrated_estimator):
        estimator.fit(X_train, y_train)
    return calibrated_estimator


def get_kernel_svc(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_kernel_svc``.

    ``trial`` and ``seed`` go to the suggestion helper; the estimator it
    returns is fitted on ``(X_train, y_train)`` before being returned.
    """
    classifier = suggest_kernel_svc(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_naive_bayes(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_naive_bayes``.

    ``trial`` and ``seed`` go to the suggestion helper; the estimator it
    returns is fitted on ``(X_train, y_train)`` before being returned.
    """
    classifier = suggest_naive_bayes(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_knn(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_knn_classifier``.

    ``trial`` and ``seed`` go to the suggestion helper; the estimator it
    returns is fitted on ``(X_train, y_train)`` before being returned.
    """
    classifier = suggest_knn_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_random_forest(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_random_forest``.

    ``trial`` and ``seed`` go to the suggestion helper; the estimator it
    returns is fitted on ``(X_train, y_train)`` before being returned.
    """
    classifier = suggest_random_forest(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_gradient_boosting(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_gradient_boosting``.

    ``trial`` and ``seed`` go to the suggestion helper; the estimator it
    returns is fitted on ``(X_train, y_train)`` before being returned.
    """
    classifier = suggest_gradient_boosting(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_mlp(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_mlp_classifier``.

    ``trial`` and ``seed`` go to the suggestion helper; the estimator it
    returns is fitted on ``(X_train, y_train)`` before being returned.
    """
    classifier = suggest_mlp_classifier(trial, seed)
    classifier.fit(X_train, y_train)
    return classifier


def get_lightgbm(trial, seed, X_train, y_train):
    """Fit and return the estimator built by ``suggest_lightgbm``.

    Args:
        trial: Optuna trial forwarded to ``suggest_lightgbm``.
        seed: Random seed forwarded to ``suggest_lightgbm``. The previous
            version ignored this argument and always used the global ``SEED``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The estimator after ``fit`` has been called with a
        ``lgb.log_evaluation(period=0)`` callback.
    """
    # period=0 presumably silences LightGBM's per-iteration evaluation log —
    # confirm against the log_evaluation documentation.
    callbacks = [lgb.log_evaluation(period=0)]
    model = suggest_lightgbm(trial, seed)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


# Search plan: model name -> (factory that samples and fits the model, number of
# Optuna trials to spend on it). Insertion order is the order the studies run in.
models = dict(
    naive_bayes=(get_naive_bayes, 500),
    knn=(get_knn, 1000),
    lr=(get_lr, 500),
    linear_svc=(get_linear_svc, 500),
    kernel_svc=(get_kernel_svc, 300),
    random_forest=(get_random_forest, 300),
    gradient_boosting=(get_gradient_boosting, 100),
    lightgbm=(get_lightgbm, 100),
    mlp=(get_mlp, 100),
)


# Run one Optuna study per entry in `models` and collect the best result of each.
optuna.logging.set_verbosity(optuna.logging.ERROR)
data = get_divided_edu(
    remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
)
# Only studies that finish contribute an entry, so after a failure these lists
# are shorter than `models` and positions no longer line up with model order.
best_values, best_params = [], []
poi = data[1]

# Features and targets are the same for every model, so build them once.
X, y, y_binarized = (
    poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
    poi["num_pov"],
    poi[Y_COLUMNS],
)

for model_name, (get_trained_model, rounds) in models.items():
    print(model_name)
    try:
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
    except Exception as e:
        # Best effort: keep going with the remaining models, but say which one
        # failed and with what exception type instead of printing a bare message.
        print(f"{model_name} failed: {e!r}")
    print(best_values)
    print(best_params)
    print("=====================================")
    print()

naive_bayes


Best trial: 98. Best value: 1.96638: 100%|██████████| 500/500 [00:37<00:00, 13.21it/s]


[1.96637676754484]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}]

knn


Best trial: 211. Best value: 8.12332: 100%|██████████| 1000/1000 [02:00<00:00,  8.31it/s]


[1.96637676754484, 8.123316515187259]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}]

lr


Best trial: 486. Best value: 1.90581: 100%|██████████| 500/500 [04:07<00:00,  2.02it/s]


[1.96637676754484, 8.123316515187259, 1.9058111371118034]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}]

linear_svc


Best trial: 495. Best value: 1.90836: 100%|██████████| 500/500 [02:05<00:00,  3.99it/s]


[1.96637676754484, 8.123316515187259, 1.9058111371118034, 1.908361505173058]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}, {'chi2_threshold': 90, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 10, 'C': 0.00155771036867991, 'penalty': 'l2', 'intercept_scaling': 419.7063933601583, 'calibration_method': 'sigmoid'}]

kernel_svc


Best trial: 172. Best value: 1.91425: 100%|██████████| 300/300 [01:55<00:00,  2.59it/s]


[1.96637676754484, 8.123316515187259, 1.9058111371118034, 1.908361505173058, 1.914245888039007]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}, {'chi2_threshold': 90, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 10, 'C': 0.00155771036867991, 'penalty': 'l2', 'intercept_scaling': 419.7063933601583, 'calibration_method': 'sigmoid'}, {'chi2_threshold': 83, 'cv': 3, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 4, 'C': 15.18425701830652, 'kernel': 'rbf', 'gamma': 'auto'}]

random_forest


Best trial: 183. Best value: 1.89859: 100%|██████████| 300/300 [02:23<00:00,  2.09it/s]


[1.96637676754484, 8.123316515187259, 1.9058111371118034, 1.908361505173058, 1.914245888039007, 1.8985887141861184]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}, {'chi2_threshold': 90, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 10, 'C': 0.00155771036867991, 'penalty': 'l2', 'intercept_scaling': 419.7063933601583, 'calibration_method': 'sigmoid'}, {'chi2_threshold': 83, 'cv': 3, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 4, 'C': 15.18425701830652, 'kernel': 'rbf', 'gamma': 'auto'}, {'chi2_threshold': 92, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 7, 'n_estimators':

Best trial: 79. Best value: 1.90461: 100%|██████████| 100/100 [36:49<00:00, 22.10s/it] 


[1.96637676754484, 8.123316515187259, 1.9058111371118034, 1.908361505173058, 1.914245888039007, 1.8985887141861184, 1.90460783266393]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}, {'chi2_threshold': 90, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 10, 'C': 0.00155771036867991, 'penalty': 'l2', 'intercept_scaling': 419.7063933601583, 'calibration_method': 'sigmoid'}, {'chi2_threshold': 83, 'cv': 3, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 4, 'C': 15.18425701830652, 'kernel': 'rbf', 'gamma': 'auto'}, {'chi2_threshold': 92, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 

Best trial: 99. Best value: 1.91349: 100%|██████████| 100/100 [29:03<00:00, 17.43s/it]


[1.96637676754484, 8.123316515187259, 1.9058111371118034, 1.908361505173058, 1.914245888039007, 1.8985887141861184, 1.90460783266393, 1.913489491766046]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}, {'chi2_threshold': 90, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 10, 'C': 0.00155771036867991, 'penalty': 'l2', 'intercept_scaling': 419.7063933601583, 'calibration_method': 'sigmoid'}, {'chi2_threshold': 83, 'cv': 3, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 4, 'C': 15.18425701830652, 'kernel': 'rbf', 'gamma': 'auto'}, {'chi2_threshold': 92, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p':

Best trial: 88. Best value: 1.91347: 100%|██████████| 100/100 [00:48<00:00,  2.05it/s]

[1.96637676754484, 8.123316515187259, 1.9058111371118034, 1.908361505173058, 1.914245888039007, 1.8985887141861184, 1.90460783266393, 1.913489491766046, 1.9134682338328506]
[{'chi2_threshold': 2, 'cv': 4, 'imputer_strategy': 'mean', 'p': 1, 'k_neighbors': 5}, {'chi2_threshold': 76, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 17}, {'chi2_threshold': 100, 'cv': 3, 'imputer_strategy': 'median', 'p': 1, 'k_neighbors': 15, 'C': 0.5237696002260732, 'l1_ratio': 0.9975071651406071}, {'chi2_threshold': 90, 'cv': 3, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 10, 'C': 0.00155771036867991, 'penalty': 'l2', 'intercept_scaling': 419.7063933601583, 'calibration_method': 'sigmoid'}, {'chi2_threshold': 83, 'cv': 3, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 4, 'C': 15.18425701830652, 'kernel': 'rbf', 'gamma': 'auto'}, {'chi2_threshold': 92, 'cv': 3, 'imputer_strategy': '




In [None]:
# NOTE(review): hand-pasted reference numbers; presumably best validation losses
# from a run that used the "knn means" feature — confirm which models/run.
[1.7508815852971797, 1.9101396393150984, 1.7085325830171214]  # knn means

In [6]:
# NOTE(review): unlabeled pasted numbers; presumably losses from an earlier run —
# record which configuration produced them.
[1.8779598721204045, 1.9174010676883153, 1.7137433760356882]

[1.8779598721204045, 1.9174010676883153, 1.7137433760356882]

In [None]:
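# NOTE: disabled experiment (separate search per mask / complement of `data[1]`).
# The code below is stale: load_objective now also requires a get_trained_model
# argument, and the inner loop re-uses the outer loop variable name `mask`.
# part = data[1]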
# masks = [
#     part["house_q05y"] <= 25,
#     part["house_q05y"] <= 35,
#     part["house_q05y"] <= 55,
#     part["house_q05y"] <= 60,
#     part["house_q05y"] <= 75,
#     (part["house_q06"] == 1) | (part["house_q06"] == 3),
#     part["house_q09"] < 1,
#     part["house_q09"] <= 1,
#     part["house_q09"] <= 3,
#     part["edu_q01"] == 1,
#     part["edu_q01"] == 3,
#     part["edu_q02"] == 1,
#     part["edu_q02"] == 3,
#     part["edu_q04"] <= 3,
#     part["edu_q04"] <= 5,
#     part["edu_q05"] <= 7,
#     part["edu_q07"] == 0,
#     part["edu_q07"] <= 1,
# ]
# for index, mask in enumerate(masks):
#     print(index)
#     try:
#         best_values, best_params = [], []
#         for mask in [mask, ~mask]:
#             poi = part[mask]
#             X, y, y_binarized = (
#                 poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
#                 poi["num_pov"],
#                 poi[Y_COLUMNS],
#             )
#             study = optuna.create_study(direction="minimize")
#             study.optimize(
#                 load_objective(X, y, y_binarized),
#                 n_trials=100,
#                 n_jobs=-1,
#                 show_progress_bar=True,
#             )
#             best_values.append(study.best_value)
#             best_params.append(study.best_params)
#         print(best_values), print(best_params)
#     except Exception as e:
#         print(e)

#     print()

In [59]:
# Pick up edits made to the local `data` and `models` modules without restarting
# the kernel, then refresh the star-imported names.
from importlib import reload

# Import under aliases: a plain `import data` / `import models` would rebind the
# notebook globals `data` (the split frames) and `models` (the search-plan dict)
# to module objects and break a re-run of the tuning cell.
import data as data_module
import models as models_module

reload(data_module)
reload(models_module)
from data import *
from models import *

In [5]:
# Fit the preprocessor on the full transformed training frame and display the
# resulting feature matrix.
imputer_strategy = "median"

# Four strategy slots: the first two are fixed to "most_frequent", the last two
# use the strategy chosen above.
preprocessor = get_preprocessor(
    imputer_strategy=["most_frequent"] * 2 + [imputer_strategy] * 2,
    remainder="drop",
)

X_train = preprocessor.fit_transform(
    remove_boring_columns(
        transform_all_house(combined_transformed_train_with_num_pov)
    )
)
X_train

Unnamed: 0,binary__house_q02_2.0,binary__house_q07_1.0,binary__house_q07_2.0,binary__house_q11_2.0,binary__house_q14_2.0,binary__house_q17_1.0,binary__house_q17_5.0,binary__house_q20_2.0,binary__edu_q03_2.0,binary__edu_q08_1.0,...,ordinal__edu_q05,ordinal__edu_q06,ordinal__edu_q12,ordinal__edu_q13,ordinal__edu_q21,ordinal__edu_q22,ordinal__edu_q44,ordinal__edu_q48,ordinal__edu_q53,ordinal__edu_q66
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.995040,0.990099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.995040,0.990099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5333,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5334,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.995040,0.995050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5335,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.995040,0.998020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Best objective value of every study that finished, in the order the tuning loop ran them.
best_values

[1.9493473708185856, 1.916132109257349, 1.7147438366189351]

In [4]:
# Best objective value of every study that finished so far.
# NOTE(review): the execution count is out of sequence with the cells above, so
# the output shown presumably comes from a different (partial) run.
best_values

[1.9493431208424308]