In [21]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import *


SEED = 662
DATA_DIR = "processed"

data = get_divided_edu(remove_boring_columns(combined_transformed_train_with_num_pov))

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import traceback

import lightgbm as lgb
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsTransformer

from models import *


def load_objective(
    X: pd.DataFrame,
    y: pd.DataFrame,
    y_binarized: pd.DataFrame,
    get_trained_model: callable,
):
    X.replace(-999, np.nan, inplace=True)

    def objective(trial: optuna.Trial):
        chi2_threshold = trial.suggest_int("chi2_threshold", 2, 100)
        cv = trial.suggest_int("cv", 3, 5)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        train_losses, valid_losses, valid_loss_shift, n_iters = [], [], [], []
        X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
            train_test_split(X, y, y_binarized, test_size=1 / cv, random_state=SEED)
        )  # lazy, just do 1 round of cv

        preprocessor = get_preprocessor(
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        X_train_processed = preprocessor.fit_transform(X_train)
        selector = SelectPercentile(chi2, percentile=chi2_threshold)
        X_train = selector.fit_transform(X_train_processed, y_train)
        X_valid = preprocessor.transform(X_valid)
        X_valid = selector.transform(X_valid)

        p = trial.suggest_int("p", 1, 3)
        k_neighbors = trial.suggest_int("k_neighbors", 2, 15)
        transformer = KNeighborsTransformer(
            mode="connectivity", n_neighbors=k_neighbors, p=p
        )
        X_dist_graph = transformer.fit_transform(X_train)
        X_dist_graph.setdiag(0)
        # print((X_dist_graph.sum(axis=1) == k_neighbors).astype(int).sum(), X_train.shape)
        # print(X_dist_graph.sum(axis=1), k_neighbors)
        # print((X_dist_graph @ y_train.T).shape)
        # print(np.asarray(X_dist_graph.sum(axis=1)).shape)

        knn_train_means = np.divide(
            (X_dist_graph @ y_train.T), np.asarray(X_dist_graph.sum(axis=1)).flatten()
        )
        # print(knn_train_means.shape)
        X_train = np.hstack([X_train, knn_train_means.reshape(-1, 1)])
        knn_valid_means = (
            (transformer.kneighbors_graph(X_valid) @ y_train.T) / k_neighbors
        ).reshape(-1, 1)
        X_valid = np.hstack([X_valid, knn_valid_means])

        model = get_trained_model(
            trial, seed=SEED, X_train=X_train, y_train=y_train - 1
        )
        # train_losses.append(
        #     log_loss(y_train_binarized, model.predict_proba(X_train), normalize=False)
        #     / len(y_train_binarized)
        # )

        y_pred = model.predict_proba(X_valid)
        for column_index in [
            column for column in range(10) if column not in model.classes_
        ]:
            y_pred = np.insert(y_pred, column_index - 1, 0, axis=1)

        valid_losses.append(
            log_loss(y_valid_binarized, y_pred, normalize=False)
            / len(y_valid_binarized)
        )
        # valid_loss_shift.append(valid_losses[-1] - train_losses[-1])
        # n_iters.append(model.n_iter_[0])

        mean_valid_loss = np.mean(valid_losses)
        trial.set_user_attr(
            "selected_columns", X_train_processed.columns[selector.get_support()]
        )
        # trial.set_user_attr("n_iter", np.mean(n_iters))
        # trial.set_user_attr("train_loss", np.mean(train_losses))
        # trial.set_user_attr("valid_loss_shift", np.mean(valid_loss_shift))
        return mean_valid_loss

    return objective


def get_lr(trial, seed, X_train, y_train):
    model = suggest_logistic_regression(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_linear_svc(trial, seed, X_train, y_train):
    model, calibrated_model = suggest_linear_svc(trial, seed)
    model.fit(X_train, y_train)
    calibrated_model.fit(X_train, y_train)
    return calibrated_model


def get_kernel_svc(trial, seed, X_train, y_train):
    model = suggest_kernel_svc(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_naive_bayes(trial, seed, X_train, y_train):
    model = suggest_naive_bayes(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_knn(trial, seed, X_train, y_train):
    model = suggest_knn_classifier(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_random_forest(trial, seed, X_train, y_train):
    model = suggest_random_forest(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_gradient_boosting(trial, seed, X_train, y_train):
    model = suggest_gradient_boosting(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_mlp(trial, seed, X_train, y_train):
    model = suggest_mlp_classifier(trial, seed)
    model.fit(X_train, y_train)
    return model


def get_lightgbm(trial, seed, X_train, y_train):
    callbacks = [lgb.log_evaluation(period=0)]
    model = suggest_lightgbm(trial, SEED)
    model.fit(X_train, y_train, callbacks=callbacks)
    return model


models = {
    "naive_bayes": (get_naive_bayes, 500),
    "knn": (get_knn, 500),
    "lr": (get_lr, 1000),
    "linear_svc": (get_linear_svc, 1000),
    "kernel_svc": (get_kernel_svc, 300),
    "random_forest": (get_random_forest, 800),
    "gradient_boosting": (get_gradient_boosting, 200),
    "lightgbm": (get_lightgbm, 200),
    "mlp": (get_mlp, 200),
}


optuna.logging.set_verbosity(optuna.logging.ERROR)
# data = get_divided_edu(
#     remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
# )
data = remove_boring_columns(
    transform_all_house(combined_transformed_train_with_num_pov)
)
poi = data[1][outliers_mask]
for index, (model_name, (get_trained_model, rounds)) in enumerate(models.items()):
    rounds = 50
    best_values, best_params = [], []
    print(model_name)
    try:
        X, y, y_binarized = (
            poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
            poi["num_pov"],
            poi[Y_COLUMNS],
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(study.best_trial.user_attrs)
    except Exception as e:
        print(traceback.format_exc())
    print(best_values), print(best_params)
    print("=====================================")
    print()
    # break

naive_bayes


Best trial: 446. Best value: 2.20521: 100%|██████████| 500/500 [01:08<00:00,  7.35it/s]


{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[2.2052081483539747]
[{'chi2_threshold': 2, 'cv': 5, 'imputer_strategy': 'most_frequent', 'p': 1, 'k_neighbors': 15}]

knn


Best trial: 240. Best value: 5.46239: 100%|██████████| 500/500 [01:19<00:00,  6.25it/s]


{'selected_columns': Index(['ordinal__edu_q04', 'ordinal__edu_q06'], dtype='object')}
[5.462391783115261]
[{'chi2_threshold': 2, 'cv': 3, 'imputer_strategy': 'mean', 'p': 3, 'k_neighbors': 11, 'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 21}]

lr


Best trial: 958. Best value: 1.88905: 100%|██████████| 1000/1000 [08:14<00:00,  2.02it/s]


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_5.0', 'binary__edu_q03_2.0', 'binary__edu_q09_2.0',
       'binary__edu_q43_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q05m', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__house_q21', 'numerical__house_q22',
       'numerical__edu_q07', 'numerical__edu

Best trial: 689. Best value: 1.89244: 100%|██████████| 1000/1000 [05:14<00:00,  3.18it/s]


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q09_2.0', 'binary__edu_q14_2.0',
       'binary__edu_q19_2.0', 'binary__edu_q25_2.0', 'binary__edu_q32_2.0',
       'binary__edu_q43_2.0', 'binary__edu_q45_2.0', 'binary__edu_q46_2.0',
       'binary__edu_q50_2.0', 'binary__edu_q57_2.0', 'binary__edu_q61_2.0',
       'binary__edu_q64_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__e

Best trial: 184. Best value: 1.89975: 100%|██████████| 300/300 [02:13<00:00,  2.25it/s]


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q09_2.0', 'binary__edu_q14_2.0',
       'binary__edu_q43_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q05y',
       'numerical__house_q05m', 'numerical__house_q09', 'numerical__house_q15',
       'numerical__house_q16', 'numerical__ho

Best trial: 788. Best value: 1.89044: 100%|██████████| 800/800 [08:55<00:00,  1.50it/s]


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_2.0',
       'binary__house_q17_5.0', 'binary__edu_q03_2.0', 'binary__edu_q09_2.0',
       'binary__edu_q43_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q09',
       'numerical__house_q15', 'numerical__house_q16', 'numerical__house_q21',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'numerical__edu_q26', 'numerical__edu_q34', 'numerical__edu_q36',
       'numerical__edu_q38', 'nume

Best trial: 199. Best value: 1.88779: 100%|██████████| 200/200 [34:53<00:00, 10.47s/it]


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q09_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q09',
       'numerical__house_q15', 'numerical__house_q16', 'numerical__house_q21',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'numerical__edu_q26', 'numerical__edu_q34', 'numerical__edu_q36',
  

Best trial: 167. Best value: 1.891: 100%|██████████| 200/200 [1:07:47<00:00, 20.34s/it]  


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q09_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q09',
       'numerical__house_q15', 'numerical__house_q16', 'numerical__house_q21',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'numerical__edu_q26', 'numerical__edu_q34', 'numerical__edu_q36',
  

Best trial: 155. Best value: 1.89112: 100%|██████████| 200/200 [01:52<00:00,  1.79it/s]

{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_2.0',
       'binary__house_q17_5.0', 'binary__edu_q03_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q09',
       'numerical__house_q16', 'numerical__house_q22', 'numerical__edu_q07',
       'numerical__edu_q18', 'numerical__edu_q24', 'numerical__edu_q33',
       'numerical__edu_q34', 'numerical__edu_q36', 'numerical__edu_q37',
       'numerical__edu_q38', 'numerical__edu_q62', 'numerical__edu_q65',
       'ordinal__house_q13', 'ordinal__house_q19', 'ordinal__edu_q01',
       'ordinal__edu_q02', 'ordina




In [20]:
models = {
    "gradient_boosting": (get_gradient_boosting, 500),
    "lightgbm": (get_lightgbm, 500),
    "mlp": (get_mlp, 500),
}


optuna.logging.set_verbosity(optuna.logging.ERROR)
# data = get_divided_edu(
#     remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
# )
data = remove_boring_columns(
    transform_all_house(combined_transformed_train_with_num_pov)
)
poi = data
for index, (model_name, (get_trained_model, rounds)) in enumerate(models.items()):
    best_values, best_params = [], []
    print(model_name)
    try:
        X, y, y_binarized = (
            poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
            poi["num_pov"],
            poi[Y_COLUMNS],
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(
            load_objective(X, y, y_binarized, get_trained_model),
            n_trials=rounds,
            n_jobs=-1,
            show_progress_bar=True,
        )
        best_values.append(study.best_value)
        best_params.append(study.best_params)
        print(study.best_trial.user_attrs)
    except Exception as e:
        print(traceback.format_exc())
    print(best_values), print(best_params)
    print("=====================================")
    print()

gradient_boosting


Best trial: 414. Best value: 1.88643: 100%|██████████| 500/500 [1:11:13<00:00,  8.55s/it]


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q09_2.0', 'binary__edu_q14_2.0',
       'binary__edu_q19_2.0', 'binary__edu_q25_2.0', 'binary__edu_q32_2.0',
       'binary__edu_q43_2.0', 'binary__edu_q45_2.0', 'binary__edu_q46_2.0',
       'binary__edu_q50_2.0', 'binary__edu_q57_2.0', 'binary__edu_q61_2.0',
       'binary__edu_q64_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__e

Best trial: 493. Best value: 1.89032: 100%|██████████| 500/500 [1:19:18<00:00,  9.52s/it]  


{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q09_2.0', 'binary__edu_q43_2.0',
       'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn', 'numerical__house_q09',
       'numerical__house_q15', 'numerical__house_q16', 'numerical__house_q21',
       'numerical__house_q22', 'numerical__edu_q07', 'numerical__edu_q18',
       'numerical__edu_q26', 'numerical__edu_q34', 'numerical__edu_q36',
  

Best trial: 385. Best value: 1.88643: 100%|██████████| 500/500 [04:54<00:00,  1.70it/s]

{'selected_columns': Index(['binary__house_q02_2.0', 'binary__house_q07_1.0',
       'binary__house_q07_2.0', 'binary__house_q11_2.0',
       'binary__house_q17_1.0', 'binary__house_q17_5.0', 'binary__edu_q03_2.0',
       'binary__edu_q08_2.0', 'binary__edu_q09_2.0', 'binary__edu_q14_2.0',
       'binary__edu_q19_2.0', 'binary__edu_q25_2.0', 'binary__edu_q32_2.0',
       'binary__edu_q43_2.0', 'binary__edu_q45_2.0', 'binary__edu_q61_2.0',
       'binary__edu_q64_2.0', 'categorical__house_q03_2.0',
       'categorical__house_q03_infrequent_sklearn',
       'categorical__house_q06_4.0',
       'categorical__house_q06_infrequent_sklearn',
       'categorical__edu_q11_13.0', 'categorical__edu_q11_infrequent_sklearn',
       'categorical__edu_q17_13.0', 'categorical__edu_q17_infrequent_sklearn',
       'categorical__edu_q23_infrequent_sklearn',
       'categorical__edu_q28_infrequent_sklearn',
       'categorical__edu_q42_infrequent_sklearn',
       'categorical__edu_q63_infrequent_sklearn'




In [None]:
a1 = combined_transformed_train_with_num_pov.copy()
a1["house_q15"] = pd.cut(
    a1["house_q15"],
    [0, 30, 40, 50, 60, 70, 80, 90, 100],
    labels=[1, 2, 3, 4, 5, 6, 7, 8],
)

In [None]:
[1.7508815852971797, 1.9101396393150984, 1.7085325830171214]  # knn means

In [6]:
[1.8779598721204045, 1.9174010676883153, 1.7137433760356882]

[1.8779598721204045, 1.9174010676883153, 1.7137433760356882]

In [None]:
# part = data[1]
# masks = [
#     part["house_q05y"] <= 25,
#     part["house_q05y"] <= 35,
#     part["house_q05y"] <= 55,
#     part["house_q05y"] <= 60,
#     part["house_q05y"] <= 75,
#     (part["house_q06"] == 1) | (part["house_q06"] == 3),
#     part["house_q09"] < 1,
#     part["house_q09"] <= 1,
#     part["house_q09"] <= 3,
#     part["edu_q01"] == 1,
#     part["edu_q01"] == 3,
#     part["edu_q02"] == 1,
#     part["edu_q02"] == 3,
#     part["edu_q04"] <= 3,
#     part["edu_q04"] <= 5,
#     part["edu_q05"] <= 7,
#     part["edu_q07"] == 0,
#     part["edu_q07"] <= 1,
# ]
# for index, mask in enumerate(masks):
#     print(index)
#     try:
#         best_values, best_params = [], []
#         for mask in [mask, ~mask]:
#             poi = part[mask]
#             X, y, y_binarized = (
#                 poi.drop(Y_COLUMNS + ["num_pov"], axis=1),
#                 poi["num_pov"],
#                 poi[Y_COLUMNS],
#             )
#             study = optuna.create_study(direction="minimize")
#             study.optimize(
#                 load_objective(X, y, y_binarized),
#                 n_trials=100,
#                 n_jobs=-1,
#                 show_progress_bar=True,
#             )
#             best_values.append(study.best_value)
#             best_params.append(study.best_params)
#         print(best_values), print(best_params)
#     except Exception as e:
#         print(e)

#     print()

In [40]:
for d in get_divided_edu(
    remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
):
    print(d.shape)

(139, 29)
(5138, 43)
(57, 82)


In [12]:
data = get_divided_edu(
    remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
)
p = data[1]
p

Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,...,edu_q09,edu_q10,edu_q11,edu_q12,edu_q13,edu_q17,edu_q18,edu_q19,edu_q20,num_pov
0,30_8_1,0,0,0,1,0,0,0,0,0,...,-999.0,-999.0,13.0,-999.0,-999.0,13.0,14.0,2.0,-999.0,4
1,194_1_2,1,0,0,0,0,0,0,0,0,...,-999.0,-999.0,13.0,-999.0,-999.0,13.0,14.0,2.0,-999.0,1
2,224_6_1,0,0,1,0,0,0,0,0,0,...,-999.0,-999.0,13.0,-999.0,-999.0,13.0,14.0,2.0,-999.0,3
3,323_10_1,0,0,0,0,1,0,0,0,0,...,-999.0,-999.0,13.0,-999.0,-999.0,13.0,14.0,2.0,-999.0,5
4,428_10_1,0,0,0,1,0,0,0,0,0,...,-999.0,-999.0,14.0,-999.0,-999.0,14.0,20.0,2.0,-999.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,571_8_1,0,0,1,0,0,0,0,0,0,...,-999.0,-999.0,2.0,-999.0,-999.0,2.0,10.0,2.0,-999.0,3
5333,601_5_1,0,0,0,1,0,0,0,0,0,...,-999.0,-999.0,13.0,-999.0,-999.0,13.0,14.0,2.0,-999.0,4
5334,782_1_1,0,1,0,0,0,0,0,0,0,...,-999.0,-999.0,13.0,-999.0,-999.0,13.0,18.0,2.0,-999.0,2
5335,606_3_1,0,0,0,0,1,0,0,0,0,...,-999.0,-999.0,2.0,-999.0,-999.0,2.0,22.0,2.0,-999.0,5


In [11]:
from importlib import reload
import data
import models

reload(data)
reload(models)
from data import *
from models import *

In [5]:
imputer_strategy = "median"

preprocessor = get_preprocessor(
    imputer_strategy=[
        "most_frequent",
        "most_frequent",
        imputer_strategy,
        imputer_strategy,
    ],
    remainder="drop",
)

X_train = preprocessor.fit_transform(
    remove_boring_columns(transform_all_house(combined_transformed_train_with_num_pov))
)
X_train

Unnamed: 0,binary__house_q02_2.0,binary__house_q07_1.0,binary__house_q07_2.0,binary__house_q11_2.0,binary__house_q14_2.0,binary__house_q17_1.0,binary__house_q17_5.0,binary__house_q20_2.0,binary__edu_q03_2.0,binary__edu_q08_1.0,...,ordinal__edu_q05,ordinal__edu_q06,ordinal__edu_q12,ordinal__edu_q13,ordinal__edu_q21,ordinal__edu_q22,ordinal__edu_q44,ordinal__edu_q48,ordinal__edu_q53,ordinal__edu_q66
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.995040,0.990099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.995040,0.990099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5333,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.999008,0.991089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5334,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.995040,0.995050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5335,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.995040,0.998020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
best_values

[1.9493473708185856, 1.916132109257349, 1.7147438366189351]

In [4]:
best_values

[1.9493431208424308]