In [1]:
import optuna

from data import Y_COLUMNS, combined_train_with_num_pov


X, y_binarized, y = (
    combined_train_with_num_pov.drop(Y_COLUMNS + ["num_pov"], axis=1),
    combined_train_with_num_pov[Y_COLUMNS],
    combined_train_with_num_pov["num_pov"],
)
X = X.drop(["house_q10"], axis=1)
X = X.iloc[:, 1:]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from data import get_preprocessor

MAX_ITER = 1000

optuna.logging.set_verbosity(optuna.logging.WARNING)
seed_search_range = range(1000)
best_values = []
for SEED in seed_search_range:

    def objective(trial: optuna.Trial):
        null_threshold = trial.suggest_float("null_threshold", 1e-30, 0.5)
        C = trial.suggest_float("C", 1e-10, 1e10, log=True)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        intercept_scaling = trial.suggest_float(
            "intercept_scaling", 1e-10, 1e10, log=True
        )

        dropped_columns = X.columns[X.isnull().mean() > null_threshold]
        X_cleaned = X.drop(dropped_columns, axis=1)

        X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
            train_test_split(
                X_cleaned, y, y_binarized, test_size=1 / 5, random_state=SEED
            )
        )

        preprocessor = get_preprocessor(
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )
        X_train = preprocessor.fit_transform(X_train)
        X_valid = preprocessor.transform(X_valid)

        model = LinearSVC(
            C=C,
            penalty=penalty,
            intercept_scaling=intercept_scaling,
            max_iter=MAX_ITER,
            random_state=SEED,
        )
        model.fit(X_train, y_train)
        calibration_method = trial.suggest_categorical(
            "calibration_method", ["sigmoid", "isotonic"]
        )
        calibrated_model = CalibratedClassifierCV(
            model, cv="prefit", method=calibration_method
        )
        calibrated_model.fit(X_train, y_train)
        y_pred = calibrated_model.predict_proba(X_valid)
        valid_loss = log_loss(y_valid_binarized, y_pred, normalize=False) / len(
            y_valid_binarized
        )
        return valid_loss

    try:
        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=50, n_jobs=-1, show_progress_bar=True)
        best_values.append(study.best_value)
    except Exception as e:
        print(e)
        best_values.append(np.nan)
best_values

In [8]:
np.array(best_values).argsort()

array([662,  16, 713, 850, 183, 391, 957,  68, 934, 264, 746, 259, 599,
        99, 177, 589, 590, 436,  84, 396, 635, 772, 853, 893, 379, 628,
       704,  93, 303,  21, 657,  61, 193,  34, 640, 666, 380, 705, 583,
       197, 272, 369, 784, 919, 722, 322, 239, 311, 360, 608, 899, 701,
       626, 180, 834, 975, 209, 331, 792, 869, 907,  72, 910, 656, 109,
       691, 769, 450, 502, 808, 849, 566, 775, 925, 319, 820, 539, 206,
       283, 670, 524, 201, 310, 913, 660, 718, 564, 500, 515, 806, 731,
       174,  20, 456, 165, 372, 527, 813, 293, 672, 522, 683, 223, 240,
       410, 572, 544, 440, 428, 614, 685, 147, 489, 478, 409, 431, 888,
       543, 645, 711, 994, 616, 176, 190, 411, 858, 728,  71, 842, 421,
       375, 674, 312, 213, 881,  45, 280, 802, 953,  69, 346, 901, 156,
       827,  81, 279, 437, 631,  78, 255, 359, 702, 673, 821, 896, 172,
       852, 444,  19, 121, 946, 818, 173, 234, 476, 400, 313, 452, 963,
       191, 286, 203, 250, 422, 999, 574, 637, 166, 636, 164, 34