In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import (
    Y_COLUMNS,
    combined_train_with_num_pov,
)


SEED = 662
DATA_DIR = "processed"

X, y_binarized, y = (
    combined_train_with_num_pov.drop(Y_COLUMNS + ["num_pov"], axis=1),
    combined_train_with_num_pov[Y_COLUMNS],
    combined_train_with_num_pov["num_pov"],
)
X = X.drop(["house_q10"], axis=1)
X = X.iloc[:, 1:]
X

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,house_q02,house_q03,house_q04,house_q05y,house_q05m,house_q06,house_q07,house_q08,house_q09,house_q11,...,edu_q57,edu_q58,edu_q59,edu_q60,edu_q61,edu_q62,edu_q63,edu_q64,edu_q65,edu_q66
0,1.0,1.0,19680615.0,44.0,2.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
1,2.0,2.0,19640910.0,48.0,0.0,1.0,1.0,1.0,0.0,2.0,...,,,,,,,,,,
2,1.0,1.0,19510317.0,61.0,5.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
3,1.0,1.0,19460402.0,66.0,5.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
4,2.0,1.0,19400407.0,72.0,5.0,4.0,,,0.0,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,2.0,1.0,19390126.0,73.0,7.0,4.0,,,0.0,2.0,...,,,,,,,,,,
5333,1.0,1.0,19520312.0,60.0,6.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
5334,1.0,1.0,19570125.0,55.0,7.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
5335,1.0,1.0,19581225.0,53.0,8.0,1.0,1.0,2.0,0.0,1.0,...,,,,,,,,,,


In [2]:
import logging
import sys

from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from data import get_preprocessor

MAX_ITER = 20000
IMPUTER_STRATEGIES = ["mean", "median", "most_frequent"]


def objective(trial: optuna.Trial):
    null_threshold = trial.suggest_float("null_threshold", 0, 0.5)
    cv = trial.suggest_int("cv", 3, 5)
    C = trial.suggest_float("C", 1e-5, 1e5, log=True)
    l1_ratio = trial.suggest_float("l1_ratio", 0, 1)
    imputer_strategy = trial.suggest_categorical("imputer_strategy", IMPUTER_STRATEGIES)

    dropped_columns = X.columns[X.isnull().mean() > null_threshold]
    X_cleaned = X.drop(dropped_columns, axis=1)

    train_losses, valid_losses, valid_loss_shift, n_iters = [], [], [], []
    X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
        train_test_split(X_cleaned, y, y_binarized, test_size=1 / cv, random_state=SEED)
    )  # lazy, just do 1 round of cv

    # StratifiedKFold does not work well with our data
    # for train_indices, valid_indices in StratifiedKFold(
    #     n_splits=cv, shuffle=True, random_state=SEED
    # ).split(X_cleaned, y):
    #     X_train, X_valid = X_cleaned.iloc[train_indices], X_cleaned.iloc[valid_indices]
    #     y_train, y_valid = y[train_indices], y[valid_indices]
    #     y_train_binarized, y_valid_binarized = (
    #         y_binarized.iloc[train_indices],
    #         y_binarized.iloc[valid_indices],
    #     )

    preprocessor = get_preprocessor(
        imputer_strategy=[
            "most_frequent",
            "most_frequent",
            imputer_strategy,
            imputer_strategy,
        ],
        remainder="drop",
    )

    X_train = preprocessor.fit_transform(X_train)
    X_valid = preprocessor.transform(X_valid)

    model = LogisticRegression(
        C=C,
        l1_ratio=l1_ratio,
        penalty="elasticnet",
        max_iter=MAX_ITER,
        solver="saga",
        random_state=SEED,
    )
    model.fit(X_train, y_train)
    train_losses.append(
        log_loss(y_train_binarized, model.predict_proba(X_train), normalize=False)
        / len(y_train_binarized)
    )

    y_pred = model.predict_proba(X_valid)
    valid_losses.append(
        log_loss(y_valid_binarized, y_pred, normalize=False) / len(y_valid_binarized)
    )
    valid_loss_shift.append(valid_losses[-1] - train_losses[-1])
    n_iters.append(model.n_iter_[0])

    mean_valid_loss = np.mean(valid_losses)
    trial.set_user_attr("n_iter", np.mean(n_iters))
    trial.set_user_attr("train_loss", np.mean(train_losses))
    trial.set_user_attr("valid_loss_shift", np.mean(valid_loss_shift))
    return mean_valid_loss


optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.WARNING)
storage = f"sqlite:///logistic_regression.db"
study_name = "logistic_regression8(w/ new seed)"
study = optuna.create_study(
    direction="minimize", storage=storage, study_name=study_name, load_if_exists=True
)
study.optimize(objective, n_trials=200, n_jobs=-1, show_progress_bar=True)

Best trial: 109. Best value: 1.89035: 100%|██████████| 200/200 [26:33<00:00,  7.97s/it]


In [3]:
best_trial = study.best_trial
null_threshold = best_trial.params["null_threshold"]
dropped_columns = X.columns[X.isnull().mean() > null_threshold]
X_cleaned = X.drop(dropped_columns, axis=1)

imputer_strategy = best_trial.params["imputer_strategy"]
final_preprocessor = get_preprocessor(
    imputer_strategy=[
        "most_frequent",
        "most_frequent",
        imputer_strategy,
        imputer_strategy,
    ],
    remainder="drop",
)
X_scaled = final_preprocessor.fit_transform(X)

C = best_trial.params["C"]
l1_ratio = best_trial.params["l1_ratio"]
final_model = LogisticRegression(
    penalty="elasticnet",
    C=C,
    l1_ratio=l1_ratio,
    max_iter=MAX_ITER * 5,
    random_state=SEED,
    solver="saga",
)

final_model.fit(X_scaled, y)
final_y_train_pred = final_model.predict_proba(X_scaled)
log_loss(y_binarized, final_y_train_pred)



1.890499820820199

In [None]:
from data import generate_submission, combined_test

X_submissions = combined_test.drop(Y_COLUMNS, axis=1)
X_submissions = X_submissions.drop(dropped_columns, axis=1)
X_submissions = X_submissions.drop(["house_q10"], axis=1)
X_submissions = X_submissions.iloc[:, 1:]
X_submissions_imputed = final_preprocessor.transform(X_submissions)
X_submissions_scaled = final_preprocessor.transform(X_submissions_imputed)
final_y_pred = final_model.predict_proba(X_submissions_scaled)

generate_submission(final_y_pred, "elasticnet-logistic")

Submission file saved as elasticnet-logistic-3.csv
