In [8]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler

from data import Y_COLUMNS, combined_train, sample_submission


SEED = 842
DATA_DIR = "processed"

X, y_binarized = (
    combined_train.drop(Y_COLUMNS, axis=1),
    combined_train[Y_COLUMNS],
)

y = np.argmax(y_binarized.values, axis=1)

null_threshold = 0.2
dropped_columns = X.columns[X.isnull().mean() > null_threshold]
X = X.drop(dropped_columns, axis=1)
X = X.drop(["house_q10"], axis=1)
X = X.iloc[:, 1:]
X

Unnamed: 0,house_q02,house_q03,house_q04,house_q05y,house_q05m,house_q06,house_q07,house_q08,house_q09,house_q11,...,edu_q04,edu_q05,edu_q06,edu_q07,edu_q08,edu_q11,edu_q14,edu_q17,edu_q18,edu_q19
0,1.0,1.0,19680615.0,44.0,2.0,1.0,1.0,2.0,0.0,2.0,...,1.0,8.0,2.0,1.0,2.0,13.0,2.0,13.0,14.0,2.0
1,2.0,2.0,19640910.0,48.0,0.0,1.0,1.0,1.0,0.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
2,1.0,1.0,19510317.0,61.0,5.0,1.0,1.0,2.0,0.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
3,1.0,1.0,19460402.0,66.0,5.0,1.0,1.0,2.0,0.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
4,2.0,1.0,19400407.0,72.0,5.0,4.0,,,0.0,2.0,...,1.0,4.0,1.0,0.0,2.0,14.0,2.0,14.0,20.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,2.0,1.0,19390126.0,73.0,7.0,4.0,,,0.0,2.0,...,1.0,4.0,1.0,0.0,2.0,2.0,2.0,2.0,10.0,2.0
5333,1.0,1.0,19520312.0,60.0,6.0,1.0,1.0,2.0,0.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
5334,1.0,1.0,19570125.0,55.0,7.0,1.0,1.0,2.0,0.0,2.0,...,5.0,4.0,6.0,0.0,2.0,13.0,2.0,13.0,18.0,2.0
5335,1.0,1.0,19581225.0,53.0,8.0,1.0,1.0,2.0,0.0,1.0,...,6.0,4.0,9.0,0.0,2.0,2.0,2.0,2.0,22.0,2.0


In [2]:
import logging
import sys

from sklearn.impute import SimpleImputer

MAX_ITER = 10000

X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
    train_test_split(X, y, y_binarized, test_size=0.2, random_state=SEED)
)
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)
X_valid_imputed = imputer.transform(X_valid)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_valid = scaler.transform(X_valid_imputed)


def objective(trial: optuna.Trial):
    C = trial.suggest_float("C", 1e-10, 1e10, log=True)
    l1_ratio = trial.suggest_float("l1_ratio", 0, 1)

    model = LogisticRegression(
        C=C,
        l1_ratio=l1_ratio,
        penalty="elasticnet",
        max_iter=10000,
        solver="saga",
        random_state=SEED,
    )
    model.fit(X_train, y_train)
    trial.set_user_attr("coef", model.coef_.tolist()[0])
    trial.set_user_attr("intercept", model.intercept_.tolist()[0])
    trial.set_user_attr("n_iter", model.n_iter_.tolist()[0])
    trial.set_user_attr("train_score", model.score(X_train, y_train))
    y_pred = model.predict_proba(X_valid)
    return log_loss(y_valid_binarized, y_pred, normalize=False) / len(y_valid_binarized)


optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_name = "logistic_regression"
storage = f"sqlite:///{study_name}.db"
if os.path.exists(f"{study_name}.db"):
    os.remove(f"{study_name}.db")
study = optuna.create_study(
    direction="minimize", storage=storage, study_name=study_name
)
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

Best trial: 87. Best value: 1.93004: 100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


In [3]:
final_imputer = SimpleImputer(strategy="mean")
X_imputed = final_imputer.fit_transform(X)
final_scaler = MinMaxScaler()
X_scaled = final_scaler.fit_transform(X_imputed)

final_model = LogisticRegression(
    **study.best_trial.params, max_iter=MAX_ITER, random_state=SEED
)
final_model.fit(X_scaled, y)
final_y_train_pred = final_model.predict_proba(X_scaled)

log_loss(y_binarized, final_y_train_pred)



1.9074939328245755

In [None]:
from data import combined_test, PROCESSED_DIR

X_submissions = combined_test.drop(Y_COLUMNS, axis=1)
X_submissions = X_submissions.drop(dropped_columns, axis=1)
X_submissions = X_submissions.drop(["house_q10"], axis=1)
X_submissions = X_submissions.iloc[:, 1:]
X_submissions_imputed = final_imputer.transform(X_submissions)
X_submissions_scaled = final_scaler.transform(X_submissions_imputed)
final_y_pred = final_model.predict_proba(X_submissions_scaled)

pd.DataFrame(
    np.hstack([sample_submission["psu_hh_idcode"].values.reshape(-1, 1), final_y_pred]),
    columns=["psu_hh_idcode"] + Y_COLUMNS,
).to_csv(f"{PROCESSED_DIR}/elasticnet-logistic-1.csv", index=False)