In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer


SEED = 842
DATA_DIR = "processed"
combined_train = pd.read_csv(os.path.join(DATA_DIR, "combined_train.csv"))

y_columns = [f"subjective_poverty_{i}" for i in range(1, 11)]
X, y_binarized = (
    combined_train.drop(y_columns, axis=1),
    combined_train[y_columns],
)

y = np.argmax(y_binarized.values, axis=1)

null_threshold = 0.2
X = X.drop(X.columns[X.isnull().mean() > null_threshold], axis=1)
X = X.drop(["house_q10"], axis=1)
X = X.iloc[:, 4:]
X

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,house_q05y,house_q05m,house_q06,house_q07,house_q08,house_q09,house_q11,house_q13,house_q14,house_q17,...,edu_q04,edu_q05,edu_q06,edu_q07,edu_q08,edu_q11,edu_q14,edu_q17,edu_q18,edu_q19
0,44.0,2.0,1.0,1.0,2.0,0.0,2.0,2.0,1.0,2.0,...,1.0,8.0,2.0,1.0,2.0,13.0,2.0,13.0,14.0,2.0
1,48.0,0.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
2,61.0,5.0,1.0,1.0,2.0,0.0,2.0,1.0,2.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
3,66.0,5.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
4,72.0,5.0,4.0,,,0.0,2.0,1.0,2.0,2.0,...,1.0,4.0,1.0,0.0,2.0,14.0,2.0,14.0,20.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,73.0,7.0,4.0,,,0.0,2.0,1.0,2.0,2.0,...,1.0,4.0,1.0,0.0,2.0,2.0,2.0,2.0,10.0,2.0
5333,60.0,6.0,1.0,1.0,2.0,0.0,2.0,1.0,2.0,2.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
5334,55.0,7.0,1.0,1.0,2.0,0.0,2.0,1.0,1.0,2.0,...,5.0,4.0,6.0,0.0,2.0,13.0,2.0,13.0,18.0,2.0
5335,53.0,8.0,1.0,1.0,2.0,0.0,1.0,,,2.0,...,6.0,4.0,9.0,0.0,2.0,2.0,2.0,2.0,22.0,2.0


In [2]:
import logging
import sys

from sklearn.impute import SimpleImputer

X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
    train_test_split(X, y, y_binarized, test_size=0.2, random_state=SEED)
)
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)
X_valid_imputed = imputer.transform(X_valid)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_valid = scaler.transform(X_valid_imputed)


def objective(trial: optuna.Trial):
    C = trial.suggest_float("C", 1e-10, 1e10, log=True)
    model = LogisticRegression(C=C, max_iter=10000, random_state=SEED)
    model.fit(X_train, y_train)
    trial.set_user_attr("coef", model.coef_.tolist()[0])
    trial.set_user_attr("intercept", model.intercept_.tolist()[0])
    trial.set_user_attr("n_iter", model.n_iter_.tolist()[0])
    trial.set_user_attr("train_score", model.score(X_train, y_train))
    y_pred = model.predict_proba(X_valid)
    return log_loss(y_valid_binarized, y_pred, normalize=False) / len(y_valid_binarized)


optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_name = "logistic_regression"
storage = f"sqlite:///{study_name}.db"
if os.path.exists(f"{study_name}.db"):
    os.remove(f"{study_name}.db")
study = optuna.create_study(
    direction="minimize", storage=storage, study_name=study_name
)
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

Best trial: 92. Best value: 1.92919: 100%|██████████| 100/100 [00:13<00:00,  7.41it/s]


In [3]:
# use the weights in the best trial for the final model
best_trial = study.best_trial
weights = best_trial.user_attrs["coef"]
intercept = best_trial.user_attrs["intercept"]
train_score = best_trial.user_attrs["train_score"]
C = best_trial.params["C"]
model = LogisticRegression(C=C, max_iter=10000, random_state=SEED)
model.fit(X_train, y_train)

log_loss(
    y_binarized,
    model.predict_proba(scaler.transform(imputer.transform(X))),
    normalize=False,
) / len(y_binarized)

1.9148939653045414

In [4]:
best_trial.user_attrs

{'coef': [-0.26739639402735926,
  -0.10267020100912445,
  0.5020697626915188,
  0.22504418388672823,
  0.7119821786518152,
  -0.1670062089071004,
  0.10422753177135977,
  -0.1196319909251481,
  -0.19786149090738073,
  0.5336711348135713,
  -0.7015405359552571,
  -0.24786531703832382,
  0.11284576681946595,
  -0.1065481306521545,
  1.0981656778446263,
  -0.6595974403608177,
  0.7858808519294372,
  -0.9974247778459501,
  -0.025376978301565038,
  -0.05259896330237751,
  0.22804305300068217,
  -0.2967442826003086,
  -0.610049517279864,
  -1.182770151371228,
  0.12540570807955023],
 'intercept': 0.17696023494886334,
 'n_iter': 245,
 'train_score': 0.22557976106816585}