In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler

SEED = 841
DATA_DIR = "processed"
combined_train = pd.read_csv(os.path.join(DATA_DIR, "combined_train.csv"))

y_columns = [f"subjective_poverty_{i}" for i in range(1, 11)]
X, y = combined_train.drop(y_columns, axis=1), combined_train["num_pov"]

null_threshold = 0.2
X = X.drop(X.columns[X.isnull().mean() > null_threshold], axis=1)
X = X.drop(["house_q10"], axis=1)
X = X.iloc[:, 4:]
X

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,num_pov,house_q02,house_q03,house_q04,house_q05y,house_q05m,house_q06,house_q07,house_q08,house_q09,...,edu_q04,edu_q05,edu_q06,edu_q07,edu_q08,edu_q11,edu_q14,edu_q17,edu_q18,edu_q19
0,4,1.0,1.0,19680615.0,44.0,2.0,1.0,1.0,2.0,0.0,...,1.0,8.0,2.0,1.0,2.0,13.0,2.0,13.0,14.0,2.0
1,1,2.0,2.0,19640910.0,48.0,0.0,1.0,1.0,1.0,0.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
2,3,1.0,1.0,19510317.0,61.0,5.0,1.0,1.0,2.0,0.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
3,5,1.0,1.0,19460402.0,66.0,5.0,1.0,1.0,2.0,0.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
4,4,2.0,1.0,19400407.0,72.0,5.0,4.0,,,0.0,...,1.0,4.0,1.0,0.0,2.0,14.0,2.0,14.0,20.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,3,2.0,1.0,19390126.0,73.0,7.0,4.0,,,0.0,...,1.0,4.0,1.0,0.0,2.0,2.0,2.0,2.0,10.0,2.0
5333,4,1.0,1.0,19520312.0,60.0,6.0,1.0,1.0,2.0,0.0,...,1.0,8.0,2.0,0.0,2.0,13.0,2.0,13.0,14.0,2.0
5334,2,1.0,1.0,19570125.0,55.0,7.0,1.0,1.0,2.0,0.0,...,5.0,4.0,6.0,0.0,2.0,13.0,2.0,13.0,18.0,2.0
5335,5,1.0,1.0,19581225.0,53.0,8.0,1.0,1.0,2.0,0.0,...,6.0,4.0,9.0,0.0,2.0,2.0,2.0,2.0,22.0,2.0


In [None]:
from sklearn.impute import SimpleImputer
from optuna_dashboard import run_server

def objective(trial: optuna.Trial):
    imputer = SimpleImputer(strategy="mean")
    X_imputed = imputer.fit_transform(X)

    X_train, X_valid, y_train, y_valid = train_test_split(X_imputed, y, test_size=0.2, random_state=SEED)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    C = trial.suggest_float("C", 1e-10, 1e10, log=True)
    model = LogisticRegression(C=C, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_valid)
    return log_loss(y_valid, y_pred, normalize=False) / len(y_valid)

storage = optuna.storages.InMemoryStorage()
study = optuna.create_study(direction="minimize", storage=storage, study_name="logistic_regression")
study.optimize(objective, n_trials=100, n_jobs=-1)
run_server(storage)

[I 2024-11-06 16:43:21,363] A new study created in memory with name: logistic_regression
[I 2024-11-06 16:43:22,009] Trial 0 finished with value: 1.9820208547097373 and parameters: {'C': 0.0006119453700199882}. Best is trial 0 with value: 1.9820208547097373.
[I 2024-11-06 16:43:22,049] Trial 4 finished with value: 1.9859010641645807 and parameters: {'C': 0.00038593410155680034}. Best is trial 0 with value: 1.9820208547097373.
[I 2024-11-06 16:43:22,121] Trial 3 finished with value: 1.953208740418042 and parameters: {'C': 0.002458078642684661}. Best is trial 3 with value: 1.953208740418042.
[I 2024-11-06 16:43:22,338] Trial 15 finished with value: 1.9922578879653627 and parameters: {'C': 3.149536370944646e-05}. Best is trial 3 with value: 1.953208740418042.
[I 2024-11-06 16:43:22,462] Trial 1 finished with value: 1.992459897051716 and parameters: {'C': 2.003136485109265e-05}. Best is trial 3 with value: 1.953208740418042.
[I 2024-11-06 16:43:23,068] Trial 19 finished with value: 1.98802