In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import Y_COLUMNS, combined_train, sample_submission


# Seed reused for the train/validation split and for the logistic-regression solver.
SEED = 842
DATA_DIR = "processed"

# Separate the target indicator columns (Y_COLUMNS) from the feature columns.
y_binarized = combined_train[Y_COLUMNS]
X = combined_train.drop(columns=Y_COLUMNS)

# Collapse the indicator matrix into one integer per row: the position of the
# largest entry within Y_COLUMNS.
y = y_binarized.values.argmax(axis=1)

# Filtering columns by missing-value ratio is not done here; it is applied
# later, with the threshold treated as a tunable hyperparameter.
# `house_q10` is removed by name, then the first remaining column is removed
# by position.
X = X.drop(columns=["house_q10"]).iloc[:, 1:]
X

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,house_q02,house_q03,house_q04,house_q05y,house_q05m,house_q06,house_q07,house_q08,house_q09,house_q11,...,edu_q57,edu_q58,edu_q59,edu_q60,edu_q61,edu_q62,edu_q63,edu_q64,edu_q65,edu_q66
0,1.0,1.0,19680615.0,44.0,2.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
1,2.0,2.0,19640910.0,48.0,0.0,1.0,1.0,1.0,0.0,2.0,...,,,,,,,,,,
2,1.0,1.0,19510317.0,61.0,5.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
3,1.0,1.0,19460402.0,66.0,5.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
4,2.0,1.0,19400407.0,72.0,5.0,4.0,,,0.0,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,2.0,1.0,19390126.0,73.0,7.0,4.0,,,0.0,2.0,...,,,,,,,,,,
5333,1.0,1.0,19520312.0,60.0,6.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
5334,1.0,1.0,19570125.0,55.0,7.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
5335,1.0,1.0,19581225.0,53.0,8.0,1.0,1.0,2.0,0.0,1.0,...,,,,,,,,,,


In [2]:
import logging
import sys

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Iteration cap passed as `max_iter` to LogisticRegression during the search.
MAX_ITER = 20_000


def objective(trial: optuna.Trial):
    """Score one hyperparameter configuration for the Optuna study.

    Samples preprocessing and elastic-net logistic-regression settings from
    ``trial``, fits the pipeline on a single random train/validation split of
    the module-level ``X`` / ``y`` / ``y_binarized`` and returns the
    validation log loss.

    Args:
        trial: Optuna trial used to sample hyperparameters and to store
            diagnostics as user attributes (``n_iter``, ``train_score``,
            ``train_loss``, ``valid_loss_shift``).

    Returns:
        Log loss on the held-out split, summed over samples and divided by
        the number of held-out samples.
    """
    # --- search space (names, ranges and suggestion order left untouched) ---
    null_threshold = trial.suggest_float("null_threshold", 0.05, 0.5)
    cv = trial.suggest_int("cv", 3, 5)
    imputer_strategy = trial.suggest_categorical(
        "imputer_strategy", ["mean", "median", "most_frequent"]
    )
    scaler_name = trial.suggest_categorical("scaler", ["minmax", "standard"])
    C = trial.suggest_float("C", 1e-10, 1e10, log=True)
    l1_ratio = trial.suggest_float("l1_ratio", 0, 1)

    # Discard columns whose fraction of missing values exceeds the threshold.
    null_fraction = X.isnull().mean()
    X_cleaned = X.drop(columns=X.columns[null_fraction > null_threshold])

    # `cv` only determines the hold-out share (1 / cv); one split is evaluated.
    split = train_test_split(
        X_cleaned, y, y_binarized, test_size=1 / cv, random_state=SEED
    )
    X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = split

    # Imputer and scaler are fitted on the training part only.
    imputer = SimpleImputer(strategy=imputer_strategy)
    scaler = StandardScaler() if scaler_name == "standard" else MinMaxScaler()
    X_train_ready = scaler.fit_transform(imputer.fit_transform(X_train))
    X_valid_ready = scaler.transform(imputer.transform(X_valid))

    model = LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        C=C,
        l1_ratio=l1_ratio,
        max_iter=MAX_ITER,
        random_state=SEED,
    )
    model.fit(X_train_ready, y_train)

    def mean_log_loss(targets, features):
        # Summed log loss divided by the sample count.
        summed = log_loss(targets, model.predict_proba(features), normalize=False)
        return summed / len(targets)

    train_loss = mean_log_loss(y_train_binarized, X_train_ready)
    valid_loss = mean_log_loss(y_valid_binarized, X_valid_ready)

    # Diagnostics stored with the trial for later inspection.
    diagnostics = {
        "n_iter": model.n_iter_.tolist()[0],
        "train_score": model.score(X_train_ready, y_train),
        "train_loss": train_loss,
        "valid_loss_shift": valid_loss - train_loss,
    }
    for attr_name, attr_value in diagnostics.items():
        trial.set_user_attr(attr_name, attr_value)
    return valid_loss


# Send Optuna's log records to stdout, limited to WARNING level and above.
stdout_handler = logging.StreamHandler(sys.stdout)
optuna.logging.get_logger("optuna").addHandler(stdout_handler)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Trials are persisted in a local SQLite file; if a study with this name
# already exists it is reused (load_if_exists=True).
storage = "sqlite:///logistic_regression.db"
study_name = "logistic_regression3"
study = optuna.create_study(
    study_name=study_name,
    storage=storage,
    direction="minimize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=500, n_jobs=-1, show_progress_bar=True)

Best trial: 476. Best value: 1.92075: 100%|██████████| 500/500 [40:31<00:00,  4.86s/it]


In [3]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Re-open a previously populated study and take its best trial.
# NOTE(review): this reads "logistic_regression2", whereas the search cell
# above writes to "logistic_regression3" — confirm the older study is the one
# intended for the final model.
storage = "sqlite:///logistic_regression.db"
study_name = "logistic_regression2"
# load_study fails immediately when the study does not exist, instead of
# silently creating an empty study in the database as
# create_study(load_if_exists=True) would.
study = optuna.load_study(study_name=study_name, storage=storage)
MAX_ITER = 20000
best_trial = study.best_trial
best_trial

FrozenTrial(number=623, state=TrialState.COMPLETE, values=[1.9020280477225042], datetime_start=datetime.datetime(2024, 11, 7, 11, 49, 32, 121717), datetime_complete=datetime.datetime(2024, 11, 7, 11, 51, 16, 904923), params={'null_threshold': 0.0674840556372622, 'test_ratio': 0.1320960580761783, 'imputer_strategy': 'most_frequent', 'scaler': 'standard', 'C': 0.05327131225055067, 'l1_ratio': 0.1151949789987428}, user_attrs={'coef': [-0.1420053610844146, 0.0, 0.0, -0.14400165773680978, -0.04711043160354235, 0.15511454766542168, -0.018516833329619373, 0.00032491916954098814, 0.07818310595902957, -0.08826892761423724, -0.12377169430317367, 0.0, -0.03821530199355885, 0.21999530602718814, -0.0837086145347963, 0.10604480257299863, -0.09325976987746175, 0.0, 0.008749588201858562, 0.0, 0.0, -0.10779818094676052, -0.36614192794972555, 0.0], 'intercept': -0.34528697516966683, 'n_iter': 20000, 'train_score': 0.2290587219343696}, system_attrs={}, intermediate_values={}, distributions={'null_thresho

In [4]:
# Refit the full preprocessing + model pipeline on every training row, using
# the hyperparameters stored in the best trial.
best_params = best_trial.params

# Same missing-value column filter as in the search, computed on all of X.
null_threshold = best_params["null_threshold"]
dropped_columns = X.columns[X.isnull().mean() > null_threshold]
X_cleaned = X.drop(columns=dropped_columns)

final_imputer = SimpleImputer(strategy=best_params["imputer_strategy"])
X_imputed = final_imputer.fit_transform(X_cleaned)

scaler_choice = best_params["scaler"]
if scaler_choice == "standard":
    final_scaler = StandardScaler()
elif scaler_choice == "minmax":
    final_scaler = MinMaxScaler()
X_scaled = final_scaler.fit_transform(X_imputed)

C = best_params["C"]
l1_ratio = best_params["l1_ratio"]
# The final fit is given five times the iteration budget used in the search.
final_model = LogisticRegression(
    solver="saga",
    penalty="elasticnet",
    l1_ratio=l1_ratio,
    C=C,
    random_state=SEED,
    max_iter=MAX_ITER * 5,
)
final_model.fit(X_scaled, y)

# Log loss on the training data itself (not a held-out estimate).
final_y_train_pred = final_model.predict_proba(X_scaled)
log_loss(y_binarized, final_y_train_pred)



1.9085195938311237

In [5]:
from data import generate_submission, combined_test

# Build the submission feature matrix from exactly the columns, in exactly the
# order, that the final imputer/scaler/model were fitted on (`X_cleaned`).
# Selecting by name avoids replaying the training-time drops on the test frame,
# one of which was positional (`iloc[:, 1:]`) and therefore depended on train
# and test sharing the same column order.
X_submissions = combined_test.loc[:, X_cleaned.columns]
X_submissions_imputed = final_imputer.transform(X_submissions)
X_submissions_scaled = final_scaler.transform(X_submissions_imputed)
final_y_pred = final_model.predict_proba(X_submissions_scaled)

generate_submission(final_y_pred, "elasticnet-logistic")

Submission file saved as elasticnet-logistic-3.csv
