In [1]:
import numpy as np
import pandas as pd
import optuna

from data import Y_COLUMNS, combined_train, sample_submission


# Seed passed to every train/validation split and estimator below.
SEED = 842
DATA_DIR = "processed"

# One-hot encoded targets, kept as a frame because log_loss is evaluated
# against the indicator matrix later on.
y_binarized = combined_train[Y_COLUMNS]

# Integer class index per row: position of the largest entry in each
# one-hot row.
y = y_binarized.to_numpy().argmax(axis=1)

# Features: everything except the targets and `house_q10`; the first remaining
# column is then removed by position. Null-ratio based column filtering is not
# done here -- each Optuna trial samples its own `null_threshold`.
X = (
    combined_train.drop(columns=Y_COLUMNS)
    .drop(columns=["house_q10"])
    .iloc[:, 1:]
)
X

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,house_q02,house_q03,house_q04,house_q05y,house_q05m,house_q06,house_q07,house_q08,house_q09,house_q11,...,edu_q57,edu_q58,edu_q59,edu_q60,edu_q61,edu_q62,edu_q63,edu_q64,edu_q65,edu_q66
0,1.0,1.0,19680615.0,44.0,2.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
1,2.0,2.0,19640910.0,48.0,0.0,1.0,1.0,1.0,0.0,2.0,...,,,,,,,,,,
2,1.0,1.0,19510317.0,61.0,5.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
3,1.0,1.0,19460402.0,66.0,5.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
4,2.0,1.0,19400407.0,72.0,5.0,4.0,,,0.0,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5332,2.0,1.0,19390126.0,73.0,7.0,4.0,,,0.0,2.0,...,,,,,,,,,,
5333,1.0,1.0,19520312.0,60.0,6.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
5334,1.0,1.0,19570125.0,55.0,7.0,1.0,1.0,2.0,0.0,2.0,...,,,,,,,,,,
5335,1.0,1.0,19581225.0,53.0,8.0,1.0,1.0,2.0,0.0,1.0,...,,,,,,,,,,


In [2]:
import logging
import sys

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Iteration cap for LinearSVC during the hyperparameter search.
MAX_ITER = 20000


def objective(trial: optuna.Trial) -> float:
    """Score one calibrated-LinearSVC configuration for Optuna.

    Samples the preprocessing choices (null-ratio column filter, imputation
    strategy, scaler), the LinearSVC hyperparameters and the calibration
    method from ``trial``; fits everything on a random training split of the
    module-level ``X`` / ``y`` and evaluates on the held-out remainder.

    Args:
        trial: Optuna trial used to sample hyperparameters. The user
            attributes ``train_score``, ``train_loss`` and
            ``valid_loss_shift`` are recorded on it.

    Returns:
        Mean log loss of the calibrated probabilities on the held-out split
        (the value the study minimises).
    """
    null_threshold = trial.suggest_float("null_threshold", 0.05, 0.5)
    # `cv` only sets the hold-out fraction (1 / cv); no cross-validation is
    # performed in this function.
    # NOTE(review): because the hold-out size is itself tuned, trials are
    # scored on different validation sets -- confirm this is intended.
    cv = trial.suggest_int("cv", 3, 5)
    imputer_strategy = trial.suggest_categorical(
        "imputer_strategy", ["mean", "median", "most_frequent"]
    )
    scaler_name = trial.suggest_categorical("scaler", ["minmax", "standard"])
    C = trial.suggest_float("C", 1e-10, 1e10, log=True)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    intercept_scaling = trial.suggest_float("intercept_scaling", 1e-10, 1e10, log=True)

    # Drop features whose missing-value ratio exceeds the sampled threshold.
    dropped_columns = X.columns[X.isnull().mean() > null_threshold]
    X_cleaned = X.drop(dropped_columns, axis=1)

    (
        X_train_raw,
        X_valid_raw,
        y_train,
        _y_valid,
        y_train_binarized,
        y_valid_binarized,
    ) = train_test_split(
        X_cleaned, y, y_binarized, test_size=1 / cv, random_state=SEED
    )

    # Imputer and scaler are fitted on the training split only and then
    # applied unchanged to the validation split.
    imputer = SimpleImputer(strategy=imputer_strategy)
    # Dict lookup instead of if/elif: an unexpected name raises KeyError
    # instead of leaving a string where an estimator is expected.
    scaler = {"standard": StandardScaler, "minmax": MinMaxScaler}[scaler_name]()
    X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
    X_valid = scaler.transform(imputer.transform(X_valid_raw))

    model = LinearSVC(
        C=C,
        penalty=penalty,
        intercept_scaling=intercept_scaling,
        max_iter=MAX_ITER,
        random_state=SEED,
    )
    model.fit(X_train, y_train)

    # LinearSVC has no predict_proba; wrap the fitted model in a calibrator.
    calibration_method = trial.suggest_categorical(
        "calibration_method", ["sigmoid", "isotonic"]
    )
    calibrated_model = CalibratedClassifierCV(
        model, cv="prefit", method=calibration_method
    )
    # NOTE(review): the calibrator is fitted on the same rows the SVM was
    # trained on; scikit-learn's docs for cv="prefit" ask for disjoint data.
    # Left unchanged so stored trials stay comparable -- revisit.
    calibrated_model.fit(X_train, y_train)

    # log_loss averages over samples by default, so no manual division.
    train_loss = log_loss(y_train_binarized, calibrated_model.predict_proba(X_train))
    valid_loss = log_loss(y_valid_binarized, calibrated_model.predict_proba(X_valid))

    trial.set_user_attr("train_score", model.score(X_train, y_train))
    trial.set_user_attr("train_loss", train_loss)
    # Positive shift = validation loss worse than training loss.
    trial.set_user_attr("valid_loss_shift", valid_loss - train_loss)
    return valid_loss


# Mirror Optuna's log records to stdout so they show up in the cell output.
# Guarded so that re-running this cell (or the other tuning cell) does not
# stack a second stdout handler and print every record twice.
optuna_logger = optuna.logging.get_logger("optuna")
if not any(
    isinstance(handler, logging.StreamHandler) and handler.stream is sys.stdout
    for handler in optuna_logger.handlers
):
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.WARNING)

storage = "sqlite:///svm.db"
# Must match the name the final LinearSVC cell reads its best trial from
# ("linear-svc"); with the previous name "svm" that cell would not see the
# trials produced here.
study_name = "linear-svc"
study = optuna.create_study(
    direction="minimize", storage=storage, study_name=study_name, load_if_exists=True
)
# With load_if_exists=True the study persists in the SQLite file, so each
# run of this cell appends another 500 trials to it.
study.optimize(objective, n_trials=500, n_jobs=-1, show_progress_bar=True)

Best trial: 321. Best value: 1.92581: 100%|██████████| 500/500 [00:51<00:00,  9.71it/s]


In [2]:
import logging
import sys

from sklearn.svm import SVC
from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Iteration cap for the kernel SVC search.
# NOTE: this cell rebinds `MAX_ITER` and `objective` from the LinearSVC
# tuning cell; later cells see whichever definition ran last.
MAX_ITER = int(1e6)


def objective(trial: optuna.Trial) -> float:
    """Score one kernel-SVC configuration for Optuna.

    Samples the preprocessing choices (null-ratio column filter, imputation
    strategy, scaler) and the SVC hyperparameters (``C``, ``kernel`` and, for
    the polynomial kernel only, ``degree``) from ``trial``; fits on a random
    training split of the module-level ``X`` / ``y`` and evaluates on the
    held-out remainder.

    Args:
        trial: Optuna trial used to sample hyperparameters. The user
            attributes ``train_score``, ``train_loss`` and
            ``valid_loss_shift`` are recorded on it.

    Returns:
        Mean log loss of ``predict_proba`` on the held-out split (the value
        the study minimises).
    """
    null_threshold = trial.suggest_float("null_threshold", 0.05, 0.5)
    # `cv` only sets the hold-out fraction (1 / cv); no cross-validation is
    # performed in this function.
    # NOTE(review): because the hold-out size is itself tuned, trials are
    # scored on different validation sets -- confirm this is intended.
    cv = trial.suggest_int("cv", 3, 5)
    imputer_strategy = trial.suggest_categorical(
        "imputer_strategy", ["mean", "median", "most_frequent"]
    )
    scaler_name = trial.suggest_categorical("scaler", ["minmax", "standard"])
    C = trial.suggest_float("C", 1e-5, 1e5, log=True)
    kernel = trial.suggest_categorical("kernel", ["poly", "rbf", "sigmoid"])

    # Drop features whose missing-value ratio exceeds the sampled threshold.
    dropped_columns = X.columns[X.isnull().mean() > null_threshold]
    X_cleaned = X.drop(dropped_columns, axis=1)

    (
        X_train_raw,
        X_valid_raw,
        y_train,
        _y_valid,
        y_train_binarized,
        y_valid_binarized,
    ) = train_test_split(
        X_cleaned, y, y_binarized, test_size=1 / cv, random_state=SEED
    )

    # Imputer and scaler are fitted on the training split only and then
    # applied unchanged to the validation split.
    imputer = SimpleImputer(strategy=imputer_strategy)
    # Dict lookup instead of if/elif: an unexpected name raises KeyError
    # instead of leaving a string where an estimator is expected.
    scaler = {"standard": StandardScaler, "minmax": MinMaxScaler}[scaler_name]()
    X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
    X_valid = scaler.transform(imputer.transform(X_valid_raw))

    # `degree` is sampled (and passed) only for the polynomial kernel, so the
    # other kernels keep SVC's default and add no parameter to the trial.
    kernel_kwargs = {}
    if kernel == "poly":
        kernel_kwargs["degree"] = trial.suggest_int("degree", 2, 5)
    model = SVC(
        C=C,
        kernel=kernel,
        max_iter=MAX_ITER,
        random_state=SEED,
        probability=True,  # required for predict_proba
        **kernel_kwargs,
    )
    model.fit(X_train, y_train)

    # log_loss averages over samples by default, so no manual division.
    train_loss = log_loss(y_train_binarized, model.predict_proba(X_train))
    valid_loss = log_loss(y_valid_binarized, model.predict_proba(X_valid))

    trial.set_user_attr("train_score", model.score(X_train, y_train))
    trial.set_user_attr("train_loss", train_loss)
    # Positive shift = validation loss worse than training loss.
    trial.set_user_attr("valid_loss_shift", valid_loss - train_loss)
    return valid_loss


# Mirror Optuna's log records to stdout so they show up in the cell output.
# Guarded so that running this after the LinearSVC tuning cell (or running it
# twice) does not stack a second stdout handler and print records twice.
optuna_logger = optuna.logging.get_logger("optuna")
if not any(
    isinstance(handler, logging.StreamHandler) and handler.stream is sys.stdout
    for handler in optuna_logger.handlers
):
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Same SQLite file as the LinearSVC search; studies are separated by name.
storage = "sqlite:///svm.db"
study_name = "kernel-svc"
study = optuna.create_study(
    direction="minimize", storage=storage, study_name=study_name, load_if_exists=True
)
# With load_if_exists=True the study persists in the SQLite file, so each
# run of this cell appends another 500 trials to it.
study.optimize(objective, n_trials=500, n_jobs=-1, show_progress_bar=True)

Best trial: 420. Best value: 1.9321: 100%|██████████| 500/500 [11:51<00:00,  1.42s/it]


In [6]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Explicit iteration cap for the final LinearSVC fit: 5x the 20,000 used while
# tuning the linear model. Previously this was `MAX_ITER * 5`, whose value
# depended on which tuning cell had (re)defined MAX_ITER last.
LINEAR_FINAL_MAX_ITER = 5 * 20_000

# Read the finished search. `load_study` raises if the study does not exist,
# whereas `create_study(..., load_if_exists=True)` would silently create an
# empty one and only fail later at `.best_trial`.
linear_study = optuna.load_study(study_name="linear-svc", storage=storage)
linear_best_trial = linear_study.best_trial
linear_params = linear_best_trial.params

# Rebuild the preprocessing of the best trial, this time on all training rows.
# `dropped_columns`, `final_imputer`, `final_scaler` and
# `final_calibrated_model` are consumed by the submission cell that follows.
null_threshold = linear_params["null_threshold"]
dropped_columns = X.columns[X.isnull().mean() > null_threshold]
X_cleaned = X.drop(dropped_columns, axis=1)

final_imputer = SimpleImputer(strategy=linear_params["imputer_strategy"])
X_imputed = final_imputer.fit_transform(X_cleaned)

# Dict lookup instead of if/elif: an unexpected name raises KeyError rather
# than silently reusing a `final_scaler` left over from an earlier cell.
final_scaler = {"standard": StandardScaler, "minmax": MinMaxScaler}[
    linear_params["scaler"]
]()
X_scaled = final_scaler.fit_transform(X_imputed)

C = linear_params["C"]
penalty = linear_params["penalty"]
intercept_scaling = linear_params["intercept_scaling"]
final_model = LinearSVC(
    C=C,
    penalty=penalty,
    intercept_scaling=intercept_scaling,
    max_iter=LINEAR_FINAL_MAX_ITER,
    random_state=SEED,
)
final_model.fit(X_scaled, y)

# Same calibration setup as in the tuning objective (prefit model, calibrator
# fitted on the rows the SVM was trained on).
# NOTE(review): scikit-learn's docs for cv="prefit" ask for disjoint fit and
# calibration data; kept as-is to stay consistent with the tuned trials.
calibration_method = linear_params["calibration_method"]
final_calibrated_model = CalibratedClassifierCV(
    final_model, cv="prefit", method=calibration_method
)
final_calibrated_model.fit(X_scaled, y)

final_y_train_pred = final_calibrated_model.predict_proba(X_scaled)

# Training-set log loss (in-sample, so optimistic) shown as the cell output.
log_loss(y_binarized, final_y_train_pred)

1.9115197552925864

In [9]:
from data import generate_submission, combined_test

# Reduce the test frame to the feature columns the final model was fitted on:
# remove the targets, the null-filtered columns, `house_q10`, and finally the
# first remaining column (by position).
X_submissions = (
    combined_test.drop(columns=Y_COLUMNS)
    .drop(columns=dropped_columns)
    .drop(columns=["house_q10"])
    .iloc[:, 1:]
)

# Apply the already-fitted imputer and scaler (transform only, no refit).
X_submissions_imputed = final_imputer.transform(X_submissions)
X_submissions_scaled = final_scaler.transform(X_submissions_imputed)
final_y_pred = final_calibrated_model.predict_proba(X_submissions_scaled)

generate_submission(final_y_pred, "linear-svc")

Submission file saved as linear-svc-1.csv


In [11]:
# Read the finished search. `load_study` raises if the study does not exist,
# whereas `create_study(..., load_if_exists=True)` would silently create an
# empty one and only fail later at `.best_trial`.
kernel_study = optuna.load_study(study_name="kernel-svc", storage=storage)
kernel_best_trial = kernel_study.best_trial
kernel_params = kernel_best_trial.params

# Rebuild the preprocessing of the best trial on all training rows.
# NOTE: this rebinds `dropped_columns`, `final_imputer`, `final_scaler` and
# `final_model`, which the LinearSVC cells above also use -- do not re-run the
# linear submission cell after this one without re-running its fit cell first.
null_threshold = kernel_params["null_threshold"]
dropped_columns = X.columns[X.isnull().mean() > null_threshold]
X_cleaned = X.drop(dropped_columns, axis=1)

final_imputer = SimpleImputer(strategy=kernel_params["imputer_strategy"])
X_imputed = final_imputer.fit_transform(X_cleaned)

# Dict lookup instead of if/elif: an unexpected name raises KeyError rather
# than silently reusing a `final_scaler` left over from an earlier cell.
final_scaler = {"standard": StandardScaler, "minmax": MinMaxScaler}[
    kernel_params["scaler"]
]()
X_scaled = final_scaler.fit_transform(X_imputed)

kernel = kernel_params["kernel"]
C = kernel_params["C"]
# `degree` exists in the trial's params only when the polynomial kernel was
# sampled; other kernels keep SVC's default.
kernel_kwargs = {}
if kernel == "poly":
    degree = kernel_params["degree"]
    kernel_kwargs["degree"] = degree
final_model = SVC(
    C=C,
    kernel=kernel,
    max_iter=MAX_ITER * 5,  # 5x the cap set in the kernel tuning cell
    random_state=SEED,
    probability=True,  # required for predict_proba
    **kernel_kwargs,
)
final_model.fit(X_scaled, y)

final_y_train_pred = final_model.predict_proba(X_scaled)

# Training-set log loss (in-sample, so optimistic) shown as the cell output.
log_loss(y_binarized, final_y_train_pred)

1.9124056593435594

In [12]:
from data import generate_submission, combined_test

# Reduce the test frame to the feature columns the final model was fitted on:
# remove the targets, the null-filtered columns, `house_q10`, and finally the
# first remaining column (by position).
X_submissions = (
    combined_test.drop(columns=Y_COLUMNS)
    .drop(columns=dropped_columns)
    .drop(columns=["house_q10"])
    .iloc[:, 1:]
)

# Apply the already-fitted imputer and scaler (transform only, no refit).
X_submissions_imputed = final_imputer.transform(X_submissions)
X_submissions_scaled = final_scaler.transform(X_submissions_imputed)
final_y_pred = final_model.predict_proba(X_submissions_scaled)

generate_submission(final_y_pred, "kernel-svc")

Submission file saved as kernel-svc-1.csv
