In [1]:
import os

import numpy as np
import pandas as pd
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from data import (
    Y_COLUMNS,
    combined_train_with_num_pov,
    combined_transformed_train_with_num_pov,
    get_divided_edu,
    remove_boring_columns,
    remove_all_valid_null_columns,
)


SEED = 662
DATA_DIR = "processed"

data = get_divided_edu(remove_boring_columns(combined_transformed_train_with_num_pov))

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from data import get_preprocessor
from models import (
    suggest_random_forest,
    suggest_xgboost,
    suggest_catboost,
    suggest_lightgbm,
    suggest_gradeint_boosting,
)

MAX_ITER = 2000


def load_objective(X: pd.DataFrame, y: pd.DataFrame, y_binarized: pd.DataFrame):
    X.replace(-999, np.nan, inplace=True)

    def objective(trial: optuna.Trial):
        null_threshold = trial.suggest_float("null_threshold", 0, 0.5)
        cv = trial.suggest_int("cv", 3, 5)
        imputer_strategy = trial.suggest_categorical(
            "imputer_strategy", ["mean", "median", "most_frequent"]
        )

        dropped_columns = X.columns[X.isnull().mean() > null_threshold]
        X_cleaned = X.drop(dropped_columns, axis=1)

        train_losses, valid_losses, valid_loss_shift, n_iters = [], [], [], []
        X_train, X_valid, y_train, y_valid, y_train_binarized, y_valid_binarized = (
            train_test_split(
                X_cleaned, y, y_binarized, test_size=1 / cv, random_state=SEED
            )
        )  # lazy, just do 1 round of cv

        preprocessor = get_preprocessor(
            imputer_strategy=[
                "most_frequent",
                "most_frequent",
                imputer_strategy,
                imputer_strategy,
            ],
            remainder="drop",
        )

        X_train = preprocessor.fit_transform(X_train)
        X_valid = preprocessor.transform(X_valid)

        model = suggest_gradeint_boosting(trial, seed=SEED)
        model.fit(X_train, y_train - 1)
        # train_losses.append(
        #     log_loss(y_train_binarized, model.predict_proba(X_train), normalize=False)
        #     / len(y_train_binarized)
        # )

        y_pred = model.predict_proba(X_valid)
        for column_index in [
            column for column in range(10) if column not in model.classes_
        ]:
            y_pred = np.insert(y_pred, column_index - 1, 0, axis=1)

        valid_losses.append(
            log_loss(y_valid_binarized, y_pred, normalize=False)
            / len(y_valid_binarized)
        )
        # valid_loss_shift.append(valid_losses[-1] - train_losses[-1])
        # n_iters.append(model.n_iter_[0])

        mean_valid_loss = np.mean(valid_losses)
        trial.set_user_attr("n_iter", np.mean(n_iters))
        trial.set_user_attr("train_loss", np.mean(train_losses))
        trial.set_user_attr("valid_loss_shift", np.mean(valid_loss_shift))
        return mean_valid_loss

    return objective


optuna.logging.set_verbosity(optuna.logging.ERROR)
best_values, best_params = [], []
for index, part in enumerate(data):
    X, y, y_binarized = part.drop(Y_COLUMNS, axis=1), part["num_pov"], part[Y_COLUMNS]
    study = optuna.create_study(direction="minimize")
    study.optimize(
        load_objective(X, y, y_binarized),
        n_trials=200,
        n_jobs=-1,
        show_progress_bar=True,
    )
    best_values.append(study.best_value)
    best_params.append(study.best_params)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dty

In [None]:
best_values

[1.9089964980474232, 1.9025787668006906, 1.6807145556462684]

In [None]:
best_params