# Import the training data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fraction of the rows assigned to the training split; the remaining
# 1 - train_size goes to the validation split.
train_size = 0.75

# Main table
train = pd.read_pickle('data/global_train_data.pkl')

# Labels as an array; they are also passed to `stratify` below.
y = train['TARGET'].values

# Feature table: every column except the label and SK_ID_CURR.
features = train.drop(columns=['TARGET', 'SK_ID_CURR'])

X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    y,
    test_size=1 - train_size,
    stratify=y,
    random_state=1,  # fixed seed so the split is the same on every run
)

print('Shape of X_train:', X_train.shape)
print('Shape of X_valid:', X_valid.shape)

In [None]:
# preprocessing steps
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Numeric pipeline: impute missing values with the column median, then
# standardise each column.
Pipe_num = Pipeline(
    steps=[
        # tried median, mean, constant strategies
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

# All columns of X_train are routed through the numeric pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipe_num, X_train.columns)])

# Fit on the training split ONLY. Fitting on the full `train` table would let
# validation rows contribute to the imputation medians and scaling statistics
# (data leakage), which makes the validation score optimistic.
# NOTE(review): X_train / X_valid are rebound to the transformed output here;
# presumably that output has no `.columns`, so re-running this cell without
# re-running the split would fail above -- confirm.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# Optuna

## Optuna study results for LGBM

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

def objective(trial):
    """Fit one LGBMClassifier with trial-sampled settings and score it.

    Args:
        trial: Optuna trial object; its ``suggest_*`` methods provide the
            values of the hyper-parameters being searched.

    Returns:
        The ROC AUC on ``(X_valid, y_valid)``, computed from column 1 of
        ``predict_proba``. Despite the metric name, this is not an accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` produced by the earlier cells.
    """
    # Settings held constant for every trial.
    fixed_params = {
        "boosting_type": 'gbdt',
        "objective": "binary",
        "n_jobs": -1,
        # NOTE(review): presumably the negative/positive class ratio -- confirm.
        'scale_pos_weight': 11.5,
        'metric': 'auc',
        'colsample_bytree': 0.8,
        'reg_lambda': 30,
        'reg_alpha': 25,
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
    }
    # Earlier fixed values for the parameters that are now searched,
    # kept for reference:
    #   'learning_rate': 0.02, 'n_estimators': 1600, 'subsample_for_bin': 200,
    #   'subsample': 0.8, 'subsample_freq': 10

    # Settings sampled by Optuna. The order of the suggest calls is kept
    # unchanged on purpose.
    searched_params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 10, 100000, log=True),
        "subsample_for_bin": trial.suggest_int("subsample_for_bin", 10, 10000, log=True),
        "subsample": trial.suggest_float("subsample", 0.001, 1.0, log=True),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 100),
    }
    # Search ranges tried for the currently fixed parameters, kept for reference:
    #   "colsample_bytree": trial.suggest_float("colsample_bytree", 0.001, 1.0)
    #   "reg_lambda": trial.suggest_int("reg_lambda", 1, 10000, log=True)
    #   "reg_alpha": trial.suggest_int("reg_alpha", 1, 10000, log=True)
    #   "min_split_gain": trial.suggest_float("min_split_gain", 0.001, 1.0)
    #   "min_child_weight": trial.suggest_float("min_child_weight", 0.01, 1.0)
    #   "min_child_samples": trial.suggest_int("min_child_samples", 1, 1000)

    classifier = LGBMClassifier(**fixed_params, **searched_params)
    classifier.fit(X_train, y_train)

    # Column 1 of predict_proba is the score passed to the AUC computation.
    valid_scores = classifier.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, valid_scores)


# Search for the settings that maximise the value returned by `objective`,
# bounded by both a trial count (100) and a time budget (60 seconds).
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=60, n_trials=100)

# Summary: number of trials recorded by the study, then the best trial.
finished_trials = len(study.trials)
print("Number of finished trials: ", finished_trials)

trial = study.best_trial
print("Best trial:")
print("  Value: {}".format(trial.value))

# One indented "name: value" line per sampled hyper-parameter.
print("  Params: ")
format_param = "    {}: {}".format
for key, value in trial.params.items():
    print(format_param(key, value))