# Hyperparameter search only (CatBoost tuned with Optuna)

In [1]:
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import pickle
import joblib
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from eli5.permutation_importance import get_score_importances
import eli5
from eli5.sklearn import PermutationImportance
from sklearn import cluster

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances
from hyperopt.pyll.base import scope
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import random

# Load both splits, discard the 'index' and 'FLAG_MOBIL' columns, and replace
# every missing value with the literal string 'NAN'.
train = pd.read_csv('data/train.csv')
train = train.drop(columns=['index', 'FLAG_MOBIL'])
train = train.fillna('NAN')

test = pd.read_csv('data/test.csv')
test = test.drop(columns=['index', 'FLAG_MOBIL'])
test = test.fillna('NAN')

sample_submission = pd.read_csv('data/sample_submission.csv')

# Stack train on top of test so the feature engineering below is applied to
# both splits at once.
merge_data = pd.concat([train, test], axis=0)

def _wrapped_count(days, unit, cycle):
    """Turn a day-offset Series into ``unit``-day steps wrapped by ``cycle``.

    The step count is ``floor(-days / unit)``. The number of complete cycles
    (``steps / cycle`` truncated toward zero by the integer cast) multiplied
    by ``cycle`` is then subtracted from that step count.
    """
    steps = np.floor(-days / unit)
    return steps - (steps / cycle).astype(int) * cycle


# Month (30-day steps wrapped at 12) and week (7-day steps wrapped at 4)
# features for DAYS_BIRTH and DAYS_EMPLOYED.
for _day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
    merge_data[_day_col + '_month'] = _wrapped_count(merge_data[_day_col], 30, 12)
    merge_data[_day_col + '_week'] = _wrapped_count(merge_data[_day_col], 7, 4)

# before_EMPLOYED: DAYS_BIRTH minus DAYS_EMPLOYED, plus the same month/week
# features derived from it.
merge_data['before_EMPLOYED'] = merge_data['DAYS_BIRTH'] - merge_data['DAYS_EMPLOYED']
merge_data['before_EMPLOYED_month'] = _wrapped_count(merge_data['before_EMPLOYED'], 30, 12)
merge_data['before_EMPLOYED_week'] = _wrapped_count(merge_data['before_EMPLOYED'], 7, 4)

# Ratio features, created in this exact order; each new column is named
# '<numerator>/<denominator>'.
_ratio_pairs = [
    # DAYS_BIRTH / Income
    ('DAYS_BIRTH_month', 'income_total'),
    ('DAYS_BIRTH_week', 'income_total'),
    # DAYS_EMPLOYED / Income
    ('DAYS_EMPLOYED_month', 'income_total'),
    ('DAYS_EMPLOYED_week', 'income_total'),
    # before_EMPLOYED / Income
    ('before_EMPLOYED', 'income_total'),
    ('before_EMPLOYED_month', 'income_total'),
    ('before_EMPLOYED_week', 'income_total'),
    # Income / Family
    ('income_total', 'family_size'),
    ('child_num', 'income_total'),
    ('family_size', 'income_total'),
    ('DAYS_BIRTH', 'income_total'),
    ('DAYS_EMPLOYED', 'income_total'),
    ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
]
for _numerator, _denominator in _ratio_pairs:
    merge_data[f'{_numerator}/{_denominator}'] = (
        merge_data[_numerator] / merge_data[_denominator]
    )

# Income is skewed: replace it with log(1 + income). This runs only after all
# ratio features above have been computed from the raw income values.
merge_data['income_total'] = np.log1p(merge_data['income_total'])
# merge_data['log_income_total'] = np.log(merge_data['income_total'])
# merge_data['sqrt_income_total'] = np.sqrt(merge_data['income_total'])
# merge_data['boxcox_income_total'] = stats.boxcox(merge_data['income_total'])[0]

# Replace every remaining NaN with -999. Rows whose 'credit' equals that
# sentinel are treated as the test split; all other rows are the train split.
merge_data = merge_data.fillna(-999)
_is_test_row = merge_data['credit'] == -999
train = merge_data[~_is_test_row]
test = merge_data[_is_test_row].drop(columns='credit')

# Reorder the train columns so that 'credit' comes last.
train_cols = [name for name in train.columns if name != 'credit'] + ['credit']
train = train[train_cols]

# Separate features from the target; test_x is an independent copy of test.
train_x = train.drop(columns=['credit'])
train_y = train['credit']
test_x = test.copy()

# Integer-encode every object-dtype column. The encoder is fitted once per
# column on the train and test values together, so a given category receives
# the same code in both frames. Fitting it separately on each frame assigns
# different codes to the same category whenever the two category sets differ.
enc = LabelEncoder()
for col in train_x.columns:
    if train_x[col].dtypes == 'O':
        enc.fit(pd.concat([train_x[col], test_x[col]], ignore_index=True))
        train_x[col] = enc.transform(train_x[col])
        test_x[col] = enc.transform(test_x[col])

def reduce_mem_usage(data):
    """Downcast the numeric columns of ``data`` in place to narrower dtypes.

    Only columns whose dtype is one of int8/16/32/64 or float32/64 are
    touched. Integer columns are cast to the first of int8, int16, int32,
    int64 whose range contains the column's min and max. Float columns are
    cast to the first of float16, float32 whose range contains them, and to
    float64 otherwise. The range bounds are inclusive, so a column whose
    extreme equals a dtype limit (e.g. 127 for int8) still uses that dtype.

    NOTE: the float choice is made on range alone. float16 keeps roughly three
    significant decimal digits, so values in downcast float columns are
    rounded.

    Args:
        data: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object, after downcasting. A one-line memory
        report (before / after / percent reduction) is printed as a side
        effect.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_memory = data.memory_usage().sum() / 1024**2
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type not in numerics:
            continue

        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type)[:3] == 'int':
            # Leave the column untouched if no candidate fits (e.g. min/max
            # are NaN because the column is empty).
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            # float64 is the fallback when neither narrower float type fits.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            data[col] = data[col].astype(target)

    end_memory = data.memory_usage().sum() / 1024**2
    print('Memory optimization from {:5.2f}MB to {:5.2f}MB ({:.1f}% reduction)'
          .format(start_memory, end_memory, 100 * (start_memory - end_memory) / start_memory))
    return data
# Downcast the numeric columns of both feature frames (train first, then
# test); each call prints its own memory report.
train_x, test_x = (reduce_mem_usage(frame) for frame in (train_x, test_x))

Memory optimization from  6.86MB to  1.92MB (72.1% reduction)
Memory optimization from  2.59MB to  0.72MB (72.1% reduction)


In [7]:
def cat_objective(trial: Trial) -> float:
    """Optuna objective: out-of-fold multiclass log loss of a CatBoost model.

    Samples one set of CatBoost hyperparameters from ``trial``, trains a
    ``CatBoostClassifier`` on each fold of a 5-fold stratified split of the
    module-level ``train_x`` / ``train_y`` (early stopping on the held-out
    fold), and collects the held-out class probabilities.

    Args:
        trial: Optuna trial used to sample the searched hyperparameters.

    Returns:
        ``log_loss`` of ``train_y`` against the out-of-fold probabilities.
    """
    # Settings that are the same for every trial.
    fixed_params = {
        'thread_count': None,
        'allow_writing_files': False,
        'loss_function': 'MultiClass',
        # Author's note: the Bayesian bootstrap caused a kernel error, and
        # Poisson is GPU-only, so Bernoulli is used.
        'bootstrap_type': 'Bernoulli',
        'n_estimators': 50000,
    }

    # Searched settings. The suggest_* calls are kept in this exact order.
    searched_params = {
        'l2_leaf_reg': trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        'random_strength': trial.suggest_loguniform("random_strength", 1e-2, 100),
        # Author's note: same setting as border_count.
        'max_bin': trial.suggest_int('max_bin', 150, 500),
        # Author's note: same setting as rsm; not supported on GPU.
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.0, 0.2),
        'learning_rate': trial.suggest_uniform("learning_rate", 0.008, 0.02),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'depth': trial.suggest_int('depth', 10, 14),
        # Author's note: same setting as min_child_samples.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        # A "bagging_temperature" search was previously sketched here for the
        # Bayesian bootstrap; with Bernoulli only "subsample" is searched.
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
    }
    cat_params = {**fixed_params, **searched_params}

    # 5-fold stratified CV with a fixed shuffle seed.
    seed = 91373
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # One row of class probabilities per training sample; the width of 3 is
    # hard-coded (assumes three target classes - TODO confirm).
    oof_proba = np.zeros((train_x.shape[0], 3))

    for fit_rows, holdout_rows in splitter.split(train_x, train_y):
        x_fit, y_fit = train_x.iloc[fit_rows], train_y.iloc[fit_rows]
        x_holdout, y_holdout = train_x.iloc[holdout_rows], train_y.iloc[holdout_rows]

        model = CatBoostClassifier(**cat_params)
        # To watch training progress, change verbose=False to e.g. 1000.
        model.fit(
            x_fit,
            y_fit,
            eval_set=[(x_holdout, y_holdout)],
            early_stopping_rounds=30,
            verbose=False,
        )

        oof_proba[holdout_rows, :] = model.predict_proba(x_holdout)
        # For a per-fold score, print log_loss(y_holdout, oof_proba[holdout_rows, :]) here.

    return log_loss(train_y, oof_proba)

In [8]:
# Run a 50-trial search that minimises cat_objective, using a TPE sampler
# seeded with 42.
sampler = TPESampler(seed=42)
cat_study = optuna.create_study(
    study_name="cat_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
cat_study.optimize(cat_objective, n_trials=50)

# The best trial only carries the searched parameters, so the fixed CatBoost
# settings are layered on top to obtain a complete parameter set.
cat_base_hyperparams = {
    'loss_function': 'MultiClass',
    'n_estimators': 50000,
    'thread_count': None,
    'allow_writing_files': False,
    'bootstrap_type': 'Bernoulli',
}
cat_best_hyperparams = {**cat_study.best_trial.params, **cat_base_hyperparams}
print("The best hyperparameters are:\n", cat_best_hyperparams)

[32m[I 2021-05-23 20:46:55,269][0m A new study created in memory with name: cat_parameter_opt[0m
[32m[I 2021-05-23 21:05:27,945][0m Trial 0 finished with value: 0.696713935421841 and parameters: {'l2_leaf_reg': 5.6115164153345045, 'random_strength': 63.512210106407046, 'max_bin': 406, 'colsample_bylevel': 0.35919509051822196, 'learning_rate': 0.00987222368530924, 'leaf_estimation_iterations': 3, 'depth': 10, 'min_data_in_leaf': 44, 'subsample': 0.8005575058716043}. Best is trial 0 with value: 0.696713935421841.[0m
[32m[I 2021-05-23 21:36:00,349][0m Trial 1 finished with value: 0.6996042840740807 and parameters: {'l2_leaf_reg': 26.070247583707673, 'random_strength': 0.012087541473056965, 'max_bin': 490, 'colsample_bylevel': 0.499465584480253, 'learning_rate': 0.010548069328139315, 'leaf_estimation_iterations': 3, 'depth': 10, 'min_data_in_leaf': 16, 'subsample': 0.762378215816119}. Best is trial 0 with value: 0.696713935421841.[0m
[32m[I 2021-05-23 21:53:57,409][0m Trial 2 fi

KeyboardInterrupt: 

In [None]:
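# As soon as the results appear, copy-paste them into the KakaoTalk chat room so they are saved separately!! (both the parameter values and the log-loss value)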

In [9]:
# Show how the objective value evolved over the trials of cat_study.
plot_optimization_history(cat_study) 

In [10]:
# Show the estimated importance of each searched hyperparameter in cat_study.
plot_param_importances(cat_study)

In [11]:
# Show the objective value against each searched hyperparameter (slice plot).
plot_slice(cat_study)

In [None]:
# Do not delete the plots above; commit them as they are (the search ranges have to be re-specified after looking at the results).

In [None]:
 # Pasted hyperparameter record (this cell has no execution count).
 # NOTE(review): learning_rate here (0.0644) lies outside the 0.008-0.02 range
 # that cat_objective searches, so this presumably comes from an earlier
 # search configuration - confirm before reusing.
 {'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 95.93542290320686, 'random_strength': 20.17940526189691, 
  'max_bin': 185, 'colsample_bylevel': 0.08309149072060056, 'learning_rate': 0.0643798350832877, 'leaf_estimation_iterations': 8, 
  'depth': 10, 'min_data_in_leaf': 26, 'subsample': 0.7746017801223928, 'loss_function': 'MultiClass', 'n_estimators': 50000, 
  'thread_count': None}