# 1. Module Import & Data Load

In [3]:
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import pickle
import joblib
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from eli5.permutation_importance import get_score_importances
import eli5
from eli5.sklearn import PermutationImportance
from sklearn import cluster

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances
from hyperopt.pyll.base import scope
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import random

# 2. Feature Engineering

In [4]:
train = pd.read_csv('data/train.csv').drop(['index', 'FLAG_MOBIL'], axis=1).fillna('NAN')
test = pd.read_csv('data/test.csv').drop(['index', 'FLAG_MOBIL'], axis=1).fillna('NAN')
sample_submission = pd.read_csv('data/sample_submission.csv')

# train데이터와 test데이터 변수를 함께 조정하기 위해 병합
merge_data = pd.concat([train, test], axis = 0)

# DAYS_BIRTH
merge_data['DAYS_BIRTH_month']=np.floor((-merge_data['DAYS_BIRTH'])/30)-(
    (np.floor((-merge_data['DAYS_BIRTH'])/30)/12).astype(int)*12)
merge_data['DAYS_BIRTH_week']=np.floor((-merge_data['DAYS_BIRTH'])/7)-(
    (np.floor((-merge_data['DAYS_BIRTH'])/7)/4).astype(int)*4)

# DAYS_EMPLOYED
merge_data['DAYS_EMPLOYED_month']=np.floor((-merge_data['DAYS_EMPLOYED'])/30)-(
    (np.floor((-merge_data['DAYS_EMPLOYED'])/30)/12).astype(int)*12)
merge_data['DAYS_EMPLOYED_week']=np.floor((-merge_data['DAYS_EMPLOYED'])/7)-(
    (np.floor((-merge_data['DAYS_EMPLOYED'])/7)/4).astype(int)*4)

# before_EMPLOYED
merge_data['before_EMPLOYED']=merge_data['DAYS_BIRTH']-merge_data['DAYS_EMPLOYED']
merge_data['before_EMPLOYED_month']=np.floor((-merge_data['before_EMPLOYED'])/30)-(
    (np.floor((-merge_data['before_EMPLOYED'])/30)/12).astype(int)*12)
merge_data['before_EMPLOYED_week']=np.floor((-merge_data['before_EMPLOYED'])/7)-(
    (np.floor((-merge_data['before_EMPLOYED'])/7)/4).astype(int)*4)

# DAYS_BIRTH / Income
merge_data['DAYS_BIRTH_month/income_total'] = merge_data['DAYS_BIRTH_month'] / merge_data['income_total']
merge_data['DAYS_BIRTH_week/income_total'] = merge_data['DAYS_BIRTH_week'] / merge_data['income_total']

# DAYS_EMPLOYED / Income
merge_data['DAYS_EMPLOYED_month/income_total'] = merge_data['DAYS_EMPLOYED_month'] / merge_data['income_total']
merge_data['DAYS_EMPLOYED_week/income_total'] = merge_data['DAYS_EMPLOYED_week'] / merge_data['income_total']

# before_EMPLOYED / Income
merge_data['before_EMPLOYED/income_total'] = merge_data['before_EMPLOYED'] / merge_data['income_total']
merge_data['before_EMPLOYED_month/income_total'] = merge_data['before_EMPLOYED_month'] / merge_data['income_total']
merge_data['before_EMPLOYED_week/income_total'] = merge_data['before_EMPLOYED_week'] / merge_data['income_total']

# Income / Family
merge_data['income_total/family_size'] = merge_data['income_total'] / merge_data['family_size']

merge_data['child_num/income_total'] = merge_data['child_num'] / merge_data['income_total']
merge_data['family_size/income_total'] = merge_data['family_size'] / merge_data['income_total']
merge_data['DAYS_BIRTH/income_total'] = merge_data['DAYS_BIRTH'] / merge_data['income_total']
merge_data['DAYS_EMPLOYED/income_total'] = merge_data['DAYS_EMPLOYED'] / merge_data['income_total']
merge_data['DAYS_EMPLOYED/DAYS_BIRTH'] =  merge_data['DAYS_EMPLOYED'] / merge_data['DAYS_BIRTH']

# Income skewed-data
merge_data['income_total'] = np.log1p(merge_data['income_total'])
# merge_data['log_income_total'] = np.log(merge_data['income_total'])
# merge_data['sqrt_income_total'] = np.sqrt(merge_data['income_total'])
# merge_data['boxcox_income_total'] = stats.boxcox(merge_data['income_total'])[0]

merge_data = merge_data.fillna(-999)
train = merge_data[merge_data['credit'] != -999]
test = merge_data[merge_data['credit'] == -999]
test.drop('credit', axis = 1, inplace = True)

train_cols = list(train.columns); train_cols.remove('credit'); train_cols.append('credit')
train = train[train_cols]

train = train[train['child_num']<=6].reset_index(drop=True) # 아이의 수가 7명 이상인 데이터 제거

## 기본 모델로 성능 측정하는 함수

In [5]:
def base_lgbmodel(train, verbose=True):
    
    train_x = train.drop(['credit'], axis=1)
    train_y = train['credit']
    
    np.random.seed(0)
    lucky_seeds=np.random.randint(1, 10000, 5)
    score_list = []
    
    for i, seed in enumerate(lucky_seeds):

        kfold = StratifiedKFold(n_splits=5, random_state = seed, shuffle = True) # CV 늘려가면서 하기
        cv=np.zeros((train_x.shape[0], 3))

        for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

            lgbmodel = LGBMClassifier(objective='multiclass', n_estimators=10000, random_state=seed)
            lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 

            cv[val_idx,:] = lgbmodel.predict_proba(x_val)
        if verbose==True:
            print(f'multi_logloss: {log_loss(train_y, cv):.4f}')
        score_list.append(log_loss(train_y, cv))
    print(f'Average Logloss: {np.mean(score_list):.4f}')
    return np.mean(score_list)

## 향상된 모델로 성능 측정하는 함수

In [6]:
def advanced_lgbmodel(train, verbose=True):
    
    train_x = train.drop(['credit'], axis=1)
    train_y = train['credit']
    
    np.random.seed(0)
    lucky_seeds=np.random.randint(1, 10000, 3)
    score_list = []
    
    for i, seed in enumerate(lucky_seeds):

        kfold = StratifiedKFold(n_splits=5, random_state = seed, shuffle = True) # CV 늘려가면서 하기
        cv=np.zeros((train_x.shape[0], 3))

        for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

            lgbmodel = LGBMClassifier(learning_rate=0.01, objective='multiclass', num_leaves=1000, max_depth=-1,
                                      n_estimators=10000, random_state=seed)
            lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 

            cv[val_idx,:] = lgbmodel.predict_proba(x_val)
        if verbose==True:
            print(f'multi_logloss: {log_loss(train_y, cv):.4f}')
        score_list.append(log_loss(train_y, cv))
    print(f'Average Logloss: {np.mean(score_list):.4f}')
    return np.mean(score_list)

## 원 핫 인코딩

In [7]:
train_oh = train.copy()
test_oh = test.copy()

object_col = []
for col in train_oh.columns:
    if (train_oh[col].dtype == 'object'):
        object_col.append(col)   
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])

train_onehot_df = pd.DataFrame(enc.transform(train_oh.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train_oh.drop(object_col, axis=1, inplace=True)
train_oh = pd.concat([train_oh, train_onehot_df], axis=1)    

test_onehot_df = pd.DataFrame(enc.transform(test_oh.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test_oh.drop(object_col, axis=1, inplace=True)
test_oh = pd.concat([test_oh, test_onehot_df], axis=1)

print('One Hot Encoding Completed')

One Hot Encoding Completed


In [8]:
# base_lgbmodel(train_oh, verbose=False)

## 라벨 인코딩

In [9]:
train_lab = train.copy()
test_lab = test.copy()

enc = LabelEncoder()
for col in train_lab.columns:
    if train_lab[col].dtypes=='object':
        train_lab[col] = enc.fit_transform(train_lab[col])
        test_lab[col] = enc.fit_transform(test_lab[col])

    
print('Label Encoding Completed')

Label Encoding Completed


In [10]:
# base_lgbmodel(train_lab, verbose=False)

## 카테고리 인코딩

In [11]:
train_cat = train.copy()
test_cat = test.copy()

for col in train_cat.columns:
    if train_cat[col].dtypes=='object':
        train_cat[col] =  train_cat[col].astype('category')
        test_cat[col] =  test_cat[col].astype('category')
    
print('Category Encoding Completed')

Category Encoding Completed


In [12]:
# base_lgbmodel(train_cat, verbose=False)

### income_total 범주화

In [13]:
# print('Initial Logloss', end=' ')
# base_lgbmodel(train_cat, verbose=False)
# raw_income = train_cat.income_total.copy()
# for cut in np.arange(1000, 5000, 500):
#     print(f'cut space:{cut}', end=' ')
#     cutted_income = pd.cut(raw_income, bins=np.arange(27000, 1575000, cut), labels=False)
#     train_cat['income_total'] = cutted_income
#     base_lgbmodel(train_cat, verbose=False)

## KNN (income_total, income_type, occyp_type)

In [14]:
# income_type, occyp_type, income_total을 이용하여 클러스터링하기 위해 따로 원 핫 인코딩
kmeans_train = train[['occyp_type', 'income_type', 'income_total']]
kmeans_test = test[['occyp_type', 'income_type', 'income_total']]
object_col = []
for col in kmeans_train.columns:
    if kmeans_train[col].dtype == 'object':
        object_col.append(col)
        
enc = OneHotEncoder()
enc.fit(kmeans_train.loc[:,object_col])

train_onehot_df = pd.DataFrame(enc.transform(kmeans_train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
kmeans_train.drop(object_col, axis=1, inplace=True)
kmeans_train = pd.concat([kmeans_train, train_onehot_df], axis=1)

test_onehot_df = pd.DataFrame(enc.transform(kmeans_test.loc[:,object_col]).toarray(),
             columns=enc.get_feature_names(object_col))
kmeans_test.drop(object_col, axis=1, inplace=True)
kmeans_test = pd.concat([kmeans_test, test_onehot_df], axis=1)

In [15]:
base_lgbmodel(train_cat, verbose=False)
# n_clusters를 3부터 10까지 진행하여 하나씩 성능 체크
score_list = {}
k_means_train_total_df = pd.DataFrame()
k_means_test_total_df = pd.DataFrame()
for i in tqdm(range(3, 12)):
    train_cat = train.copy()
    test_cat = test.copy()

    for col in train_cat.columns:
        if train_cat[col].dtypes=='object':
            train_cat[col] =  train_cat[col].astype('category')
            test_cat[col] =  test_cat[col].astype('category')

    # n_cluster를 늘려가며 클러스터링 진행
    k_means_train_df = pd.DataFrame()
    k_means_test_df = pd.DataFrame()
    k_means = cluster.KMeans(n_clusters=i)
    k_means.fit(kmeans_train)
    k_means_train_df = pd.concat([k_means_train_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
    k_means_train_total_df = pd.concat([k_means_train_total_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
    k_means.fit(kmeans_test)
    k_means_test_df = pd.concat([k_means_test_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
    k_means_test_total_df = pd.concat([k_means_test_total_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)

    train_cat = pd.concat([train_cat, k_means_train_df], axis=1)
    test_cat = pd.concat([test_cat, k_means_test_df], axis=1)

    # 클러스터링 결과를 category 타입으로 변경
    for col in train_cat.columns:
        if train_cat[col].dtypes=='int32':
            train_cat[col] =  train_cat[col].astype('category')
            test_cat[col] =  test_cat[col].astype('category')
    print(f'cluster: {i}', end=' ')
    score_list[f'cluster_{i}'] = base_lgbmodel(train_cat, verbose=False)

KeyboardInterrupt: 

In [None]:
# 성능이 좋아지는 클러스터링 개수로만 피처 추출

train_cat = train.copy()
test_cat = test.copy()

for col in train_cat.columns:
    if train_cat[col].dtypes=='object':
        train_cat[col] =  train_cat[col].astype('category')
        test_cat[col] =  test_cat[col].astype('category')
        
n = 3 # 성능이 좋아지는 클러스터 개수별로 정렬한 후 앞의 n개만 추출
train_cat = pd.concat([train_cat, k_means_train_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]]], axis=1)
test_cat = pd.concat([test_cat, k_means_test_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]]], axis=1)

for col in train_cat.columns:
    if train_cat[col].dtypes=='int32':
        train_cat[col] =  train_cat[col].astype('category')
        test_cat[col] =  test_cat[col].astype('category')
        
print(k_means_train_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]].columns.tolist())
print(base_lgbmodel(train_cat, verbose=False))

### Permutation Feature Importance

In [None]:
# enc = LabelEncoder()
# for col in train_cat.columns:
#     if train_cat[col].dtype.name=='category':
#         train_cat[col] = enc.fit_transform(train_cat[col])
#         test_cat[col] = enc.fit_transform(test_cat[col])
        
# print('Label Encoding Completed')

# train_x = train_cat.drop(['credit'], axis=1)
# train_y = train_cat['credit']
# test_x = test_cat.copy()

# seeds = np.random.randint(0, 1000, 3)
# perm_dicts = {}
# cv = np.zeros((train_x.shape[0], 3))
# for n, seed in enumerate(seeds):
#     kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
#     for i, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

#         x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#         y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

#         lgbm = LGBMClassifier(n_estimators=10000, objective='multiclass', seed=0)
#         lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#         cv[val_idx, :] = lgbm.predict_proba(x_val)
        
#         perm = PermutationImportance(lgbm, scoring = "neg_log_loss", random_state = seed).fit(x_val, y_val)
#         perm_dicts[str(seed)+'_seed_'+str(i+1)+'_fold'] = pd.DataFrame({'feature':x_val.columns.tolist(), 
#                                                                         'importance':perm.feature_importances_}
#                                                                       ).sort_values('importance')
#     print('multi_logloss:', log_loss(train_y, cv))
        
# for i, df in enumerate(perm_dicts.values()):
#     if i==0:
#         perm_df = df
#     else:
#         perm_df = pd.merge(perm_df, df, on='feature')
# perm_remove_df = perm_df.set_index('feature').mean(axis=1)>=0
# remove_features = perm_remove_df[perm_remove_df==False].index
# train_x = train_x.drop(remove_features, axis=1)

### 변수 하나씩 지우며 성능 체크하는 코드

In [None]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# cv = np.zeros((train_x.shape[0], 3))
# for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
#     x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#     y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
#     lgbm = LGBMClassifier(**lgb_best_hyperparams, seed=0)
#     lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#     cv[val_idx] = lgbm.predict_proba(x_val)
# Initial_log_loss = log_loss(train_y, cv)
# print(f'Initial_multi_logloss: {Initial_log_loss}')

# remove_features = {}
# for i in range(1, 2):
#     for j in tqdm(combinations(list(range(0, train_x.shape[1])), i)):
#         train_new_x = train_x.drop(train_x.columns[list(j)], axis=1)
        
#         kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
#         cv = np.zeros((train_new_x.shape[0], 3))
#         for n, (train_idx, val_idx) in enumerate(kfold.split(train_new_x, train_y)):
#             x_train, x_val = train_new_x.iloc[train_idx], train_new_x.iloc[val_idx]
#             y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
#             lgbm = LGBMClassifier(**lgb_best_hyperparams)
#             lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#             cv[val_idx] = lgbm.predict_proba(x_val)
#         remove_features[list(j)[0]] = log_loss(train_y, cv)
#         if Initial_log_loss > log_loss(train_y, cv):
#             print(f'{list(j)[0]}_multi_logloss: {log_loss(train_y, cv)}')

In [None]:
# remove_feature = sorted(remove_features, key=lambda x: remove_features[x])[:3]
# train_x = train_x.drop(train_x.columns[remove_feature], axis=1)
# test_x =  test_x.drop((test_x.columns[remove_feature], axis=1)

# 3. Modeling

In [16]:
pred_dict = {}
pred_test_dict = {}

## (1) Lightgbm

In [17]:
train_x = train_cat.drop(['credit'], axis=1) # 데이터 나누기
train_y = train_cat['credit']
test_x = test_cat.copy()

train_x_raw = train_x.copy() # 카테고리 인코딩 된 데이터 저장해두기
test_x_raw = test_x.copy()

In [21]:
# train_x = train_x_raw.copy() # XGB, RF 돌리다가 LGB 돌리고싶을땐 주석풀고 이 코드 실행
# test_x = test_x_raw.copy()

### HyperParameter Tuning (optuna)

In [22]:
def lgb_objective(trial: Trial) -> float:
    params_lgb = {
        "random_state": 91373,
        "verbosity": -1,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "learning_rate": trial.suggest_float("learning_rate", 0.003, 0.009),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1),
        "max_depth": trial.suggest_int("max_depth", 8, 30),
        "num_leaves": trial.suggest_int("num_leaves", 64, 1200),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    # CV=10으로 튜닝
    
    seed = 91373
    kfold = StratifiedKFold(n_splits=2, random_state = seed, shuffle = True) # Cross-validation cv=5
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**params_lgb)
                                                                                        # 진행상황 보고싶을때 None을 100으로
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
    # print('multi_logloss:', log_loss(train_y, cv))

    
    return log_loss(train_y, cv)

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="lgbm_parameter_opt", direction="minimize", sampler=sampler)
study.optimize(lgb_objective, n_trials=1)

lgb_best_hyperparams = study.best_trial.params
lgb_base_hyperparams = {'objective':'multiclass', 'n_estimators':10000, 'max_depth':-1,
                        'lambda_l1':lgb_best_hyperparams['reg_alpha'],
                        'lambda_l2':lgb_best_hyperparams['reg_lambda'],
                        'reg_alpha':None, 'reg_lambda':None}
lgb_best_hyperparams.update(lgb_base_hyperparams)
print("The best hyperparameters are:\n", lgb_best_hyperparams)

### HyperParameter Tuning (hyperopt)

In [24]:
# Hyperopt의 metric함수를 StratifiedKFold(cv=5)로 구하기
def lgb_score(params):
    seed = 91373
    print("Training with params:", params)
    kfold = StratifiedKFold(n_splits=2, random_state = seed, shuffle = True) # Cross-validation cv=5
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**params)
                                                                                        # 진행상황 보고싶을때 None을 100으로
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
    print('multi_logloss:', log_loss(train_y, cv))
    return {'loss': log_loss(train_y, cv), 'status': STATUS_OK}

    # 시드를 3개로 평균내고 싶을 때 아래 주석 해제
# def lgb_score(params):
#     print("Training with params:", params)
#     lucky_seeds=np.random.randint(1, 10000, 3) # 랜덤으로 시드 3개 생성
#     score_list = []
#     for i, seed in enumerate(lucky_seeds):
#         kfold = StratifiedKFold(n_splits=5, random_state = seed, shuffle = True) # Cross-validation cv=5
#         cv = np.zeros((train_x.shape[0], 3))

#         for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

#             x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#             y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

#             lgbmodel = LGBMClassifier(**params)

#             lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 
#             cv[val_idx, :] = lgbmodel.predict_proba(x_val)
#         print(f'seed{seed}_multi_logloss:', log_loss(train_y, cv))
#         score_list.append(log_loss(train_y, cv))
#     return {'loss': np.mean(score_list), 'status': STATUS_OK}

# Hyperopt의 범위를 지정해주고 max_evals만큼 반복한 후 최적의 파라미터를 반환
def lgb_optimize(random_state=0):

    space = {
        'learning_rate': hp.quniform('learning_rate', 0.003, 0.009, 0.001),
        #'learning_rate' : 0.004,
        'num_leaves': scope.int(hp.quniform('num_leaves', 300, 1200, 50)),
        #'num_leaves' : 1000,
        'min_data_in_leaf' : scope.int(hp.quniform('min_data_in_leaf', 10, 40, 1)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 20, 0.001),
        #'min_child_weight' : 2,
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        #'subsample' : 1,
        'colsample_bytree': hp.quniform('colsample_bytree', 0.2, 1, 0.01),
        #'colsample_bytree' : 0.6,
        'reg_alpha': hp.quniform('reg_alpha', 0.01, 1, 0.01),
        #'reg_alpha' : 0.94,
        'reg_lambda': hp.quniform('reg_lambda', 0.01, 1, 0.01),
        #'reg_lambda' : 0.98,
        'max_depth': scope.int(hp.quniform('max_depth', 8, 30, 1)),
        #'max_depth' : -1,
        'n_estimators' : 5000,
        'objective' : 'multiclass',
        'num_class' : 3,
        'seed': 0,
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(lgb_score, space, algo=tpe.suggest, 
                # trials=trials, 
                max_evals=1)
    return best

In [25]:
# lgb_best_hyperparams = lgb_optimize()
# lgb_base_hyperparams = {'objective':'multiclass', 'num_class':3, 'n_estimators':5000, 
#                         'num_leaves':int(lgb_best_hyperparams['num_leaves']),
#                         'lambda_l1': lgb_best_hyperparams['reg_alpha'], 'lambda_l2': lgb_best_hyperparams['reg_lambda'],
#                         'reg_alpha': None, 'reg_lambda': None, 'min_child_samples': None, 'seed':0}
# lgb_best_hyperparams.update(lgb_base_hyperparams)
# print("The best hyperparameters are:", lgb_best_hyperparams)

In [None]:
lucky_seeds=[100,2019,91373] # Lucky seed 늘려가면서 하기
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=10, random_state = seed, shuffle = True) # CV 늘려가면서 하기
    cv=np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)
    
    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**lgb_best_hyperparams)
        
        # 직접 파라미터 넣고싶을땐 아래 코드 주석 해제
#         lgbmodel = LGBMClassifier(learning_rate=0.004, objective='multiclass', n_estimators=10000, num_leaves=1000, 
#                                   max_depth=-1, min_child_weight=2, colsample_bytree=0.6, reg_alpha=0.94, reg_lambda=0.98,
#                                    n_jobs=-1, random_state=seed)
                                                                                    # 진행상황 보고싶을때 None을 100으로
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 
        
        cv[val_idx,:] = lgbmodel.predict_proba(x_val)
        #print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}') # Fold마다 점수 체크하려면 주석 해제
        pred_test += lgbmodel.predict_proba(test_x) / 10 # CV 바꾸면 이 숫자도 똑같이 바꿔야함
    pred_dict['lgb'+str(seed)] = cv
    pred_test_dict['lgb'+str(seed)] = pred_test
        
    print('multi_logloss :', log_loss(train_y, cv))

lgbmodels_path = os.listdir('./pred_pkl/')
lgbmodels_list = [x for x in lgbmodels_path if x.endswith("lgb.pkl")]
assert len(lgbmodels_list) == 15
lgb_preds = np.zeros((test_x.shape[0], 3))

for m in lgbmodels_list:
    lgbmodel = joblib.load('./pred_pkl/'+m)
    lgb_preds_proba = lgbmodel.predict_proba(test)
    lgb_preds += lgb_preds_proba/15

## (2) XGBoost

원핫인코딩된 feature로 만들어주기 **꼭 밑에 코드 실행하고 XGBoost랑 Randomforest 돌리기!!**

In [20]:
train_x_oh = train_x_raw.copy()
test_x_oh = test_x_raw.copy()

object_col = []
for col in train_x_oh.columns:
    if (train_x_oh[col].dtype.name == 'category'):
        object_col.append(col)   
enc = OneHotEncoder()
enc.fit(train_x_oh.loc[:,object_col])

train_onehot_df = pd.DataFrame(enc.transform(train_x_oh.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train_x_oh.drop(object_col, axis=1, inplace=True)
train_x_oh = pd.concat([train_x_oh, train_onehot_df], axis=1)    

test_onehot_df = pd.DataFrame(enc.transform(test_x_oh.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test_x_oh.drop(object_col, axis=1, inplace=True)
test_x_oh = pd.concat([test_x_oh, test_onehot_df], axis=1)

train_x = train_x_oh.copy()
test_x = test_x_oh.copy()

print('One Hot Encoding Completed')

One Hot Encoding Completed


### HyperParameter Tuning (optuna)

In [28]:
def xgb_objective(trial: Trial) -> float:
    params_xgb = {
        "random_state": 91373,
        "verbose": None,
        "num_class": 3,
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "tree_method": "gpu_hist",
        "learning_rate": trial.suggest_float("learning_rate", 0.003, 0.009),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1),
        "max_depth": trial.suggest_int("max_depth", 8, 30),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "gamma": trial.suggest_float("gamma", 0.3, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    # CV=10으로 튜닝
    
    seed = 91373
    kfold = StratifiedKFold(n_splits=2, random_state = seed, shuffle = True) # Cross-validation cv=5
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
                                                                                            # 진행상황 보고싶을때 None을 100으로
        xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        cv[val_idx, :] = xgbmodel.predict(dvalid)
        #print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}') # Fold마다 점수 체크하려면 주석 해제
    print('multi_logloss:', log_loss(train_y, cv))

    
    return log_loss(train_y, cv)

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="xgbm_parameter_opt", direction="minimize", sampler=sampler)
study.optimize(xgb_objective, n_trials=1)

xgb_best_hyperparams = study.best_trial.params
xgb_base_hyperparams = {'objective':'multi:softprob', "num_class": 3, "eval_metric": "mlogloss", 
                        "tree_method": "gpu_hist", "random_state": 91373}
xgb_best_hyperparams.update(xgb_base_hyperparams)
print("The best hyperparameters are:\n", xgb_best_hyperparams)

### HyperParameter Tuning (hyperopt)

In [30]:
# Hyperopt의 metric함수를 StratifiedKFold(cv=5)로 구하기
def xgb_score(params):
    seed = 91373
    print("Training with params:", params)
    kfold = StratifiedKFold(n_splits=10, random_state = seed, shuffle = True) # Cross-validation cv=5
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
                                                                                    # 진행상황 보고싶을때 None을 100으로
        xgbmodel = xgb.train(params, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
        cv[val_idx, :] = xgbmodel.predict(dvalid)
    print('multi_logloss:', log_loss(train_y, cv))
    return {'loss': log_loss(train_y, cv), 'status': STATUS_OK}

    # 시드를 3개로 평균내고 싶을 때 아래 주석 해제
# def xgb_score(params):
#     print("Training with params:", params)
#     np.random.seed(0)
#     lucky_seeds=np.random.randint(1, 10000, 3) # 랜덤으로 시드 3개 생성
#     score_list = []
#     for i, seed in enumerate(lucky_seeds):
#         kfold = StratifiedKFold(n_splits=5, random_state = seed, shuffle = True) # Cross-validation cv=5
#         cv = np.zeros((train_x.shape[0], 3))

#         for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

#             x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#             y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

#             dtrain = xgb.DMatrix(x_train, label=y_train)
#             dvalid = xgb.DMatrix(x_val, label=y_val)
#             watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
#                                                                                         # 진행상황 보고싶을때 None을 100으로
#             xgbmodel = xgb.train(params, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
#             cv[val_idx, :] = xgbmodel.predict(dvalid)
#         print(f'seed{seed}_multi_logloss:', log_loss(train_y, cv))
#         score_list.append(log_loss(train_y, cv))
#     return {'loss': np.mean(score_list), 'status': STATUS_OK}

# Hyperopt의 범위를 지정해주고 max_evals만큼 반복한 후 최적의 파라미터를 반환
def xgb_optimize(random_state=0):
    
    space = {
        'eta': hp.quniform('eta', 0.003, 0.009, 0.001),
        #'eta' : 0.005,
        'max_depth':  hp.choice('max_depth', np.arange(8, 30, dtype=int)),
        #'max_depth' : 100,
        'num_leaves':  hp.choice('num_leaves', np.arange(64, 1200, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
        #'min_child_weight': 4,
        'subsample': hp.quniform('subsample', 0.3, 1, 0.05),
        #'subsample': 0.72,
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        #'gamma': 0.7,
        'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.05),
        #'colsample_bytree': 0.45,
        'colsample_bylevel': hp.quniform('colsample_bylevel', 0.1, 1, 0.05),
        #'colsample_bylevel': 0.15,
        'alpha' :  hp.quniform('alpha', 0.01, 1, 0.01),
        'lambda' :  hp.quniform('lambda', 0.01, 1, 0.01),
        'max_delta_step': scope.int(hp.quniform('max_delta_step', 0, 10, 1)),
        'eval_metric': 'mlogloss',
        'objective' : 'multi:softprob',
        'num_class' : 3,
        'seed': 0,
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(xgb_score, space, algo=tpe.suggest, 
                # trials=trials, 
                max_evals=50)
    return best

In [31]:
# xgb_best_hyperparams = xgb_optimize()
# xgb_base_hyperparams = {'objective':'multi:softprob', 'eval_metric': 'mlogloss', 'num_class':3,
#                         'max_delta_step':int(xgb_best_hyperparams['max_delta_step']), 'seed':0}
# xgb_best_hyperparams.update(xgb_base_hyperparams)
# print("The best hyperparameters are:", xgb_best_hyperparams)

## Train & Predict

In [None]:
lucky_seeds=[0, 100, 91373] # 늘려가면서
xgtest = xgb.DMatrix(test_x)
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=10, random_state = seed, shuffle = True) # 늘려가면서
    cv = np.zeros((train.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
        # 직접 파라미터 넣고싶을땐 아래 코드 주석 해제
#         xgb_best_hyperparams = {'colsample_bylevel': 0.15, 'colsample_bytree': 0.45, 'eta': 0.005, 'eval_metric': 'mlogloss', 
#          'gamma': 0.7, 'max_depth': 100, 'min_child_weight': 4, 'num_class': 3, 
#          'objective': 'multi:softprob', 'seed': 0, 'subsample': 0.72}
                                                                                            # 진행상황 보고싶을때 None을 100으로
        rfmodel = xgb.train(xgb_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        cv[val_idx, :] = xgbmodel.predict(dvalid)
        #print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}') # Fold마다 점수 체크하려면 주석 해제
        pred_test += xgbmodel.predict(xgtest) / 10 # CV 바꾸면 이 숫자도 똑같이 바꿔야함
        
    pred_dict['xgb'+str(seed)] = cv
    pred_test_dict['xgb'+str(seed)] = pred_test
    print('multi_logloss:', log_loss(train_y, cv))

xgbmodels_path = os.listdir('./pred_pkl/')
xgbmodels_list = [x for x in xgbmodels_path if x.endswith("xgb.pkl")]
assert len(xgbmodels_list) == 15
xgb_preds = np.zeros((test_x.shape[0], 3))
xgtest = xgb.DMatrix(test_X)

for m in xgbmodels_list:
    xgbmodel = joblib.load('./pred_pkl/'+m)
    xgb_preds_proba = xgbmodel.predict_proba(xgtest
       xgb_preds += xgb_preds_proba/15

## (3) Random Forest

In [57]:
def rf_objective(trial: Trial) -> float:
    params_rf = {
        "random_state": 91373,
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "max_depth": trial.suggest_int("max_depth", 10, 100),
        "max_features": trial.suggest_float("max_features", 0.15, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 1, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_samples": trial.suggest_float("max_samples", 0.5, 1),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'n_jobs': -1, 
    }
    
    # CV=10으로 튜닝
    
    seed = 91373
    kfold = StratifiedKFold(n_splits=10, random_state = seed, shuffle = True) # Cross-validation cv=5
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        rfmodel = RandomForestClassifier(**params_rf)
                                                                                        # 진행상황 보고싶을때 None을 100으로
        rfmodel.fit(x_train, y_train) 
        cv[val_idx, :] = rfmodel.predict_proba(x_val)
    # print('multi_logloss:', log_loss(train_y, cv))

    
    return log_loss(train_y, cv)

In [58]:
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="rf_parameter_opt", direction="minimize", sampler=sampler)
study.optimize(rf_objective, n_trials=50)

rf_best_hyperparams = study.best_trial.params
rf_base_hyperparams = {'bootstrap': True, 'n_jobs': -1, "random_state": 91373}
rf_best_hyperparams.update(rf_base_hyperparams)
print("The best hyperparameters are:\n", rf_best_hyperparams)

[32m[I 2021-05-20 23:46:34,230][0m A new study created in memory with name: rf_parameter_opt[0m
[32m[I 2021-05-20 23:48:01,533][0m Trial 0 finished with value: 0.7766513903157818 and parameters: {'n_estimators': 874, 'max_depth': 96, 'max_features': 0.7721948505396944, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_samples': 0.5779972601681014, 'criterion': 'entropy', 'bootstrap': False}. Best is trial 0 with value: 0.7766513903157818.[0m


The best hyperparameters are:
 {'n_estimators': 874, 'max_depth': 96, 'max_features': 0.7721948505396944, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_samples': 0.5779972601681014, 'criterion': 'entropy', 'bootstrap': False, 'n_jobs': -1, 'random_state': 91373}


### 3 seeds, 10 folds

In [59]:
lucky_seeds=[42,2019,91373] # 늘려가면서
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=10, random_state = seed, shuffle = True) # 늘려가면서
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)
    
    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        rfmodel = RandomForestClassifier(**rf_best_hyperparams)
        
        rfmodel.fit(x_train, y_train)
        #joblib.dump(rfmodel, f'./pred_pkl/RF_{n+1}_fold_{seed}_seed_rf.pkl')
        
        cv[val_idx, :] = rfmodel.predict_proba(x_val)      
        # print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')
        pred_test += rfmodel.predict_proba(test_x) / 10 # CV 바꾸면 이 숫자도 똑같이 바꿔야함
        
    pred_dict['rf'+str(seed)] = cv
    pred_test_dict['rf'+str(seed)] = pred_test
    print('multi_logloss :', log_loss(train_y, cv))

KeyboardInterrupt: 

rfmodels_path = os.listdir('./pred_pkl/')
rfmodels_list = [x for x in rfmodels_path if x.endswith("rf.pkl")]
assert len(rfmodels_list) == 15
rf_preds = np.zeros((test_x.shape[0], 3))

for m in rfmodels_list:
    rfmodel = joblib.load('./pred_pkl/'+m)
    rf_preds_proba = rfmodel.predict_proba(test_x)
    rf_preds += rf_preds_proba/15

## (4) Catboost (성능X)

In [18]:
def cat_objective(trial: Trial) -> float:
    cat_params = {
        'loss_function': 'MultiClass',
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'Poisson']),
        'task_type': 'GPU',
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        #'rsm': trial.suggest_uniform('rsm', 0.3, 1.0),
        'subsample': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.006, 0.018),
        'n_estimators':  25000,
        'max_depth': trial.suggest_categorical('max_depth', [7,10,14,16]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
    }
    
    # CV=10으로 튜닝
    
    seed = 91373
    kfold = StratifiedKFold(n_splits=2, random_state = seed, shuffle = True)
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        
        catmodel = CatBoostClassifier(**cat_params)                                       # 진행상황 보고싶을때 False를 100으로
        catmodel.fit(x_train, y_train, eval_set=[(x_val,y_val)], early_stopping_rounds=30, verbose=False)

        cv[val_idx, :] = catmodel.predict_proba(x_val)
        #print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}') # Fold마다 점수 체크하려면 주석 해제
    print('multi_logloss:', log_loss(train_y, cv))

    
    return log_loss(train_y, cv)

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="cat_parameter_opt", direction="minimize", sampler=sampler)
study.optimize(cat_objective, n_trials=3)

cat_best_hyperparams = study.best_trial.params
cat_base_hyperparams = {'loss_function': 'MultiClass', 'n_estimators': 2500, 'task_type': 'GPU'}
cat_best_hyperparams.update(cat_base_hyperparams)
print("The best hyperparameters are:\n", cat_best_hyperparams)

[32m[I 2021-05-21 00:15:03,308][0m A new study created in memory with name: cat_parameter_opt[0m
[32m[I 2021-05-21 00:17:49,983][0m Trial 0 finished with value: 0.7390998620576282 and parameters: {'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 0.24810409748678125, 'max_bin': 231, 'bagging_fraction': 0.49359671220172163, 'learning_rate': 0.006697003346018394, 'max_depth': 7, 'random_state': 24, 'min_data_in_leaf': 55}. Best is trial 0 with value: 0.7390998620576282.[0m


multi_logloss: 0.7390998620576282


In [66]:
lucky_seeds=[42,2019,91373]
for i, seed in enumerate(lucky_seeds):

    kfold = KFold(n_splits=5, random_state = seed, shuffle = True)
    cv = np.zeros((train.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)
    
    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        _train = Pool(x_train, label=y_train)
        _valid = Pool(x_val, label=y_val)

        catmodel =  CatBoostClassifier(**cat_best_hyperparams)
        
        catmodel.fit(_train, eval_set=_valid, use_best_model=True, verbose=2000)
        #joblib.dump(rfmodel, f'./pred_pkl/RF_{n+1}_fold_{seed}_seed_rf.pkl')
        
        cv[val_idx, :] = catmodel.predict_proba(x_val)        
        pred_test += catmodel.predict_proba(test_x) / 5
        
    pred_dict['cat'+str(i+1)] = cv
    pred_test_dict['cat'+str(i+1)] = pred_test
    print('multi_logloss :', log_loss(true, cv))

0:	learn: 1.0869651	test: 1.0870956	best: 1.0870956 (0)	total: 197ms	remaining: 5h 27m 57s


KeyboardInterrupt: 

## (4) Stacking (AutoLGB)

### 27features = 3seed(42, 2019, 91373) x 3model(lgb, xgb, rf) x 3class(0, 1, 2)

In [None]:
new_pred_dict_1={}
new_pred_test_dict_1={}
for i in range(len(pred_dict)):
    if log_loss(train_y, list(pred_dict.values())[i])<0.68:
        new_pred_dict_1[list(pred_dict.keys())[i]]=list(pred_dict.values())[i]
        new_pred_test_dict_1[list(pred_test_dict.keys())[i]]=list(pred_test_dict.values())[i]

In [None]:
new_pred_dict_2={}
new_pred_test_dict_2={}
for i in range(len(pred_dict)):
    if log_loss(train_y, list(pred_dict.values())[i])<0.69:
        new_pred_dict_2[list(pred_dict.keys())[i]]=list(pred_dict.values())[i]
        new_pred_test_dict_2[list(pred_test_dict.keys())[i]]=list(pred_test_dict.values())[i]

In [None]:
with open('./pkl/new_pred_dict_1.pickle', 'wb') as fw:
    pickle.dump(new_pred_dict_1, fw)
    
with open('./pkl/new_pred_test_dict_1.pickle', 'wb') as fw:
    pickle.dump(new_pred_test_dict_1, fw)

In [None]:
with open('./pkl/new_pred_dict_2.pickle', 'wb') as fw:
    pickle.dump(new_pred_dict_2, fw)
    
with open('./pkl/new_pred_test_dict_2.pickle', 'wb') as fw:
    pickle.dump(new_pred_test_dict_2, fw)

In [None]:
with open('./pkl/new_pred_dict_1.pickle', 'rb') as fw:
    new_pred_dict_1 = pickle.load(fw)

with open('./pkl/new_pred_test_dict_2.pickle', 'rb') as fw:
    new_pred_test_dict_2 = pickle.load(fw)

In [None]:
with open('./pkl/new_pred_dict_2.pickle', 'rb') as fw:
    new_pred_dict_2 = pickle.load(fw)

with open('./pkl/new_pred_test_dict_2.pickle', 'rb') as fw:
    new_pred_test_dict_2 = pickle.load(fw)

In [None]:
X_train = pd.DataFrame(np.hstack([x for _, x in new_pred_dict_2.items()]))
X_test = pd.DataFrame(np.hstack([x for _, x in new_pred_test_dict_2.items()]))

pred = np.zeros((X_train.shape[0], 3), dtype=float)
pred_test = np.zeros((X_test.shape[0], 3), dtype=float)
#kfold = KFold(n_splits=5, random_state = seed, shuffle = True)
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42) # 이건 CV 너무 크게하면 안됨, 3~6까지 테스트해보면 좋을듯

for i_cv, (i_trn, i_val) in enumerate(cv.split(X_train, train_y)):
    if i_cv == 0:
        clf = AutoLGB(objective='multiclass', metric='multi_logloss', params={'num_class': 3}, 
                      feature_selection=False, n_est=10000)
        clf.tune(X_train.iloc[i_trn], train_y[i_trn])
        n_best = clf.n_best
        features = clf.features
        params = clf.params
        print(f'best iteration: {n_best}')
        print(f'selected features ({len(features)}): {features}')        
        print(params)
        clf.fit(X_train.iloc[i_trn], train_y[i_trn])
    else:
        train_data = lgb.Dataset(X_train[features].iloc[i_trn], label=train_y[i_trn])
        clf = lgb.train(params, train_data, n_best, verbose_eval=100)
    
    pred[i_val] = clf.predict(X_train[features].iloc[i_val])
    pred_test += clf.predict(X_test[features]) / 7

In [None]:
print(f'CV Log Loss: {log_loss(train_y, pred):.6f}')

# 결과 제출

In [None]:
submission = sample_submission.copy()
submission.iloc[:, 1:] = pred_test

In [None]:
submission.to_csv('submission/submission.csv', index=False)

# End

### Optuna 시각화

In [None]:
# plot_contour(study, params=['learning_rate',
#                             'max_depth',
#                             'num_leaves',
#                             'colsample_bytree',
#                             'subsample',
#                             'min_child_weight'])

In [None]:
# plot_optimization_history(study) 

In [None]:
# plot_parallel_coordinate(study)

In [None]:
# plot_slice(study)

In [None]:
# optuna.visualization.plot_param_importances(study)

### RandomForest GridSearchCV

In [None]:
# params = {'max_depth': [55, 60, 65] # 튜닝할 파라미터 삽입
#             }

# rf_clf = RandomForestClassifier(random_state = 0, n_estimators = 1000, 
#                                 min_samples_leaf=2, min_samples_split=2,
#                                 criterion='entropy', n_jobs = -1)
# grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, n_jobs = -1)
# grid_cv.fit(df_train, y)

# print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
# print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))