In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import joblib
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

from hyperopt.pyll.base import scope
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import random

# 1. 문제 정의

# 2. 데이터 수집

## (1) 데이콘 기본 데이터

In [13]:
def _read_features(csv_path):
    """Read a competition CSV, drop its 'index' column and fill missing cells with 'NAN'."""
    raw = pd.read_csv(csv_path)
    return raw.drop(columns=['index']).fillna('NAN')


train = _read_features('data/train.csv')
test = _read_features('data/test.csv')
# The sample submission is kept untouched; it is the template for the final output.
sample_submission = pd.read_csv('data/sample_submission.csv')

# 3. 탐색적 데이터 분석

# 4. 변수 조정

In [14]:
def _cycle_remainder(days, unit, period):
    """Return where an elapsed-day count falls inside a repeating cycle.

    `days` is negated and converted to whole `unit`-day steps with floor; the
    whole `period`-step cycles are then subtracted. The cycle count is truncated
    toward zero (``astype(int)``), so a negative step count gives a negative
    remainder.
    """
    steps = np.floor((-days) / unit)
    return steps - ((steps / period).astype(int) * period)


# Stack train and test so every engineered feature is computed identically on both.
merge_data = pd.concat([train, test], axis=0)

# DAYS_BIRTH: month-of-year (30-day months) and week-of-month (4-week months) parts
merge_data['DAYS_BIRTH_month'] = _cycle_remainder(merge_data['DAYS_BIRTH'], 30, 12)
merge_data['DAYS_BIRTH_week'] = _cycle_remainder(merge_data['DAYS_BIRTH'], 7, 4)

# DAYS_EMPLOYED
merge_data['DAYS_EMPLOYED_month'] = _cycle_remainder(merge_data['DAYS_EMPLOYED'], 30, 12)
merge_data['DAYS_EMPLOYED_week'] = _cycle_remainder(merge_data['DAYS_EMPLOYED'], 7, 4)

# before_EMPLOYED: DAYS_BIRTH minus DAYS_EMPLOYED
merge_data['before_EMPLOYED'] = merge_data['DAYS_BIRTH'] - merge_data['DAYS_EMPLOYED']
merge_data['before_EMPLOYED_month'] = _cycle_remainder(merge_data['before_EMPLOYED'], 30, 12)
merge_data['before_EMPLOYED_week'] = _cycle_remainder(merge_data['before_EMPLOYED'], 7, 4)

# Date-derived features divided by total income.
# Column creation order is preserved on purpose: it determines the column order of
# `train`, which later drives the order of the feature-selection loop.
for ratio_name, numerator in [
    ('1new_1', 'DAYS_BIRTH_month'),
    ('2new_1', 'DAYS_BIRTH_week'),
    ('10new_1', 'DAYS_EMPLOYED_month'),
    ('11new_1', 'DAYS_EMPLOYED_week'),
    ('12new_1', 'before_EMPLOYED'),
    ('13new_1', 'before_EMPLOYED_month'),
    ('14new_1', 'before_EMPLOYED_week'),
]:
    merge_data[ratio_name] = merge_data[numerator] / merge_data['income_total']

# Total income divided by family size
merge_data['15new_1'] = merge_data['income_total'] / merge_data['family_size']

# Combination features that were tried and removed:
#merge_data['3new_1'] = merge_data['DAYS_EMPLOYED_month'] / merge_data['DAYS_BIRTH_month']
#merge_data['4new_1'] = merge_data['DAYS_EMPLOYED_month'] / merge_data['DAYS_BIRTH_week']
#merge_data['5new_1'] = merge_data['DAYS_EMPLOYED_week'] / merge_data['DAYS_BIRTH_month']
#merge_data['6new_1'] = merge_data['DAYS_EMPLOYED_week'] / merge_data['DAYS_BIRTH_week']
#merge_data['7new_1'] =  merge_data['begin_month'] / merge_data['DAYS_BIRTH_month']
#merge_data['8new_1'] =  merge_data['begin_month'] / merge_data['DAYS_EMPLOYED_month']
#merge_data['9new_1'] =  merge_data['begin_month'] / merge_data['before_EMPLOYED_month']

for ratio_name, numerator in [
    ('new_1', 'child_num'),
    ('new_2', 'family_size'),
    ('new_3', 'DAYS_BIRTH'),
    ('new_4', 'DAYS_EMPLOYED'),
    #('new_5', 'begin_month'),
]:
    merge_data[ratio_name] = merge_data[numerator] / merge_data['income_total']
merge_data['new_6'] = merge_data['DAYS_EMPLOYED'] / merge_data['DAYS_BIRTH']

# Handle the skewed income distribution
merge_data['log1p_income_total'] = np.log1p(merge_data['income_total'])
#merge_data['log_income_total'] = np.log(merge_data['income_total'])
#merge_data['sqrt_income_total'] = np.sqrt(merge_data['income_total'])
#merge_data['boxcox_income_total'] = stats.boxcox(merge_data['income_total'])[0]

# Test rows have no 'credit' value after the concat, so filling with -999 marks them.
merge_data = merge_data.fillna(-999)
is_test_row = merge_data['credit'] == -999
train = merge_data[~is_test_row]
# Build the test frame without an in-place drop on a filtered slice, and give it a
# clean 0..n-1 index so later column-wise concatenation aligns row by row.
test = merge_data[is_test_row].drop(columns='credit').reset_index(drop=True)

# Move the label to the last column.
train = train[[name for name in train.columns if name != 'credit'] + ['credit']]

# Drop training rows with 7 or more children.
train = train[train['child_num'] <= 6].reset_index(drop=True)

## 인코딩

In [15]:
# Independent working copies: the *_oh frames are one-hot encoded below,
# the *_noh frames get pandas 'category' columns instead.
train_oh, train_noh = train.copy(), train.copy()
test_oh, test_noh = test.copy(), test.copy()

In [16]:
object_col = []
# Cast every string (object dtype) column of the non-one-hot frames to 'category'.
text_columns = [name for name in train_noh.columns if train_noh[name].dtype == 'object']
for col in text_columns:
    train_noh[col] = train_noh[col].astype('category')
    test_noh[col] = test_noh[col].astype('category')

In [17]:
# Names of the string (object dtype) columns that get one-hot encoded.
object_col = [col for col in train_oh.columns if train_oh[col].dtype == 'object']
print(object_col)

# The encoder is fitted on the training rows only.
enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# Newer scikit-learn exposes get_feature_names_out; older releases only have
# get_feature_names. Use whichever this installation provides.
_feature_namer = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
onehot_columns = list(_feature_namer(object_col))

# Each indicator frame reuses the index of the frame it is joined to, so the
# column-wise concat aligns row by row regardless of the frame's index.
train_onehot_df = pd.DataFrame(enc.transform(train_oh.loc[:, object_col]).toarray(),
                               columns=onehot_columns, index=train_oh.index)
train_oh = pd.concat([train_oh.drop(columns=object_col), train_onehot_df], axis=1)

test_onehot_df = pd.DataFrame(enc.transform(test_oh.loc[:, object_col]).toarray(),
                              columns=onehot_columns, index=test_oh.index)
test_oh = pd.concat([test_oh.drop(columns=object_col), test_onehot_df], axis=1)

['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']


## Feature 하나씩 빼면서 성능 체크

변수 하나씩 제거하면서 성능 체크<br>
제거하여 성능이 좋게 나온 것들은 리스트에 따로 저장해두기

In [18]:
# Separate the 'credit' label from the category-encoded training frame.
train_y = train_noh['credit']
train_x = train_noh.drop(columns=['credit'])
test_x = test_noh.copy()

In [19]:
# Every column of train except the label; candidates for the selection loop.
train_columns = [name for name in train.columns if name != 'credit']

In [None]:
# feature_selection = ['income_total','begin_month']

# for i in train_columns:
#     if i in feature_selection:
#         pass
#     else: feature_selection.append(i)
# feature_selection


In [22]:
# 기본 스코어
score = []
feature_selection = ['income_total','begin_month',
                     'family_size','house_type',
                     'DAYS_BIRTH','DAYS_EMPLOYED','work_phone','occyp_type','DAYS_BIRTH_month','DAYS_EMPLOYED_month','DAYS_EMPLOYED_week'
                    , 'before_EMPLOYED','1new_1','2new_1','log1p_income_total', 'gender','car','reality','edu_type','family_type','before_EMPLOYED_month'
                    , '10new_1', '12new_1','13new_1','14new_1','new_1','child_num'
                    ,'DAYS_BIRTH_week','new_2','income_type']

train_x = train_noh.loc[:,feature_selection]    
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = np.zeros((train_x.shape[0], 3))

for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
    x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
    lgbm = LGBMClassifier(n_estimators=1000, objective='multiclass')
    lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
    cv[val_idx] = lgbm.predict_proba(x_val)
print(f' multi_logloss: {log_loss(train_y, cv)}')
score.append(log_loss(train_y, cv))

 multi_logloss: 0.71282953909921


In [20]:
# Feature set 1: the starting feature list, cached to disk for the modelling section.
input1 = ['income_total','begin_month',
                     'family_size','house_type',
                     'DAYS_BIRTH','DAYS_EMPLOYED','work_phone','occyp_type','DAYS_BIRTH_month','DAYS_EMPLOYED_month','DAYS_EMPLOYED_week'
                    , 'before_EMPLOYED','1new_1','2new_1','log1p_income_total', 'gender','car','reality','edu_type','family_type','before_EMPLOYED_month'
                    , '10new_1', '12new_1','13new_1','14new_1','new_1','child_num'
                    ,'DAYS_BIRTH_week','new_2','income_type']

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('data/feature_selection', exist_ok=True)

train_x1 = train_noh.loc[:, input1]
train_x1.to_csv('data/feature_selection/train_x1.csv', index=False)

test_x1 = test_noh.loc[:, input1]
test_x1.to_csv('data/feature_selection/test_x1.csv', index=False)


In [23]:
# Try each remaining column in turn: add it, re-run the 5-fold CV, and keep it
# only when the out-of-fold log loss beats the best score recorded so far.
for candidate in train_columns:
    if candidate in feature_selection:
        continue
    feature_selection.append(candidate)

    train_x = train_noh.loc[:, feature_selection]
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    cv = np.zeros((train_x.shape[0], 3))

    for train_idx, val_idx in kfold.split(train_x, train_y):
        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        lgbm = LGBMClassifier(n_estimators=1000, objective='multiclass')
        lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
        cv[val_idx] = lgbm.predict_proba(x_val)

    candidate_loss = log_loss(train_y, cv)
    print(f'{candidate} multi_logloss: {candidate_loss}')

    if candidate_loss < score[-1]:
        score.append(candidate_loss)
    else:
        # No improvement: take the candidate back out.
        feature_selection.remove(candidate)

FLAG_MOBIL multi_logloss: 0.71282953909921
phone multi_logloss: 0.7121395429164792
email multi_logloss: 0.7119200471102657
before_EMPLOYED_week multi_logloss: 0.7100241437950773
11new_1 multi_logloss: 0.7107256761136765
15new_1 multi_logloss: 0.7118014387952728
new_3 multi_logloss: 0.7119667075104508
new_4 multi_logloss: 0.7103687272671587
new_6 multi_logloss: 0.7087851189202885


In [24]:
# Display the feature list as it stands after the selection loop.
feature_selection

['income_total',
 'begin_month',
 'family_size',
 'house_type',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'work_phone',
 'occyp_type',
 'DAYS_BIRTH_month',
 'DAYS_EMPLOYED_month',
 'DAYS_EMPLOYED_week',
 'before_EMPLOYED',
 '1new_1',
 '2new_1',
 'log1p_income_total',
 'gender',
 'car',
 'reality',
 'edu_type',
 'family_type',
 'before_EMPLOYED_month',
 '10new_1',
 '12new_1',
 '13new_1',
 '14new_1',
 'new_1',
 'child_num',
 'DAYS_BIRTH_week',
 'new_2',
 'income_type',
 'phone',
 'email',
 'before_EMPLOYED_week',
 'new_6']

In [25]:
# Feature set 2: snapshot of the selected features. Copy the list so later
# changes to `feature_selection` cannot silently alter `input2`.
input2 = list(feature_selection)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('data/feature_selection', exist_ok=True)

train_x2 = train_noh.loc[:, input2]
train_x2.to_csv('data/feature_selection/train_x2.csv', index=False)

test_x2 = test_noh.loc[:, input2]
test_x2.to_csv('data/feature_selection/test_x2.csv', index=False)


In [28]:
# Cache the training labels next to the cached feature sets.
train_y.to_csv('data/feature_selection/train_y.csv', index=False)

## 다시 인코딩 (lgb돌릴땐 할필요 없음)

In [None]:
# train_oh = train.copy()
# train_noh = train.copy()
# test_oh = test.copy()
# test_noh = test.copy()

In [None]:
# object_col = []
# for col in train_noh.columns:
#     if train_noh[col].dtype == 'object':
#         train_noh[col] = train_noh[col].astype('category')
#         test_noh[col] = test_noh[col].astype('category')

In [None]:
# object_col = []
# for col in train_oh.columns:
#     if train_oh[col].dtype == 'object':
#         object_col.append(col)
# print(object_col)        
# enc = OneHotEncoder()
# enc.fit(train.loc[:,object_col])


# train_onehot_df = pd.DataFrame(enc.transform(train_oh.loc[:,object_col]).toarray(), 
#              columns=enc.get_feature_names(object_col))
# train_oh.drop(object_col, axis=1, inplace=True)
# train_oh = pd.concat([train_oh, train_onehot_df], axis=1)    

# test_onehot_df = pd.DataFrame(enc.transform(test_oh.loc[:,object_col]).toarray(), 
#              columns=enc.get_feature_names(object_col))
# test_oh.drop(object_col, axis=1, inplace=True)
# test_oh = pd.concat([test_oh, test_onehot_df], axis=1)

In [2]:
# Reload the cached feature sets written by the feature-selection section.
train_x1 = pd.read_csv('data/feature_selection/train_x1.csv').fillna('NAN')
test_x1 = pd.read_csv('data/feature_selection/test_x1.csv').fillna('NAN')
train_x2 = pd.read_csv('data/feature_selection/train_x2.csv').fillna('NAN')
test_x2 = pd.read_csv('data/feature_selection/test_x2.csv').fillna('NAN')
sample_submission = pd.read_csv('data/sample_submission.csv')

# CSV does not store dtypes: columns saved as 'category' come back as plain
# strings. Restore the categorical dtype, sharing the train categories with the
# matching test frame so both sides are encoded consistently.
for cached_train, cached_test in ((train_x1, test_x1), (train_x2, test_x2)):
    for text_col in cached_train.select_dtypes(include='object').columns:
        cached_train[text_col] = cached_train[text_col].astype('category')
        cached_test[text_col] = cached_test[text_col].astype(cached_train[text_col].dtype)

# 6. 모델 학습

In [29]:
# Collected by the model sections below and consumed by the stacking step:
# out-of-fold predictions per model key ...
pred_dict = {}
# ... and the fold-averaged test predictions under the same keys.
pred_test_dict = {}

## (1) Lightgbm

In [12]:
# train_x = train_noh.drop(['credit'], axis=1)
# train_y = train_noh['credit']
# test_x = test_noh.copy()

NameError: name 'train_noh' is not defined

In [30]:
# Use feature set 1 for the LightGBM section.
train_x, test_x = train_x1.copy(), test_x1.copy()

### Parameter Tuning

In [None]:
SEED=2000


def score(params):
    """Hyperopt objective: out-of-fold log loss of LGBMClassifier under 5-fold stratified CV.

    Reads the notebook-level `train_x` / `train_y`, fits one `LGBMClassifier(**params)`
    per fold with early stopping (30 rounds) on that fold's validation split, and
    returns ``{'loss': <overall out-of-fold log loss>, 'status': STATUS_OK}``.
    """
    print("Training with params: ")
    print(params)

    splitter = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    oof = np.zeros((train_x.shape[0], 3))  # out-of-fold class probabilities

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(train_x, train_y), start=1):
        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

        model = LGBMClassifier(**params)
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=100)

        oof[val_idx, :] = model.predict_proba(x_val)
        print(f'fold{fold_no} multi_logloss: {log_loss(y_val, oof[val_idx, :])}')

    overall_loss = log_loss(train_y, oof)
    print('multi_logloss:', overall_loss)
    return {'loss': overall_loss, 'status': STATUS_OK}

def optimize(random_state=SEED):
    """Run hyperopt's `fmin` on `score` for 12 evaluations and return its result.

    NOTE(review): `random_state` is accepted but never passed on to `fmin`.
    Every entry of the space is currently a fixed value (the `hp.quniform`
    ranges that were explored earlier are kept as comments), so each of the 12
    evaluations receives the same parameters.
    """
    space = dict(
        learning_rate=0.0045,     # was: hp.quniform('learning_rate', 0.004, 0.006, 0.001)
        num_leaves=1000,          # was: scope.int(hp.quniform('num_leaves', 1000, 1200, 50))
        min_child_weight=2,       # was: hp.quniform('min_child_weight', 1, 3, 1)
        subsample=1,              # was: hp.quniform('subsample', 0.8, 1, 0.05)
        colsample_bytree=0.4,     # was: hp.quniform('colsample_bytree', 0.3, 0.7, 0.05)
        max_depth=-1,
        n_estimators=5000,
        objective='multiclass',
        num_class=3,
        seed=SEED,
        reg_alpha=0.94,
        reg_lambda=0.98,
    )
    # Let hyperopt's TPE algorithm minimise the CV log loss returned by `score`.
    return fmin(score, space, algo=tpe.suggest, max_evals=12)


best_hyperparams = optimize()
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

Training with params:                                 
{'colsample_bytree': 0.4, 'learning_rate': 0.0045, 'max_depth': -1, 'min_child_weight': 2, 'n_estimators': 5000, 'num_class': 3, 'num_leaves': 1000, 'objective': 'multiclass', 'reg_alpha': 0.94, 'reg_lambda': 0.98, 'seed': 2000, 'subsample': 1}
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.814234               
[200]	valid_0's multi_logloss: 0.770095               
[300]	valid_0's multi_logloss: 0.742408               
[400]	valid_0's multi_logloss: 0.722981               
[500]	valid_0's multi_logloss: 0.710038               
[600]	valid_0's multi_logloss: 0.701622               
[700]	valid_0's multi_logloss: 0.695501               
[800]	valid_0's multi_logloss: 0.691893               
[900]	valid_0's multi_logloss: 0.689553               
[1000]	valid_0's multi_logloss: 0.688487              
Early stopping, best iteration is:                    
[1052]	valid_0's multi_logloss: 0.

### 3 seeds x 10 folds

In [37]:
# Inspect the test-side frame of feature set 1.
test_x1

Unnamed: 0,income_total,begin_month,family_size,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,occyp_type,DAYS_BIRTH_month,DAYS_EMPLOYED_month,DAYS_EMPLOYED_week,before_EMPLOYED,1new_1,2new_1,log1p_income_total,gender,car,reality,edu_type,family_type,before_EMPLOYED_month,10new_1,12new_1,13new_1,14new_1,new_1,child_num,DAYS_BIRTH_week,new_2,income_type
0,112500.0,-60.0,2.0,House / apartment,-21990,365243,0,NAN,1.0,-7.0,-2.0,-387233,0.000009,0.000009,11.630717,M,Y,N,Secondary / secondary special,Civil marriage,7.0,-0.000062,-3.442071,0.000062,0.000027,0.0,0,1.0,0.000018,Pensioner
1,135000.0,-36.0,2.0,House / apartment,-18964,-8671,0,Core staff,8.0,1.0,2.0,-10293,0.000059,0.000007,11.813037,F,N,Y,Higher education,Married,7.0,0.000007,-0.076244,0.000052,0.000015,0.0,0,1.0,0.000015,State servant
2,69372.0,-40.0,2.0,House / apartment,-15887,-217,1,Laborers,1.0,7.0,3.0,-15670,0.000014,0.000014,11.147253,F,N,Y,Secondary / secondary special,Married,6.0,0.000101,-0.225884,0.000086,0.000029,0.0,0,1.0,0.000029,Working
3,112500.0,-41.0,2.0,House / apartment,-19270,-2531,1,Drivers,6.0,0.0,1.0,-16739,0.000053,0.000000,11.630717,M,Y,N,Secondary / secondary special,Married,5.0,0.000000,-0.148791,0.000044,0.000027,0.0,0,0.0,0.000018,Commercial associate
4,225000.0,-8.0,2.0,House / apartment,-17822,-9385,1,Managers,6.0,0.0,0.0,-8437,0.000027,0.000009,12.323860,F,Y,Y,Higher education,Married,5.0,0.000000,-0.037498,0.000022,0.000004,0.0,0,2.0,0.000009,State servant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,202500.0,-19.0,2.0,House / apartment,-18593,-5434,1,Accountants,7.0,1.0,0.0,-13159,0.000035,0.000000,12.218500,F,Y,Y,Incomplete higher,Married,6.0,0.000005,-0.064983,0.000030,0.000015,0.0,0,0.0,0.000010,Working
9996,202500.0,-34.0,2.0,House / apartment,-10886,-1315,1,Laborers,2.0,7.0,3.0,-9571,0.000010,0.000015,12.218500,M,Y,Y,Secondary / secondary special,Civil marriage,7.0,0.000035,-0.047264,0.000035,0.000015,0.0,0,3.0,0.000010,Working
9997,292500.0,-55.0,2.0,House / apartment,-21016,-14018,0,Medicine staff,4.0,11.0,2.0,-6998,0.000014,0.000007,12.586223,F,N,Y,Secondary / secondary special,Married,5.0,0.000038,-0.023925,0.000017,0.000010,0.0,0,2.0,0.000007,Working
9998,180000.0,-33.0,2.0,House / apartment,-16541,-1085,0,NAN,11.0,0.0,3.0,-15456,0.000061,0.000017,12.100718,F,Y,N,Secondary / secondary special,Married,11.0,0.000000,-0.085867,0.000061,0.000000,0.0,0,3.0,0.000011,Commercial associate


In [39]:
lucky_seeds = [42, 2019, 91373]  # extend with more seeds as needed
n_splits = 10                    # number of CV folds; raise as needed

for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    cv = np.zeros((train_x1.shape[0], 3))                     # out-of-fold probabilities
    pred_test = np.zeros((test_x1.shape[0], 3), dtype=float)  # fold-averaged test probabilities

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x1, train_y)):

        x_train, x_val = train_x1.iloc[train_idx], train_x1.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(learning_rate=0.005, objective='multiclass', n_estimators=10000, num_leaves=1000,
                                  max_depth=-1, min_child_weight=2, colsample_bytree=0.6, reg_alpha=0.94, reg_lambda=0.98,
                                  n_jobs=-1, random_state=seed)

        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=100)
        #joblib.dump(lgbmodel, f'./pred_pkl/LGB_{n+1}_fold_{seed}_seed_lgb.pkl')

        # Cross-validation: store this fold's validation predictions.
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        # Average over folds; dividing by n_splits keeps this in sync with the
        # splitter instead of relying on a separately maintained literal.
        pred_test += lgbmodel.predict_proba(test_x1) / n_splits

    pred_dict['lgb'+str(i+1)] = cv
    pred_test_dict['lgb'+str(i+1)] = pred_test

    print('multi_logloss :', log_loss(train_y, cv))

Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.791807
[200]	valid_0's multi_logloss: 0.741585
[300]	valid_0's multi_logloss: 0.713539
[400]	valid_0's multi_logloss: 0.6962
[500]	valid_0's multi_logloss: 0.68591
[600]	valid_0's multi_logloss: 0.679928
[700]	valid_0's multi_logloss: 0.677246
[800]	valid_0's multi_logloss: 0.676356
Early stopping, best iteration is:
[836]	valid_0's multi_logloss: 0.676213
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.793394
[200]	valid_0's multi_logloss: 0.743415
[300]	valid_0's multi_logloss: 0.713765
[400]	valid_0's multi_logloss: 0.695178
[500]	valid_0's multi_logloss: 0.684008
[600]	valid_0's multi_logloss: 0.677174
[700]	valid_0's multi_logloss: 0.673221
[800]	valid_0's multi_logloss: 0.671414
[900]	valid_0's multi_logloss: 0.671073
Early stopping, best iteration is:
[894]	valid_0's multi_logloss: 0.671042
Training until validation scores don't improve for 

[200]	valid_0's multi_logloss: 0.740928
[300]	valid_0's multi_logloss: 0.712435
[400]	valid_0's multi_logloss: 0.695826
[500]	valid_0's multi_logloss: 0.685905
[600]	valid_0's multi_logloss: 0.680546
[700]	valid_0's multi_logloss: 0.677872
[800]	valid_0's multi_logloss: 0.67714
Early stopping, best iteration is:
[816]	valid_0's multi_logloss: 0.677085
multi_logloss : 0.6873519871514427
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.795908
[200]	valid_0's multi_logloss: 0.74973
[300]	valid_0's multi_logloss: 0.723283
[400]	valid_0's multi_logloss: 0.707757
[500]	valid_0's multi_logloss: 0.699072
[600]	valid_0's multi_logloss: 0.694494
[700]	valid_0's multi_logloss: 0.692528
Early stopping, best iteration is:
[746]	valid_0's multi_logloss: 0.692107
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.79014
[200]	valid_0's multi_logloss: 0.741489
[300]	valid_0's multi_logloss: 0.714262
[400]	valid_0's 

KeyboardInterrupt: 

# Alternative path: rebuild the LightGBM test prediction from pickled fold models.
lgbmodels_path = os.listdir('./pred_pkl/')
lgbmodels_list = [x for x in lgbmodels_path if x.endswith("lgb.pkl")]
# The number of saved models depends on seeds x folds, so only require that some exist.
assert lgbmodels_list, "no '*lgb.pkl' models found in ./pred_pkl/"
lgb_preds = np.zeros((test_x.shape[0], 3))

for m in lgbmodels_list:
    lgbmodel = joblib.load('./pred_pkl/'+m)
    # Predict on the selected-feature frame (`test_x`), the same frame that sizes
    # `lgb_preds`, rather than on the full `test` frame.
    lgb_preds_proba = lgbmodel.predict_proba(test_x)
    lgb_preds += lgb_preds_proba / len(lgbmodels_list)

## (2) XGBoost

원핫인코딩된 feature로 만들어주기 **꼭 밑에 코드 실행하고 XGBoost랑 Randomforest 돌리기!!**

In [None]:
# Switch to the one-hot encoded frames for XGBoost and Random Forest.
train_y = train_oh['credit']
train_x = train_oh.drop(columns=['credit'])
test_x = test_oh.copy()

### Parameter Tuning (hyperopt)

이거 오래걸리므로 안해도됨, 그리고 이미 했음

In [None]:
SEED=0


def score(params):
    """Hyperopt objective: out-of-fold log loss of an XGBoost model under 5-fold stratified CV.

    Reads the notebook-level `train_x` / `train_y`, trains one booster per fold
    with `xgb.train` (up to 100000 rounds, early stopping after 30 rounds on the
    watchlist) and returns
    ``{'loss': <overall out-of-fold log loss>, 'status': STATUS_OK}``.
    """
    print("Training with params: ")
    print(params)

    splitter = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    oof = np.zeros((train_x.shape[0], 3))  # out-of-fold class probabilities

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(train_x, train_y), start=1):
        y_val = train_y.iloc[val_idx]
        dtrain = xgb.DMatrix(train_x.iloc[train_idx], label=train_y.iloc[train_idx])
        dvalid = xgb.DMatrix(train_x.iloc[val_idx], label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        booster = xgb.train(params, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=100)

        oof[val_idx, :] = booster.predict(dvalid)
        print(f'fold{fold_no} multi_logloss: {log_loss(y_val, oof[val_idx, :])}')

    overall_loss = log_loss(train_y, oof)
    print('multi_logloss:', overall_loss)
    return {'loss': overall_loss, 'status': STATUS_OK}

def optimize(random_state=SEED):
    """Search the XGBoost space with hyperopt and return the complete best parameter set.

    Runs `fmin` on `score` for 10 evaluations. `fmin` itself returns only the
    values of the sampled `hp.*` entries, so the result is passed through
    `space_eval` to put the fixed entries (max_depth, eval_metric, objective,
    num_class, seed) back in. The returned dict can therefore be handed to
    `xgb.train` as-is.

    NOTE(review): `random_state` is accepted but not forwarded to `fmin`.
    """
    # Local import: the notebook's import cell does not bring in space_eval.
    from hyperopt import space_eval

    space = {
        'eta': hp.quniform('eta', 0.003, 0.006, 0.001),
        'min_child_weight': hp.quniform('min_child_weight', 1, 3, 1),
        'subsample': hp.quniform('subsample', 0.6, 0.8, 0.05),
        'gamma': hp.quniform('gamma', 0.6, 0.8, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.3, 0.7, 0.05),
        'colsample_bylevel': hp.quniform('colsample_bylevel', 0.3, 0.7, 0.05),
        'max_depth' : 100,
        'eval_metric': 'mlogloss',
        'objective' : 'multi:softprob',
        'num_class' : 3,
        'seed': SEED,
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest,
                # trials=trials,
                max_evals=10)
    return space_eval(space, best)


best_hyperparams = optimize()
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)
params = best_hyperparams

### 3 seeds x 10 folds

In [None]:
lucky_seeds = [42, 2019, 91373]  # extend with more seeds as needed
n_splits = 10                    # number of CV folds; raise as needed

# Previously tuned values plus the fixed multi-class settings (taken from the
# parameter note that used to sit in this cell). They act as defaults so that
# objective / num_class / eval_metric / max_depth are present even when `params`
# holds only the values hyperopt sampled.
xgb_tuned_params = {'colsample_bylevel': 0.5, 'colsample_bytree': 0.4, 'eta': 0.003, 'eval_metric': 'mlogloss',
                    'gamma': 0.7, 'max_depth': 100, 'min_child_weight': 2.0, 'num_class': 3,
                    'objective': 'multi:softprob', 'seed': 0, 'subsample': 0.7}

xgtest = xgb.DMatrix(test_x)
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    # Size the out-of-fold array from the frame that is actually split.
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    # `params` from the tuning cell overrides the defaults; the loop seed
    # overrides both so each seed really trains a differently seeded model,
    # matching the LightGBM and Random Forest loops.
    param = {**xgb_tuned_params, **params, 'seed': seed}

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        xgbmodel = xgb.train(param, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=100)
        #joblib.dump(xgbmodel, f'./pred_pkl/XGB_{n+1}_fold_{seed}_seed_xgb.pkl')

        cv[val_idx, :] = xgbmodel.predict(dvalid)
        print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')
        # Average over folds; tied to n_splits instead of a separate literal.
        pred_test += xgbmodel.predict(xgtest) / n_splits

    pred_dict['xgb'+str(i+1)] = cv
    pred_test_dict['xgb'+str(i+1)] = pred_test
    print('multi_logloss:', log_loss(train_y, cv))

# Alternative path: rebuild the XGBoost test prediction from pickled fold models.
xgbmodels_path = os.listdir('./pred_pkl/')
xgbmodels_list = [x for x in xgbmodels_path if x.endswith("xgb.pkl")]
# The number of saved models depends on seeds x folds, so only require that some exist.
assert xgbmodels_list, "no '*xgb.pkl' models found in ./pred_pkl/"
xgb_preds = np.zeros((test_x.shape[0], 3))
xgtest = xgb.DMatrix(test_x)

for m in xgbmodels_list:
    xgbmodel = joblib.load('./pred_pkl/'+m)
    # The training loop creates these models with xgb.train, i.e. Booster objects,
    # which expose predict() rather than predict_proba().
    xgb_preds_proba = xgbmodel.predict(xgtest)
    xgb_preds += xgb_preds_proba / len(xgbmodels_list)

## (3) Random Forest

### GridSearchCV

# Grid of Random Forest parameters to tune. Kept under its own name so the
# `params` dict used by the XGBoost cells is not overwritten.
rf_param_grid = {'max_depth': [55, 60, 65]}

rf_clf = RandomForestClassifier(random_state = 0, n_estimators = 1000,
                                min_samples_leaf=2, min_samples_split=2,
                                criterion='entropy', n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = rf_param_grid, cv = 5, n_jobs = -1)
# Fit on the one-hot frames used by the rest of this section; `df_train` / `y`
# are not defined anywhere in the notebook.
grid_cv.fit(train_x, train_y)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

### 5 seeds, 10 folds

In [None]:
lucky_seeds = [42, 2019, 91373, 53, 1]  # extend with more seeds as needed
n_splits = 10                           # number of CV folds; raise as needed

for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    cv = np.zeros((train_x.shape[0], 3))                     # out-of-fold probabilities
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)  # fold-averaged test probabilities

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        # n_jobs=-1 builds the 1200 trees in parallel, as the grid-search cell already does.
        rfmodel = RandomForestClassifier(n_estimators=1200, criterion='entropy', max_depth=60,
                                         min_samples_leaf=2, min_samples_split=2,
                                         random_state=seed, n_jobs=-1)
        rfmodel.fit(x_train, y_train)
        #joblib.dump(rfmodel, f'./pred_pkl/RF_{n+1}_fold_{seed}_seed_rf.pkl')

        cv[val_idx, :] = rfmodel.predict_proba(x_val)
        print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')
        # Average over folds; tied to n_splits instead of a separate literal.
        pred_test += rfmodel.predict_proba(test_x) / n_splits

    pred_dict['rf'+str(i+1)] = cv
    pred_test_dict['rf'+str(i+1)] = pred_test
    print('multi_logloss :', log_loss(train_y, cv))

# Alternative path: rebuild the Random Forest test prediction from pickled fold models.
rfmodels_path = os.listdir('./pred_pkl/')
rfmodels_list = [x for x in rfmodels_path if x.endswith("rf.pkl")]
# The number of saved models depends on seeds x folds, so only require that some exist.
assert rfmodels_list, "no '*rf.pkl' models found in ./pred_pkl/"
rf_preds = np.zeros((test_x.shape[0], 3))

for m in rfmodels_list:
    rfmodel = joblib.load('./pred_pkl/'+m)
    rf_preds_proba = rfmodel.predict_proba(test_x)
    # Plain average over however many models were found.
    rf_preds += rf_preds_proba / len(rfmodels_list)

## (4) Catboost (성능X)

# NOTE(review): `Pool` and `CatBoostClassifier` are not imported anywhere in the
# notebook; this cell needs `from catboost import CatBoostClassifier, Pool` before
# it can run. Running it also adds 'cat*' entries to pred_dict / pred_test_dict,
# which the stacking step then picks up as extra features.
lucky_seeds = [42, 2019, 91373]
n_splits = 5  # number of CV folds

for i, seed in enumerate(lucky_seeds):

    kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    # Size and split by the feature frame that is actually indexed below.
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        _train = Pool(x_train, label=y_train)
        _valid = Pool(x_val, label=y_val)

        catmodel = CatBoostClassifier(loss_function='MultiClass', early_stopping_rounds=50,
                                      random_state=seed, learning_rate=0.02, iterations=100000
                                      #task_type="GPU"
                                      )

        catmodel.fit(_train, eval_set=_valid, use_best_model=True, verbose=2000)
        #joblib.dump(catmodel, f'./pred_pkl/CAT_{n+1}_fold_{seed}_seed_cat.pkl')

        cv[val_idx, :] = catmodel.predict_proba(x_val)
        # Average over folds; tied to n_splits instead of a separate literal.
        pred_test += catmodel.predict_proba(test_x) / n_splits

    pred_dict['cat'+str(i+1)] = cv
    pred_test_dict['cat'+str(i+1)] = pred_test
    # Score against the training labels (`true` was never defined).
    print('multi_logloss :', log_loss(train_y, cv))

## (5) Stacking (AutoLGB)

### 27features = 3seed(42, 2019, 91373) x 3model(lgb, xgb, rf) x 3class(0, 1, 2)

In [None]:
# Level-2 features: the class probabilities of every base model, side by side.
# Both dicts are filled with the same keys in the same order by the cells above.
X_train = pd.DataFrame(np.hstack([x for _, x in pred_dict.items()]))
X_test = pd.DataFrame(np.hstack([x for _, x in pred_test_dict.items()]))

n_splits = 4  # keep this small here; values from 3 to 6 are worth testing

pred = np.zeros((X_train.shape[0], 3), dtype=float)
pred_test = np.zeros((X_test.shape[0], 3), dtype=float)
#kfold = KFold(n_splits=5, random_state = seed, shuffle = True)
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for i_cv, (i_trn, i_val) in enumerate(cv.split(X_train, train_y)):
    # The splitter yields positions, so select labels by position as well.
    y_trn = train_y.iloc[i_trn]
    if i_cv == 0:
        # Tune once on the first fold; the later folds reuse what was found here.
        clf = AutoLGB(objective='multiclass', metric='multi_logloss', params={'num_class': 3},
                      feature_selection=False, n_est=10000)
        clf.tune(X_train.iloc[i_trn], y_trn)
        n_best = clf.n_best
        features = clf.features
        params = clf.params
        print(f'best iteration: {n_best}')
        print(f'selected features ({len(features)}): {features}')
        print(params)
        clf.fit(X_train.iloc[i_trn], y_trn)
    else:
        train_data = lgb.Dataset(X_train[features].iloc[i_trn], label=y_trn)
        clf = lgb.train(params, train_data, n_best, verbose_eval=100)

    pred[i_val] = clf.predict(X_train[features].iloc[i_val])
    # Average over folds; tied to n_splits instead of a separate literal.
    pred_test += clf.predict(X_test[features]) / n_splits

In [None]:
# Inspect which base-model predictions were collected for stacking.
pred_dict.items()

In [None]:
# Log loss of the stacked out-of-fold predictions against the training labels.
print(f'CV Log Loss: {log_loss(train_y, pred):.6f}')

# 결과 제출

In [None]:
# Start from the sample submission and overwrite every column after the first
# with the stacked test predictions in `pred_test`.
submission = sample_submission.copy()
submission.iloc[:, 1:] = pred_test

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('submission', exist_ok=True)
submission.to_csv('submission/submission.csv', index=False)