In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import joblib
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

from hyperopt.pyll.base import scope
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import random

# 1. 문제 정의

# 2. 데이터 수집

## (1) 데이콘 기본 데이터

In [88]:
# Load the competition data. For train/test the redundant 'index' column is
# dropped and every missing value is replaced by the literal string 'NAN'.
train, test = [
    pd.read_csv(csv_path).drop(columns=['index']).fillna('NAN')
    for csv_path in ('data/train.csv', 'data/test.csv')
]
sample_submission = pd.read_csv('data/sample_submission.csv')

# 3. 탐색적 데이터 분석

In [55]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,DAYS_BIRTH_month,DAYS_BIRTH_week,DAYS_EMPLOYED_month,DAYS_EMPLOYED_week,before_EMPLOYED,before_EMPLOYED_month,before_EMPLOYED_week,1new_1,2new_1,10new_1,11new_1,12new_1,13new_1,14new_1,15new_1,new_1,new_2,new_3,new_4,new_6,log1p_income_total,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,NAN,2.0,-6.0,7.0,1.0,0.0,0.0,-9190,6.0,0.0,3.5e-05,5e-06,0.0,0.0,-0.045383,3e-05,0.0,101250.0,0.0,1e-05,-0.068637,-0.023254,0.338801,12.2185,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,7.0,1.0,3.0,0.0,-9840,4.0,1.0,2.8e-05,4e-06,1.2e-05,0.0,-0.039758,1.6e-05,4e-06,82500.0,4e-06,1.2e-05,-0.04598,-0.006222,0.135325,12.41917,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,0.0,2.0,3.0,1.0,-14653,8.0,1.0,0.0,4e-06,7e-06,2e-06,-0.032562,1.8e-05,2e-06,225000.0,0.0,4e-06,-0.042416,-0.009853,0.232305,13.017005,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,10.0,3.0,9.0,2.0,-12996,1.0,0.0,4.9e-05,1.5e-05,4.4e-05,1e-05,-0.064178,5e-06,0.0,101250.0,0.0,1e-05,-0.074509,-0.010331,0.138653,12.2185,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,9.0,0.0,10.0,0.0,-12932,11.0,3.0,5.7e-05,0.0,6.3e-05,0.0,-0.082108,7e-05,1.9e-05,78750.0,0.0,1.3e-05,-0.095473,-0.013365,0.139988,11.967187,2.0


# 4. 변수 조정

In [89]:
def _cycle_remainder(days, unit, period):
    """Return where a (negative) day count falls inside a repeating cycle.

    ``days`` is negated and converted to whole ``unit``-day steps with
    ``np.floor``; the completed ``period``-step cycles (truncated toward zero
    by ``astype(int)``) are then subtracted. With unit=30, period=12 this is a
    month-within-year style remainder; with unit=7, period=4 a week-within-4.
    """
    elapsed = np.floor(-days / unit)
    return elapsed - (elapsed / period).astype(int) * period


# Stack train and test so every derived feature is computed identically for both.
merge_data = pd.concat([train, test], axis = 0)

# Cyclic month / week components of DAYS_BIRTH and DAYS_EMPLOYED.
# NOTE: the order in which columns are created is kept stable because later
# cells remove features by column position.
for day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
    merge_data[day_col + '_month'] = _cycle_remainder(merge_data[day_col], 30, 12)
    merge_data[day_col + '_week'] = _cycle_remainder(merge_data[day_col], 7, 4)

# before_EMPLOYED: DAYS_BIRTH minus DAYS_EMPLOYED, plus its cyclic components.
merge_data['before_EMPLOYED'] = merge_data['DAYS_BIRTH'] - merge_data['DAYS_EMPLOYED']
merge_data['before_EMPLOYED_month'] = _cycle_remainder(merge_data['before_EMPLOYED'], 30, 12)
merge_data['before_EMPLOYED_week'] = _cycle_remainder(merge_data['before_EMPLOYED'], 7, 4)

income = merge_data['income_total']

# Day-derived features divided by total income.
for ratio_col, numerator_col in [
    ('1new_1', 'DAYS_BIRTH_month'),
    ('2new_1', 'DAYS_BIRTH_week'),
    ('10new_1', 'DAYS_EMPLOYED_month'),
    ('11new_1', 'DAYS_EMPLOYED_week'),
    ('12new_1', 'before_EMPLOYED'),
    ('13new_1', 'before_EMPLOYED_month'),
    ('14new_1', 'before_EMPLOYED_week'),
]:
    merge_data[ratio_col] = merge_data[numerator_col] / income

# Total income divided by family size.
merge_data['15new_1'] = income / merge_data['family_size']

# Combined features that were tried and then removed.
#merge_data['3new_1'] = merge_data['DAYS_EMPLOYED_month'] / merge_data['DAYS_BIRTH_month']
#merge_data['4new_1'] = merge_data['DAYS_EMPLOYED_month'] / merge_data['DAYS_BIRTH_week']
#merge_data['5new_1'] = merge_data['DAYS_EMPLOYED_week'] / merge_data['DAYS_BIRTH_month']
#merge_data['6new_1'] = merge_data['DAYS_EMPLOYED_week'] / merge_data['DAYS_BIRTH_week']

#merge_data['7new_1'] =  merge_data['begin_month'] / merge_data['DAYS_BIRTH_month']
#merge_data['8new_1'] =  merge_data['begin_month'] / merge_data['DAYS_EMPLOYED_month']
#merge_data['9new_1'] =  merge_data['begin_month'] / merge_data['before_EMPLOYED_month']

# Raw columns divided by total income.
for ratio_col, numerator_col in [
    ('new_1', 'child_num'),
    ('new_2', 'family_size'),
    ('new_3', 'DAYS_BIRTH'),
    ('new_4', 'DAYS_EMPLOYED'),
]:
    merge_data[ratio_col] = merge_data[numerator_col] / income
#merge_data['new_5'] = merge_data['begin_month'] / merge_data['income_total']
merge_data['new_6'] = merge_data['DAYS_EMPLOYED'] / merge_data['DAYS_BIRTH']

# Reduce the skew of the income distribution.
merge_data['log1p_income_total'] = np.log1p(income)
#merge_data['log_income_total'] = np.log(merge_data['income_total'])
#merge_data['sqrt_income_total'] = np.sqrt(merge_data['income_total'])
#merge_data['boxcox_income_total'] = stats.boxcox(merge_data['income_total'])[0]

# Remaining NaNs become the sentinel -999; rows whose 'credit' is the sentinel
# are the rows that had no label after the concat, i.e. the test rows.
merge_data = merge_data.fillna(-999)
is_test_row = merge_data['credit'] == -999

# Build independent frames instead of slices of merge_data: later cells add
# columns to train/test, which on a slice is chained assignment (the resulting
# SettingWithCopyWarning is hidden by warnings.filterwarnings('ignore')).
test = merge_data[is_test_row].drop(columns='credit')

# Move the target to the last position.
train_cols = [c for c in merge_data.columns if c != 'credit'] + ['credit']
train = merge_data.loc[~is_test_row, train_cols].copy()

In [54]:
# Display the combined train + test frame after feature engineering.
# NOTE(review): this renders the whole frame repr; merge_data.head() would keep the output short.
merge_data

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit,DAYS_BIRTH_month,DAYS_BIRTH_week,DAYS_EMPLOYED_month,DAYS_EMPLOYED_week,before_EMPLOYED,before_EMPLOYED_month,before_EMPLOYED_week,1new_1,2new_1,10new_1,11new_1,12new_1,13new_1,14new_1,15new_1,new_1,new_2,new_3,new_4,new_6,log1p_income_total
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,NAN,2.0,-6.0,1.0,7.0,1.0,0.0,0.0,-9190,6.0,0.0,0.000035,0.000005,0.000000,0.000000,-0.045383,0.000030,0.000000,101250.0,0.000000,0.000010,-0.068637,-0.023254,0.338801,12.218500
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0,7.0,1.0,3.0,0.0,-9840,4.0,1.0,0.000028,0.000004,0.000012,0.000000,-0.039758,0.000016,0.000004,82500.0,0.000004,0.000012,-0.045980,-0.006222,0.135325,12.419170
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0,0.0,2.0,3.0,1.0,-14653,8.0,1.0,0.000000,0.000004,0.000007,0.000002,-0.032562,0.000018,0.000002,225000.0,0.000000,0.000004,-0.042416,-0.009853,0.232305,13.017005
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0,10.0,3.0,9.0,2.0,-12996,1.0,0.0,0.000049,0.000015,0.000044,0.000010,-0.064178,0.000005,0.000000,101250.0,0.000000,0.000010,-0.074509,-0.010331,0.138653,12.218500
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0,9.0,0.0,10.0,0.0,-12932,11.0,3.0,0.000057,0.000000,0.000063,0.000000,-0.082108,0.000070,0.000019,78750.0,0.000000,0.000013,-0.095473,-0.013365,0.139988,11.967187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,F,Y,Y,0,202500.0,Working,Incomplete higher,Married,House / apartment,-18593,-5434,1,1,1,0,Accountants,2.0,-19.0,-999.0,7.0,0.0,1.0,0.0,-13159,6.0,3.0,0.000035,0.000000,0.000005,0.000000,-0.064983,0.000030,0.000015,101250.0,0.000000,0.000010,-0.091817,-0.026835,0.292261,12.218500
9996,M,Y,Y,0,202500.0,Working,Secondary / secondary special,Civil marriage,House / apartment,-10886,-1315,1,1,0,0,Laborers,2.0,-34.0,-999.0,2.0,3.0,7.0,3.0,-9571,7.0,3.0,0.000010,0.000015,0.000035,0.000015,-0.047264,0.000035,0.000015,101250.0,0.000000,0.000010,-0.053758,-0.006494,0.120797,12.218500
9997,F,N,Y,0,292500.0,Working,Secondary / secondary special,Married,House / apartment,-21016,-14018,1,0,0,0,Medicine staff,2.0,-55.0,-999.0,4.0,2.0,11.0,2.0,-6998,5.0,3.0,0.000014,0.000007,0.000038,0.000007,-0.023925,0.000017,0.000010,146250.0,0.000000,0.000007,-0.071850,-0.047925,0.667016,12.586223
9998,F,Y,N,0,180000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-16541,-1085,1,0,1,0,NAN,2.0,-33.0,-999.0,11.0,3.0,0.0,3.0,-15456,11.0,0.0,0.000061,0.000017,0.000000,0.000017,-0.085867,0.000061,0.000000,90000.0,0.000000,0.000011,-0.091894,-0.006028,0.065595,12.100718


### occyp_type = 'NAN' 값들의 income_type

In [90]:
# Keep a snapshot of train as it is before the comparison column is added.
train_copy = train.copy()

# new_occyp_type mirrors occyp_type, except that pensioners whose occupation
# is missing ('NAN') are labelled 'nan_pen'.
pensioner_without_job = train['income_type'].eq('Pensioner') & train['occyp_type'].eq('NAN')
train['new_occyp_type'] = train['occyp_type'].mask(pensioner_without_job, 'nan_pen')

# Show both columns side by side for comparison.
train[['new_occyp_type', 'occyp_type']].head(20)

Unnamed: 0,new_occyp_type,occyp_type
0,NAN,NAN
1,Laborers,Laborers
2,Managers,Managers
3,Sales staff,Sales staff
4,Managers,Managers
5,High skill tech staff,High skill tech staff
6,Core staff,Core staff
7,Drivers,Drivers
8,NAN,NAN
9,Medicine staff,Medicine staff


In [91]:
# Keep a snapshot of test as it is before the comparison column is added.
test_copy = test.copy()

# Same rule as for train: occyp_type is copied, and pensioners with a missing
# occupation ('NAN') get the label 'nan_pen'.
keep_original = ~((test['income_type'] == 'Pensioner') & (test['occyp_type'] == 'NAN'))
test['new_occyp_type'] = test['occyp_type'].where(keep_original, 'nan_pen')

# Show both columns side by side for comparison.
test[['new_occyp_type', 'occyp_type']].head(20)

Unnamed: 0,new_occyp_type,occyp_type
0,nan_pen,NAN
1,Core staff,Core staff
2,Laborers,Laborers
3,Drivers,Drivers
4,Managers,Managers
5,Core staff,Core staff
6,NAN,NAN
7,nan_pen,NAN
8,NAN,NAN
9,Laborers,Laborers


In [68]:
# oc = train[['income_type','occyp_type']].pivot_table(index='income_type',columns='occyp_type',aggfunc='size').fillna(0).T
# oc # income_type에 따른 oocyp_type
# ocratio =oc/(np.ones((oc.shape))*oc.sum(axis=1).values.reshape(-1,1))*100
# # ocratio
# train[train.income_type =='State servant'].credit.value_counts()

income_type,Commercial associate,Pensioner,State servant,Student,Working
occyp_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Accountants,363.0,1.0,69.0,0.0,469.0
Cleaning staff,80.0,0.0,16.0,0.0,307.0
Cooking staff,92.0,0.0,41.0,0.0,324.0
Core staff,609.0,2.0,655.0,4.0,1376.0
Drivers,418.0,1.0,87.0,0.0,1069.0
HR staff,21.0,0.0,3.0,0.0,38.0
High skill tech staff,306.0,0.0,148.0,0.0,586.0
IT staff,15.0,0.0,6.0,0.0,20.0
Laborers,1075.0,3.0,142.0,2.0,3290.0
Low-skill Laborers,33.0,0.0,3.0,0.0,91.0


train = train[train['child_num']<=6].reset_index(drop=True) # 아이의 수가 7명 이상인 데이터 제거

## 인코딩

In [92]:
# Independent working copies: *_oh for one-hot encoding, *_noh for the
# category-dtype encoding.
train_oh, train_noh = train.copy(), train.copy()
test_oh, test_noh = test.copy(), test.copy()

In [93]:
# List the columns of train_noh (the copy of train made in the previous cell).
train_noh.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size', 'begin_month', 'DAYS_BIRTH_month', 'DAYS_BIRTH_week',
       'DAYS_EMPLOYED_month', 'DAYS_EMPLOYED_week', 'before_EMPLOYED',
       'before_EMPLOYED_month', 'before_EMPLOYED_week', '1new_1', '2new_1',
       '10new_1', '11new_1', '12new_1', '13new_1', '14new_1', '15new_1',
       'new_1', 'new_2', 'new_3', 'new_4', 'new_6', 'log1p_income_total',
       'credit', 'new_occyp_type'],
      dtype='object')

In [94]:
# Cast every object-dtype column of train_noh to pandas 'category', and cast
# the column of the same name in test_noh as well.
object_col = []
string_like_cols = [name for name in train_noh.columns if train_noh[name].dtype == 'object']
for name in string_like_cols:
    train_noh[name] = train_noh[name].astype('category')
    test_noh[name] = test_noh[name].astype('category')

In [54]:
# Collect the object-dtype columns that need one-hot encoding.
object_col = [col for col in train_oh.columns if train_oh[col].dtype == 'object']
print(object_col)

# Fit the encoder on the training rows only, then reuse it for test.
enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2; use
# get_feature_names_out when it exists and fall back for older releases.
if hasattr(enc, 'get_feature_names_out'):
    onehot_columns = enc.get_feature_names_out(object_col)
else:
    onehot_columns = enc.get_feature_names(object_col)

# The dummy frames reuse the source frame's index so the column-wise concat
# lines rows up correctly even when that index is not a clean 0..n-1 range.
train_onehot_df = pd.DataFrame(enc.transform(train_oh.loc[:, object_col]).toarray(),
                               columns=onehot_columns, index=train_oh.index)
train_oh = pd.concat([train_oh.drop(columns=object_col), train_onehot_df], axis=1)

test_onehot_df = pd.DataFrame(enc.transform(test_oh.loc[:, object_col]).toarray(),
                              columns=onehot_columns, index=test_oh.index)
test_oh = pd.concat([test_oh.drop(columns=object_col), test_onehot_df], axis=1)

['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'new_occyp_type']


## Feature 하나씩 빼면서 성능 체크

변수 하나씩 제거하면서 성능 체크<br>
제거하여 성능이 좋게 나온 것들은 리스트에 따로 저장해두기

In [95]:
# List the current columns of train.
train.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size', 'begin_month', 'DAYS_BIRTH_month', 'DAYS_BIRTH_week',
       'DAYS_EMPLOYED_month', 'DAYS_EMPLOYED_week', 'before_EMPLOYED',
       'before_EMPLOYED_month', 'before_EMPLOYED_week', '1new_1', '2new_1',
       '10new_1', '11new_1', '12new_1', '13new_1', '14new_1', '15new_1',
       'new_1', 'new_2', 'new_3', 'new_4', 'new_6', 'log1p_income_total',
       'credit', 'new_occyp_type'],
      dtype='object')

In [122]:
# Positional index of 'new_occyp_type' among train_copy's columns.
# NOTE(review): train_copy is snapshotted before 'new_occyp_type' is added to
# train, so on a fresh top-to-bottom run this lookup would raise ValueError;
# the recorded output presumably comes from a different execution order — confirm.
list(train_copy.columns).index('new_occyp_type')

39

In [97]:
# train_cols = list(train.columns); train_cols.remove('credit'); train_cols.append('credit')
# train = train[train_cols]
# list(train.columns).index('new_occyp_type')

39

In [98]:
# Preview the first rows of train.
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,DAYS_BIRTH_month,DAYS_BIRTH_week,DAYS_EMPLOYED_month,DAYS_EMPLOYED_week,before_EMPLOYED,before_EMPLOYED_month,before_EMPLOYED_week,1new_1,2new_1,10new_1,11new_1,12new_1,13new_1,14new_1,15new_1,new_1,new_2,new_3,new_4,new_6,log1p_income_total,new_occyp_type,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,NAN,2.0,-6.0,7.0,1.0,0.0,0.0,-9190,6.0,0.0,3.5e-05,5e-06,0.0,0.0,-0.045383,3e-05,0.0,101250.0,0.0,1e-05,-0.068637,-0.023254,0.338801,12.2185,NAN,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,7.0,1.0,3.0,0.0,-9840,4.0,1.0,2.8e-05,4e-06,1.2e-05,0.0,-0.039758,1.6e-05,4e-06,82500.0,4e-06,1.2e-05,-0.04598,-0.006222,0.135325,12.41917,Laborers,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,0.0,2.0,3.0,1.0,-14653,8.0,1.0,0.0,4e-06,7e-06,2e-06,-0.032562,1.8e-05,2e-06,225000.0,0.0,4e-06,-0.042416,-0.009853,0.232305,13.017005,Managers,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,10.0,3.0,9.0,2.0,-12996,1.0,0.0,4.9e-05,1.5e-05,4.4e-05,1e-05,-0.064178,5e-06,0.0,101250.0,0.0,1e-05,-0.074509,-0.010331,0.138653,12.2185,Sales staff,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,9.0,0.0,10.0,0.0,-12932,11.0,3.0,5.7e-05,0.0,6.3e-05,0.0,-0.082108,7e-05,1.9e-05,78750.0,0.0,1.3e-05,-0.095473,-0.013365,0.139988,11.967187,Managers,2.0


In [99]:
# train_copy = train.copy()
# test_copy = test.copy()

In [123]:
# Rebuild train/test from the snapshots without the experimental
# 'new_occyp_type' column. The column is removed by name rather than by
# position: which column sits at a given position depends on the order the
# cells were executed in, and the test snapshot has one column fewer than the
# train snapshot (no 'credit'). errors='ignore' keeps the cell re-runnable
# when the snapshot does not contain the column at all.
remove_features = ['new_occyp_type']
train = train_copy.drop(columns=remove_features, errors='ignore')
test = test_copy.drop(columns=remove_features, errors='ignore')

# Fresh working copies: *_oh for one-hot encoding, *_noh for category dtype.
train_oh = train.copy()
train_noh = train.copy()
test_oh = test.copy()
test_noh = test.copy()

# Cast object columns to 'category' in both *_noh frames.
object_col = []
for col in train_noh.columns:
    if train_noh[col].dtype == 'object':
        train_noh[col] = train_noh[col].astype('category')
        test_noh[col] = test_noh[col].astype('category')

In [124]:
# Column names of train in positional order (the feature-removal cell below indexes columns by position).
list(train.columns)

['gender',
 'car',
 'reality',
 'child_num',
 'income_total',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'FLAG_MOBIL',
 'work_phone',
 'phone',
 'email',
 'occyp_type',
 'family_size',
 'begin_month',
 'DAYS_BIRTH_month',
 'DAYS_BIRTH_week',
 'DAYS_EMPLOYED_month',
 'DAYS_EMPLOYED_week',
 'before_EMPLOYED',
 'before_EMPLOYED_month',
 'before_EMPLOYED_week',
 '1new_1',
 '2new_1',
 '10new_1',
 '11new_1',
 '12new_1',
 '13new_1',
 '14new_1',
 '15new_1',
 'new_1',
 'new_2',
 'new_3',
 'new_4',
 'new_6',
 'log1p_income_total',
 'credit']

In [125]:
# Separate the target from the category-encoded features; test_x is an
# independent copy of the encoded test frame.
train_y = train_noh['credit']
train_x = train_noh.drop(columns=['credit'])
test_x = test_noh.copy()

In [121]:
# new_occyp_type -> pensioners separated from everyone else

# For each seed: 5-fold stratified CV with a default LightGBM multiclass model,
# collecting out-of-fold probabilities and printing their overall log-loss.
lucky_seed = [0, 10, 258]
for run_no, seed in enumerate(lucky_seed, start=1):
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv = np.zeros((train_x.shape[0], 3))
    for train_idx, val_idx in kfold.split(train_x, train_y):
        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        lgbm = LGBMClassifier(n_estimators=1000, objective='multiclass')
        lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
        # Fill this fold's rows of the out-of-fold probability matrix.
        cv[val_idx] = lgbm.predict_proba(x_val)
    oof_loss = log_loss(train_y, cv)
    print(f'{run_no} multi_logloss: {oof_loss}')

1 multi_logloss: 0.7093443937813859
2 multi_logloss: 0.7122336887787285
3 multi_logloss: 0.7139243037169044


In [126]:
# occyp_type

# For each seed: 10-fold stratified CV with a default LightGBM multiclass model,
# collecting out-of-fold probabilities and printing their overall log-loss.
lucky_seed = [0, 10, 258]
for run_no, seed in enumerate(lucky_seed, start=1):
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    cv = np.zeros((train_x.shape[0], 3))
    for train_idx, val_idx in kfold.split(train_x, train_y):
        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        lgbm = LGBMClassifier(n_estimators=1000, objective='multiclass')
        lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
        # Fill this fold's rows of the out-of-fold probability matrix.
        cv[val_idx] = lgbm.predict_proba(x_val)
    oof_loss = log_loss(train_y, cv)
    print(f'{run_no} multi_logloss: {oof_loss}')

1 multi_logloss: 0.6995398056653737
2 multi_logloss: 0.7027786073180856
3 multi_logloss: 0.7030508980779455


In [None]:
# train_x = train_noh.drop(['credit'], axis=1)
# train_y = train_noh['credit']
# test_x = test_noh.copy()

In [None]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# cv = np.zeros((train_x.shape[0], 3))
# for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
#     x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#     y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
#     lgbm = LGBMClassifier(n_estimators=1000, objective='multiclass')
#     lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#     cv[val_idx] = lgbm.predict_proba(x_val)
# print(f'Initial_multi_logloss: {log_loss(train_y, cv)}')
# remove_features = []
# for i in range(1, 2):
#     for j in combinations(list(range(0, train_x.shape[1])), i):
#         train_new_x = train_x.drop(train_x.columns[list(j)], axis=1)
        
#         kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
#         cv = np.zeros((train_new_x.shape[0], 3))
#         for n, (train_idx, val_idx) in enumerate(kfold.split(train_new_x, train_y)):
#             x_train, x_val = train_new_x.iloc[train_idx], train_new_x.iloc[val_idx]
#             y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
#             lgbm = LGBMClassifier(n_estimators=1000, objective='multiclass')
#             lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#             cv[val_idx] = lgbm.predict_proba(x_val)
#         print(f'{j} multi_logloss: {log_loss(train_y, cv)}')
#         if log_loss(train_y, cv)<0.708:
#             remove_features.append(list(j)[0])

## 저장한 변수 지우는 코드

In [None]:
# Column positions (in train) of the features selected for removal.
remove_features = [1, 3, 4, 8, 11, 32]

# Resolve the positions to names once, against train, and drop by name from
# both frames, so train and test always lose exactly the same features even if
# their column orders differ.
# NOTE(review): with the column order listed earlier in this notebook these
# positions are car, child_num, income_total, house_type, FLAG_MOBIL and
# 15new_1 — confirm. Re-running this cell still shifts the positions; restart
# from the cell that rebuilds train/test before running it again.
removed_columns = list(train.columns[remove_features])
train = train.drop(columns=removed_columns)
test = test.drop(columns=removed_columns)

## 다시 인코딩

In [None]:
# Re-create the working copies after feature removal: *_oh for one-hot
# encoding, *_noh for the category-dtype encoding.
train_oh, train_noh = train.copy(), train.copy()
test_oh, test_noh = test.copy(), test.copy()

In [None]:
# Cast every object-dtype column of train_noh to pandas 'category', and cast
# the column of the same name in test_noh as well.
object_col = []
string_like_cols = [name for name in train_noh.columns if train_noh[name].dtype == 'object']
for name in string_like_cols:
    train_noh[name] = train_noh[name].astype('category')
    test_noh[name] = test_noh[name].astype('category')

In [None]:
# Collect the object-dtype columns that need one-hot encoding.
object_col = [col for col in train_oh.columns if train_oh[col].dtype == 'object']
print(object_col)

# Fit the encoder on the training rows only, then reuse it for test.
enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2; use
# get_feature_names_out when it exists and fall back for older releases.
if hasattr(enc, 'get_feature_names_out'):
    onehot_columns = enc.get_feature_names_out(object_col)
else:
    onehot_columns = enc.get_feature_names(object_col)

# The dummy frames reuse the source frame's index so the column-wise concat
# lines rows up correctly even when that index is not a clean 0..n-1 range.
train_onehot_df = pd.DataFrame(enc.transform(train_oh.loc[:, object_col]).toarray(),
                               columns=onehot_columns, index=train_oh.index)
train_oh = pd.concat([train_oh.drop(columns=object_col), train_onehot_df], axis=1)

test_onehot_df = pd.DataFrame(enc.transform(test_oh.loc[:, object_col]).toarray(),
                              columns=onehot_columns, index=test_oh.index)
test_oh = pd.concat([test_oh.drop(columns=object_col), test_onehot_df], axis=1)

# 5. 모델 학습

In [None]:
# Containers keyed by model name: out-of-fold predictions and test predictions.
pred_dict, pred_test_dict = {}, {}

## (1) Lightgbm

In [None]:
# Separate the target from the category-encoded features; test_x is an
# independent copy of the encoded test frame.
train_y = train_noh['credit']
train_x = train_noh.drop(columns=['credit'])
test_x = test_noh.copy()

### Parameter Tuning

In [None]:
SEED = 42


def score(params):
    """Hyperopt objective: 5-fold stratified CV log-loss of a LightGBM model.

    Args:
        params: keyword arguments passed straight to ``LGBMClassifier``.

    Returns:
        dict with ``'loss'`` (log-loss of the out-of-fold probabilities over
        all of ``train_y``) and ``'status'`` set to ``STATUS_OK``.

    Reads the notebook-level ``train_x`` / ``train_y`` frames.
    """
    print("Training with params: ")
    print(params)

    kfold = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    cv = np.zeros((train_x.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**params)
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=100)

        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')

    # Computed once and kept under a name that does not shadow this function.
    loss = log_loss(train_y, cv)
    print('multi_logloss:', loss)
    return {'loss': loss, 'status': STATUS_OK}


def optimize(random_state=SEED):
    """Run 12 TPE evaluations of ``score`` and return hyperopt's best point.

    Args:
        random_state: seed forwarded to LightGBM through the ``'seed'`` entry
            of the search space.

    Returns:
        The dict returned by ``fmin``.
    """
    # Only learning_rate and colsample_bytree are searched; the other entries
    # are fixed, with the ranges that were tried before kept as comments.
    # The original dict listed 'num_leaves' twice, so the hyperopt expression
    # was silently overwritten by the constant; it is now commented out like
    # the other retired ranges (the effective value, 1000, is unchanged).
    space = {
        'learning_rate': hp.quniform('learning_rate', 0.003, 0.006, 0.001),
        #'learning_rate' : 0.005,
        #'num_leaves': scope.int(hp.quniform('num_leaves', 1000, 1200, 50)),
        'num_leaves' : 1000,
        #'min_child_weight': hp.quniform('min_child_weight', 1, 3, 1),
        'min_child_weight' : 2,
        #'subsample': hp.quniform('subsample', 0.8, 1, 0.05),
        'subsample' : 1,
        'colsample_bytree': hp.quniform('colsample_bytree', 0.3, 0.7, 0.05),
        #'colsample_bytree' : 0.4,
        'max_depth' : -1,
        'n_estimators' : 5000,
        'objective' : 'multiclass',
        'num_class' : 3,
        'seed': random_state,
    }
    # NOTE(review): fmin is called without rstate, so the TPE sampling itself
    # is not seeded; the accepted rstate type depends on the installed
    # hyperopt version — confirm before adding it.
    best = fmin(score, space, algo=tpe.suggest,
                # trials=trials,
                max_evals=12)
    return best


best_hyperparams = optimize()
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

### 3 seeds x 5 folds

In [None]:
lucky_seeds = [42]  # add more seeds here to average over several runs
n_splits = 5        # number of CV folds; the fold averaging below follows this value

for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(learning_rate=0.1, objective='multiclass', n_estimators=10000, num_leaves=1000,
                                  max_depth=-1, min_child_weight=2, colsample_bytree=0.55,
                                  n_jobs=-1, random_state=seed)

        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=100)
        #joblib.dump(lgbmodel, f'./pred_pkl/LGB_{n+1}_fold_{seed}_seed_lgb.pkl')

        # Out-of-fold probabilities for validation rows; the test prediction is
        # the mean over folds (previously a hard-coded "/ 5" that had to be
        # edited by hand whenever the fold count changed).
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        pred_test += lgbmodel.predict_proba(test_x) / n_splits
    pred_dict['lgb'+str(i+1)] = cv
    pred_test_dict['lgb'+str(i+1)] = pred_test

    print('multi_logloss :', log_loss(train_y, cv))

# Optional: average the test predictions of models saved to ./pred_pkl/.
# Saving is currently disabled (joblib.dump above is commented out), so this
# step is skipped when no saved models exist instead of crashing the run.
# NOTE: joblib.load unpickles arbitrary objects; only load files written by
# this notebook.
lgbmodels_path = os.listdir('./pred_pkl/') if os.path.isdir('./pred_pkl/') else []
lgbmodels_list = [x for x in lgbmodels_path if x.endswith("lgb.pkl")]
lgb_preds = np.zeros((test_x.shape[0], 3))

if lgbmodels_list:
    # One model per (seed, fold) pair; the old fixed "== 15" only held for
    # 3 seeds x 5 folds.
    expected_models = len(lucky_seeds) * n_splits
    assert len(lgbmodels_list) == expected_models, (
        f'expected {expected_models} saved models, found {len(lgbmodels_list)}')

    for m in lgbmodels_list:
        lgbmodel = joblib.load('./pred_pkl/'+m)
        # Predict on the category-encoded test_x the models were trained
        # against, not on the raw `test` frame with object columns.
        lgb_preds_proba = lgbmodel.predict_proba(test_x)
        lgb_preds += lgb_preds_proba / expected_models
else:
    print('No saved LightGBM models found in ./pred_pkl/; skipping the reload-and-average step.')

# 결과 제출

In [None]:
# Start from the sample submission and overwrite every column after the first
# with the test-set class probabilities.
# NOTE(review): pred_test holds the fold-averaged prediction of the last seed
# looped over in the training cell only; with several seeds, averaging
# pred_test_dict may be intended — confirm.
submission = sample_submission.copy()
submission.iloc[:, 1:] = pred_test

In [None]:
# Make sure the output folder exists (to_csv does not create missing
# directories), then write the submission file without the index column.
os.makedirs('submission', exist_ok=True)
submission.to_csv('submission/submission.csv', index=False)