In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import joblib
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import random

# 1. 문제 정의

# 2. 데이터 수집

## (1) 데이콘 기본 데이터

In [15]:
# Load the competition tables. The 'index' column is dropped from both feature tables and
# every missing value is replaced by the literal string 'NAN'.
sample_submission = pd.read_csv('data/sample_submission.csv')

train = pd.read_csv('data/train.csv')
train = train.drop(columns=['index']).fillna('NAN')

test = pd.read_csv('data/test.csv')
test = test.drop(columns=['index']).fillna('NAN')

# 4. 탐색적 데이터 분석

# 5. 변수 조정

## (1) 이상치 제거

In [16]:
# Names of the object-dtype columns of `train`, in column order.
object_col = [col for col in train.columns if train[col].dtype == 'O']

In [17]:
# One-hot encode the object-dtype columns. The encoder is fitted on `train` only and then
# applied to both tables so they end up with the same encoded columns.
# handle_unknown='ignore' makes a category that appears only in `test` encode as all zeros
# instead of raising inside transform().
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train.loc[:, object_col])

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever one this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    onehot_columns = enc.get_feature_names_out(object_col)
else:
    onehot_columns = enc.get_feature_names(object_col)

# Give the encoded frames the index of their source table so the column-wise concat lines
# rows up by construction.
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=onehot_columns, index=train.index)
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=onehot_columns, index=test.index)
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

In [18]:
def _cycle_position(days, unit, period):
    """Return floor(-days / unit) minus `period` times the truncated number of whole periods.

    days:   Series of day offsets; it is negated before use.
    unit:   days per unit (30 and 7 at the call sites below).
    period: units per cycle (12 and 4 at the call sites below).
    """
    units_elapsed = np.floor((-days) / unit)
    # astype(int) truncates toward zero, so a negative unit count is not wrapped the way a
    # true modulo would wrap it; this keeps the original inline formula unchanged.
    whole_cycles = (units_elapsed / period).astype(int)
    return units_elapsed - whole_cycles * period


def _add_day_features(frame):
    """Add the month/week columns for DAYS_BIRTH, DAYS_EMPLOYED and before_EMPLOYED in place."""
    for source in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
        frame[source + '_month'] = _cycle_position(frame[source], 30, 12)
        frame[source + '_week'] = _cycle_position(frame[source], 7, 4)

    # before_EMPLOYED has to exist before its own month/week columns can be derived.
    frame['before_EMPLOYED'] = frame['DAYS_BIRTH'] - frame['DAYS_EMPLOYED']
    frame['before_EMPLOYED_month'] = _cycle_position(frame['before_EMPLOYED'], 30, 12)
    frame['before_EMPLOYED_week'] = _cycle_position(frame['before_EMPLOYED'], 7, 4)


# The same derivation runs on both tables, so train and test cannot drift apart.
_add_day_features(train)
_add_day_features(test)

In [19]:
# Remove rows with 7 or more children (keep child_num <= 6), then renumber the index.
keep_rows = train['child_num'].le(6)
train = train.loc[keep_rows].reset_index(drop=True)

In [20]:
# Skew handling for total income: log, square-root and Box-Cox versions of income_total.
# stats.boxcox requires strictly positive input.
train['log_income_total'] = np.log(train['income_total'])
train['sqrt_income_total'] = np.sqrt(train['income_total'])

# Estimate the Box-Cox lambda on train only and reuse it for test. Fitting a second lambda
# on test would map the same income to different feature values in the two tables.
train_boxcox_income, income_boxcox_lambda = stats.boxcox(train['income_total'])
train['boxcox_income_total'] = train_boxcox_income

test['log_income_total'] = np.log(test['income_total'])
test['sqrt_income_total'] = np.sqrt(test['income_total'])
# With lmbda given, stats.boxcox returns only the transformed array.
test['boxcox_income_total'] = stats.boxcox(test['income_total'], lmbda=income_boxcox_lambda)

In [8]:
# Separate the 'credit' target from the training features; test_x is an independent copy of test.
train_y = train['credit']
train_x = train.drop(columns=['credit'])
test_x = test.copy()

In [9]:
# One-hot encode the credit label into a float array with one column per class (0, 1, 2):
# a cell is 1.0 where the row's credit equals that class and 0.0 everywhere else.
credit_values = train['credit'].values
true = np.column_stack(
    [credit_values == label for label in (0.0, 1.0, 2.0)]
).astype(float)

# 6. 모델 학습

In [10]:
# Prediction stores filled by the model loops, keyed by a model tag such as 'xgb1':
#   pred_dict      -> out-of-fold predictions on the training rows
#   pred_test_dict -> fold-averaged predictions on the test rows
pred_dict, pred_test_dict = dict(), dict()

## (1) Lightgbm

### Parameter Tuning

### 3 seeds x 5 folds

In [36]:
# Baseline LightGBM check: out-of-fold multi-class log loss over 3 seeds x 5 shuffled folds.
lucky_seeds = [42, 2019, 91373]

for i, seed in enumerate(lucky_seeds):

    # Out-of-fold class probabilities for this seed, three columns per row.
    cv = np.zeros((train.shape[0], 3))
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    #pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

        lgb_params = dict(learning_rate=0.01, objective='multiclass', n_estimators=1000,
                          n_jobs=-1, random_state=seed)
        lgbmodel = LGBMClassifier(**lgb_params)

        # The held-out fold is also the early-stopping evaluation set.
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)],
                     early_stopping_rounds=30, verbose=None)
        #joblib.dump(lgbmodel, f'./pred_pkl/LGB_{n+1}_fold_{seed}_seed_lgb.pkl')

        # Write this fold's validation probabilities into their own rows of the OOF matrix.
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        #pred_test += lgbmodel.predict_proba(test_x) / 5
    #pred_dict['lgb'+str(i+1)] = cv
    #pred_test_dict['lgb'+str(i+1)] = pred_test

    print('multi_logloss :', log_loss(true, cv))

Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.798839
[200]	valid_0's multi_logloss: 0.778206
[300]	valid_0's multi_logloss: 0.76643
[400]	valid_0's multi_logloss: 0.757688
[500]	valid_0's multi_logloss: 0.750063
[600]	valid_0's multi_logloss: 0.743378
[700]	valid_0's multi_logloss: 0.737894
[800]	valid_0's multi_logloss: 0.733222
[900]	valid_0's multi_logloss: 0.728691
[1000]	valid_0's multi_logloss: 0.725023
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.725023
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.798262
[200]	valid_0's multi_logloss: 0.77794
[300]	valid_0's multi_logloss: 0.767207
[400]	valid_0's multi_logloss: 0.758899
[500]	valid_0's multi_logloss: 0.753058
[600]	valid_0's multi_logloss: 0.747388
[700]	valid_0's multi_logloss: 0.742806
[800]	valid_0's multi_logloss: 0.739606
[900]	valid_0's multi_logloss: 0.735866
[1000]	valid_0's multi_logloss

KeyboardInterrupt: 

# 여기까지

In [11]:
# LightGBM tuning run with num_leaves=1000: out-of-fold log loss for 3 seeds x 5 folds.
lucky_seeds = [42, 2019, 91373]

for i, seed in enumerate(lucky_seeds):

    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    # One row per training sample, three probability columns.
    cv = np.zeros((len(train), 3))
    #pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train = train_x.iloc[train_idx]
        y_train = train_y.iloc[train_idx]
        x_val = train_x.iloc[val_idx]
        y_val = train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(
            objective='multiclass',
            learning_rate=0.005,
            n_estimators=10000,
            num_leaves=1000,
            max_depth=-1,
            min_child_weight=2,
            colsample_bytree=0.4,
            n_jobs=-1,
            random_state=seed,
        )

        # Early stopping is evaluated on the held-out fold.
        validation_sets = [(x_val, y_val)]
        lgbmodel.fit(x_train, y_train, eval_set=validation_sets,
                     early_stopping_rounds=30, verbose=None)
        #joblib.dump(lgbmodel, f'./pred_pkl/LGB_{n+1}_fold_{seed}_seed_lgb.pkl')

        # Out-of-fold probabilities for the rows of this validation fold.
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        #pred_test += lgbmodel.predict_proba(test_x) / 5
    #pred_dict['lgb'+str(i+1)] = cv
    #pred_test_dict['lgb'+str(i+1)] = pred_test

    print('multi_logloss :', log_loss(true, cv))

multi_logloss : 0.7015993077380257
multi_logloss : 0.7008750060407489
multi_logloss : 0.7009575587001414


In [None]:
# LightGBM tuning run with num_leaves=800: out-of-fold log loss for 3 seeds x 5 folds.
lucky_seeds = [42, 2019, 91373]

for i, seed in enumerate(lucky_seeds):

    # Settings for this seed; only random_state changes between seeds.
    lgb_params = {
        'learning_rate': 0.005,
        'objective': 'multiclass',
        'n_estimators': 10000,
        'num_leaves': 800,
        'max_depth': -1,
        'min_child_weight': 2,
        'colsample_bytree': 0.4,
        'n_jobs': -1,
        'random_state': seed,
    }

    kfold = KFold(n_splits=5, random_state=seed, shuffle=True)
    cv = np.zeros((train.shape[0], 3))
    #pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**lgb_params)
        # The validation fold drives early stopping.
        lgbmodel.fit(
            x_train, y_train,
            eval_set=[(x_val, y_val)],
            early_stopping_rounds=30,
            verbose=None,
        )
        #joblib.dump(lgbmodel, f'./pred_pkl/LGB_{n+1}_fold_{seed}_seed_lgb.pkl')

        # Keep the validation-fold probabilities as the out-of-fold prediction.
        fold_proba = lgbmodel.predict_proba(x_val)
        cv[val_idx, :] = fold_proba
        #pred_test += lgbmodel.predict_proba(test_x) / 5
    #pred_dict['lgb'+str(i+1)] = cv
    #pred_test_dict['lgb'+str(i+1)] = pred_test

    print('multi_logloss :', log_loss(true, cv))

In [12]:
# LightGBM tuning run with num_leaves=600: out-of-fold log loss for 3 seeds x 5 folds.
lucky_seeds = [42, 2019, 91373]

for i, seed in enumerate(lucky_seeds):

    cv = np.zeros((train.shape[0], 3))  # out-of-fold probabilities for this seed
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    #pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(learning_rate=0.005, n_estimators=10000,
                                  objective='multiclass',
                                  num_leaves=600, max_depth=-1,
                                  min_child_weight=2, colsample_bytree=0.4,
                                  random_state=seed, n_jobs=-1)

        # Fit with the held-out fold as the early-stopping evaluation set.
        fit_options = dict(eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
        lgbmodel.fit(x_train, y_train, **fit_options)
        #joblib.dump(lgbmodel, f'./pred_pkl/LGB_{n+1}_fold_{seed}_seed_lgb.pkl')

        # Record the validation probabilities at the rows they belong to.
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        #pred_test += lgbmodel.predict_proba(test_x) / 5
    #pred_dict['lgb'+str(i+1)] = cv
    #pred_test_dict['lgb'+str(i+1)] = pred_test

    print('multi_logloss :', log_loss(true, cv))

KeyboardInterrupt: 

# Average the test-set class probabilities of the pickled LightGBM models in ./pred_pkl/.
lgbmodels_path = os.listdir('./pred_pkl/')
lgbmodels_list = [x for x in lgbmodels_path if x.endswith("lgb.pkl")]
# 3 seeds x 5 folds are expected on disk.
assert len(lgbmodels_list) == 15
lgb_preds = np.zeros((test_x.shape[0], 3))

for m in lgbmodels_list:
    # NOTE: joblib.load unpickles arbitrary objects; only load files this project wrote.
    lgbmodel = joblib.load('./pred_pkl/'+m)
    # Predict on test_x, the same matrix that sizes lgb_preds and that the Random Forest
    # loader uses, rather than on `test`.
    lgb_preds_proba = lgbmodel.predict_proba(test_x)
    lgb_preds += lgb_preds_proba / len(lgbmodels_list)

## (2) XGBoost

### Parameter Tuning

# Grid of XGBoost hyper-parameters to tune.
params = {'learning_rate': [0.01, 0.005],
          'max_depth': [30, 35, 40]}

# 'multiclass' is LightGBM's objective name; XGBoost's multi-class probability objective is
# 'multi:softprob', the same value the xgb.train cell of this notebook passes.
xgb_clf = XGBClassifier(n_estimators=100, min_child_weight=2,
                        colsample_bytree=0.8, colsample_bylevel=0.8, subsample=0.8,
                        num_class=3, objective='multi:softprob', n_jobs=-1)

# 5-fold grid search; no `scoring` is passed, so the estimator's default score is used.
grid_cv = GridSearchCV(xgb_clf, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

# Show the mean CV score next to each parameter combination.
hr_grid_df = pd.DataFrame(grid_cv.cv_results_)
hr_grid_df.loc[:, ['mean_test_score', "params"]]

### 3 seeds x 5 folds

In [None]:
# XGBoost (native API): 3 seeds x 5 shuffled folds. Each seed stores its out-of-fold
# predictions in pred_dict and its fold-averaged test predictions in pred_test_dict.
lucky_seeds = [42, 2019, 91373]
xgtest = xgb.DMatrix(test_x)

for i, seed in enumerate(lucky_seeds):

    cv = np.zeros((train.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        param = {
            'objective': 'multi:softprob',
            'num_class': 3,
            'eval_metric': 'mlogloss',
            'seed': seed,
            'eta': 0.004,
            'max_depth': 70,
            'min_child_weight': 3,
            'colsample_bytree': 0.3,
            'colsample_bylevel': 0.6,
            'subsample': 0.8,
        }

        # Up to 10000 boosting rounds, with early stopping after 30 rounds without improvement.
        xgbmodel = xgb.train(param, dtrain, num_boost_round=10000, evals=watchlist,
                             early_stopping_rounds=30, verbose_eval=None)
        #joblib.dump(xgbmodel, f'./pred_pkl/XGB_{n+1}_fold_{seed}_seed_xgb.pkl')

        # Out-of-fold probabilities for this fold, plus this fold's share of the test average.
        cv[val_idx, :] = xgbmodel.predict(dvalid)
        pred_test += xgbmodel.predict(xgtest) / 5

    model_tag = 'xgb' + str(i + 1)
    pred_dict[model_tag] = cv
    pred_test_dict[model_tag] = pred_test
    print('multi_logloss:', log_loss(true, cv))

# Average the test-set class probabilities of the pickled XGBoost models in ./pred_pkl/.
xgbmodels_path = os.listdir('./pred_pkl/')
xgbmodels_list = [x for x in xgbmodels_path if x.endswith("xgb.pkl")]
# 3 seeds x 5 folds are expected on disk.
assert len(xgbmodels_list) == 15
xgb_preds = np.zeros((test_x.shape[0], 3))
# The test matrix is named test_x in this notebook; `test_X` is not defined anywhere.
xgtest = xgb.DMatrix(test_x)

for m in xgbmodels_list:
    # NOTE: joblib.load unpickles arbitrary objects; only load files this project wrote.
    xgbmodel = joblib.load('./pred_pkl/'+m)
    # Assumes the pickles hold the Booster objects returned by xgb.train in the CV cell
    # (see its commented-out joblib.dump). A Booster has no predict_proba; with the
    # multi:softprob objective, predict() returns the per-class probabilities.
    xgb_preds_proba = xgbmodel.predict(xgtest)
    xgb_preds += xgb_preds_proba / len(xgbmodels_list)

## (3) Random Forest

### GridSearchCV

# Grid of Random Forest hyper-parameters to tune.
params = {'max_depth': [55, 60, 65]}

rf_clf = RandomForestClassifier(random_state = 0, n_estimators = 1000,
                                min_samples_leaf=2, min_samples_split=2,
                                criterion='entropy', n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, n_jobs = -1)
# Fit on train_x / train_y, the feature matrix and target this notebook defines and the
# XGBoost grid search also uses; `df_train` and `y` are not defined anywhere in it.
grid_cv.fit(train_x, train_y)

# Printed labels: "best hyper-parameters" and "best prediction accuracy".
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

### 3 seeds, 5 folds

In [None]:
# Random Forest: 3 seeds x 5 shuffled folds. Each seed stores its out-of-fold predictions
# in pred_dict and its fold-averaged test predictions in pred_test_dict.
lucky_seeds = [42, 2019, 91373]

for i, seed in enumerate(lucky_seeds):

    cv = np.zeros((train.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

        rfmodel = RandomForestClassifier(
            n_estimators=1000,
            criterion='entropy',
            max_depth=55,
            min_samples_leaf=2,
            min_samples_split=2,
            random_state=seed,
        )
        rfmodel.fit(x_train, y_train)
        #joblib.dump(rfmodel, f'./pred_pkl/RF_{n+1}_fold_{seed}_seed_rf.pkl')

        # Out-of-fold probabilities for this fold, plus this fold's share of the test average.
        cv[val_idx, :] = rfmodel.predict_proba(x_val)
        pred_test += rfmodel.predict_proba(test_x) / 5

    model_tag = 'rf' + str(i + 1)
    pred_dict[model_tag] = cv
    pred_test_dict[model_tag] = pred_test
    print('multi_logloss :', log_loss(true, cv))

# Average the test-set class probabilities of the 15 pickled Random Forest models.
model_dir = './pred_pkl/'
rfmodels_path = os.listdir(model_dir)
rfmodels_list = [fname for fname in rfmodels_path if fname.endswith("rf.pkl")]
assert len(rfmodels_list) == 15
rf_preds = np.zeros((test_x.shape[0], 3))

for m in rfmodels_list:
    rfmodel = joblib.load(model_dir + m)
    # Each model contributes one fifteenth of the final average.
    rf_preds_proba = rfmodel.predict_proba(test_x)
    rf_preds += rf_preds_proba / 15

## (4) Catboost (성능X)

# CatBoost: 3 seeds x 5 shuffled folds. Each seed stores its out-of-fold predictions in
# pred_dict and its fold-averaged test predictions in pred_test_dict.
lucky_seeds = [42, 2019, 91373]

for i, seed in enumerate(lucky_seeds):

    cv = np.zeros((train.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        _train = Pool(x_train, label=y_train)
        _valid = Pool(x_val, label=y_val)

        catmodel = CatBoostClassifier(
            loss_function='MultiClass',
            learning_rate=0.02,
            iterations=100000,
            early_stopping_rounds=50,
            random_state=seed,
            #task_type="GPU"
        )

        # The held-out fold is the evaluation set; use_best_model=True is requested.
        catmodel.fit(_train, eval_set=_valid, use_best_model=True, verbose=2000)

        # Out-of-fold probabilities for this fold, plus this fold's share of the test average.
        cv[val_idx, :] = catmodel.predict_proba(x_val)
        pred_test += catmodel.predict_proba(test_x) / 5

    model_tag = 'cat' + str(i + 1)
    pred_dict[model_tag] = cv
    pred_test_dict[model_tag] = pred_test
    print('multi_logloss :', log_loss(true, cv))

## (5) Stacking (AutoLGB)

### 27 features = 3 seeds (42, 2019, 91373) x 3 models (lgb, xgb, rf) x 3 classes (0, 1, 2)

In [None]:
# Level-2 (stacking) features: the stored prediction matrices of every base model, placed
# side by side. Both dicts are filled in the same order, so the columns line up.
X_train = pd.DataFrame(np.hstack(list(pred_dict.values())))
X_test = pd.DataFrame(np.hstack(list(pred_test_dict.values())))

pred = np.zeros((X_train.shape[0], 3), dtype=float)
pred_test = np.zeros((X_test.shape[0], 3), dtype=float)
#kfold = KFold(n_splits=5, random_state = seed, shuffle = True)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i_cv, (i_trn, i_val) in enumerate(cv.split(X_train, train_y)):
    # The splitter yields positional indices, so the target is sliced with .iloc like the
    # features are; plain train_y[i_trn] would be a label lookup on the Series index.
    y_trn = train_y.iloc[i_trn]

    if i_cv == 0:
        # First fold only: tune AutoLGB, then reuse its iteration count, feature list and
        # parameters for the remaining folds.
        clf = AutoLGB(objective='multiclass', metric='multi_logloss', params={'num_class': 3},
                      feature_selection=False, n_est=10000)
        clf.tune(X_train.iloc[i_trn], y_trn)
        n_best = clf.n_best
        features = clf.features
        params = clf.params
        print(f'best iteration: {n_best}')
        print(f'selected features ({len(features)}): {features}')
        print(params)
        clf.fit(X_train.iloc[i_trn], y_trn)
    else:
        # Later folds: plain LightGBM training with the settings found on the first fold.
        train_data = lgb.Dataset(X_train[features].iloc[i_trn], label=y_trn)
        clf = lgb.train(params, train_data, n_best, verbose_eval=100)

    # Out-of-fold predictions for this fold, plus this fold's share of the test average.
    pred[i_val] = clf.predict(X_train[features].iloc[i_val])
    pred_test += clf.predict(X_test[features]) / 5

In [None]:
# Log loss of the stacked out-of-fold predictions against the training target.
stack_cv_logloss = log_loss(train_y, pred)
print(f'CV Log Loss: {stack_cv_logloss:.6f}')

# 결과 제출

In [None]:
# Start from a copy of the sample submission and overwrite every column after the first
# with the stacked test predictions.
submission = sample_submission.copy()
submission.iloc[:, 1:] = pred_test

In [None]:
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs('submission', exist_ok=True)
submission.to_csv('submission/submission.csv', index=False)