In [36]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import joblib
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
import random

# 1. 문제 정의

# 2. 데이터 수집

## (1) 데이콘 기본 데이터

In [2]:
train = pd.read_csv('data/train.csv').drop(['index'], axis=1).fillna('NAN')
test = pd.read_csv('data/test.csv').drop(['index'], axis=1).fillna('NAN')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [3]:
train

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,NAN,2.0,-6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,NAN,2.0,-47.0,2.0
26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


# 3. 데이터 전처리

In [4]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'O':
        object_col.append(col)

In [5]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])

train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(), 
                               columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [6]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

# 4. 탐색적 데이터 분석

# 5. 변수 조정

In [81]:
train_x = train.drop(['credit'], axis=1)
train_y = train['credit']
test_x = test.copy()

true = train[['credit']]
true['0'] = true['credit'][true['credit']==0.0]
true['1'] = true['credit'][true['credit']==1.0]
true['2'] = true['credit'][true['credit']==2.0]
del true['credit']
true = true.replace([0.0, 2.0], [1.0, 1.0])
true = true.fillna(0)
true = true.values

# 6. 모델 학습

## (1) Lightgbm

### Parameter Tuning

In [52]:
params = {
    'learning_rate' : 0.001,
    'num_leaves' : 400,
    'n_estimators' : 10000,
    'max_depth': -1,
    'min_child_weight' : 2, # fix
    'colsample_bytree' : 0.4, # fix
    'objective' : 'multiclass',
    'n_jobs': -1
}

### 3 seeds x 4 folds = 12 model

In [78]:
lucky_seeds=[42,2019,91373]
scores = []

for seed in lucky_seeds:

    kfold = KFold(n_splits=4, random_state = seed, shuffle = True)

    # dacon code
    cv=np.zeros((train.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**params, random_state=seed)

        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None) 
        joblib.dump(lgbmodel, f'./pred_pkl/{n}fold_{seed}seed_lgb.pkl')

        # CROSS-VALIDATION , EVALUATE CV
        cv[val_idx,:] = model.predict_proba(x_val)
        
    print('multi logloss :', log_loss(true, cv))

Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.730841
Early stopping, best iteration is:
[78]	valid_0's multi_logloss: 0.727387
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.724274
Early stopping, best iteration is:
[73]	valid_0's multi_logloss: 0.719993
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.716162
Early stopping, best iteration is:
[72]	valid_0's multi_logloss: 0.710215
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.723607
Early stopping, best iteration is:
[78]	valid_0's multi_logloss: 0.719586
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[66]	valid_0's multi_logloss: 0.724935
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 0.71775
Early stopping, best iteration is:
[77]	valid_0's multi_logloss: 

In [80]:
# MODEL LOAD & TEST PREDICT
# 12 MODELS 평균 사용
lgbmodels_path = os.listdir('./pred_pkl/')
lgbmodels_list = [x for x in lgbmodels_path if x.endswith("lgb.pkl")]
assert len(lgbmodels_list) ==12
lgb_preds = np.zeros((test_x.shape[0], 3))

for m in lgbmodels_list:
    lgbmodel = joblib.load('./pred_pkl/'+m)
    lgb_preds_proba = lgbmodel.predict_proba(test)
    lgb_preds += lgb_preds_proba/12
lgb_preds.shape

## (2) XGBoost

In [8]:
params = {
    'learning_rate' : 0.06,
    'n_estimators' : 1000,
    'max_depth': 8,
    'min_child_weight' : 10, 
    'subsample' : 0.8,
    'colsample_bylevel' : 0.8,
    'colsample_bytree' : 0.8,
    'num_class' : 3,
    'objective' : 'multiclass',
    'n_jobs': -1
}

In [101]:
params = {'max_depth': [5, 8, 10] # 튜닝할 파라미터 삽입
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
xgb_clf = XGBClassifier(learning_rate=0.08, n_estimators=1000, min_child_weight=10, 
                        colsample_bytree=0.8, colsample_bylevel=0.8, subsample=0.8,
                        num_class=3, objective='multiclass', n_jobs=-1)

grid_cv = GridSearchCV(xgb_clf, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

KeyboardInterrupt: 

### 3 seeds x 4 folds = 12 models

In [105]:
lucky_seeds=[42,2019,91373]


for seed in lucky_seeds:

    kfold = KFold(n_splits=4, random_state = seed, shuffle = True)

    # dacon code
    cv=np.zeros((train.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        param = {'objective':'multi:softprob', 'seed':seed, 'num_class': 3, 'eval_metric':'mlogloss', 
                 'eta': 0.08, 'max_depth': 5, 'min_child_weight': 2,
                 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8, 'subsample': 0.8,
                 #'tree_model':'gpu_hist',
                }

        xgbmodel = xgb.train(param, dtrain, 2000, watchlist, early_stopping_rounds=30, verbose_eval=None)
        joblib.dump(xgbmodel, f'./pred_pkl/{n}fold_{seed}seed_xgb.pkl')

        cv[val_idx, :] = xgbmodel.predict(dvalid)
    scores.append(log_loss(true, cv))
    print('multi_logloss:', log_loss(true, cv))

KeyboardInterrupt: 

In [None]:
# MODEL LOAD & TEST PREDICT
# 12 MODELS 평균 사용
xgbmodels_path = os.listdir('./pred_pkl/')
xgbmodels_list = [x for x in xgbmodels_path if x.endswith("xgb.pkl")]
assert len(xgbmodels_list) ==12
xgb_preds = np.zeros((test_x.shape[0], 3))

for m in xgbmodels_list:
    rfmodel = joblib.load('./pred_pkl/'+m)
    xgb_preds_proba = xgbmodel.predict_proba(test_x)
    xgb_preds += xgb_preds_proba/12
xgb_preds.shape

## (3) Random Forest

### GridSearchCV

In [76]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [55, 60, 65] # 튜닝할 파라미터 삽입
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = 0, n_estimators = 1000, 
                                min_samples_leaf=2, min_samples_split=2,
                                criterion='entropy', n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, n_jobs = -1)
grid_cv.fit(df_train, y)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

KeyboardInterrupt: 

### 3 seeds x 4 folds = 12 models

In [79]:
scores = []
for seed in lucky_seeds:

    kfold = KFold(n_splits=5, random_state = seed, shuffle = True)

    cv = np.zeros((train.shape[0], 3))
    
    for n, (train_idx, val_idx) in enumerate(kfold.split(train)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        rfmodel = RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=55,
                                         min_samples_leaf=2, min_samples_split=2,
                                         random_state=seed)
        rfmodel.fit(x_train, y_train)
        
        cv[val_idx, :] = rfmodel.predict_proba(x_val)
        joblib.dump(rfmodel, f'./pred_pkl/{n}fold_{seed}seed_rf.pkl')
        
    scores.append(log_loss(true, cv))
    print('multi logloss :', log_loss(true, cv))

0.707421279275772
0.7091183932019504
0.7043251004785559


In [None]:
# MODEL LOAD & TEST PREDICT
# 12 MODELS 평균 사용
rfmodels_path = os.listdir('./pred_pkl/')
rfmodels_list = [x for x in rfmodels_path if x.endswith("rf.pkl")]
assert len(rfmodels_list) ==12
rf_preds = np.zeros((test_x.shape[0], 3))

for m in rfmodels_list:
    rfmodel = joblib.load('./pred_pkl/'+m)
    rf_preds_proba = rfmodel.predict_proba(test_x)
    rf_preds += rf_preds_proba/12
rf_preds.shape

## (X) Ensemble

# 결과 제출

In [84]:
submission = sample_submission.copy()

In [89]:
submission.to_csv('submission/seed.csv', index=False)