In [1]:
import pandas as pd
import joblib
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

### 전처리 작업을 통해 최종적으로 생성된 데이터 불러와서 
### LGBMClassifier 모델에 넣고 성능을 확인하는 작업을 수행.

In [5]:
# 데이터 로드
df = joblib.load(filename='../newdata2/df_corr_fi.joblib')

In [3]:
df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN,POS_CNT_INSTALMENT_LAST
0,100002,1.0,0,1,0,1,0.0,202500.0,406597.5,24700.5,...,1.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,24.000000
1,100003,0.0,0,0,0,0,0.0,270000.0,1293502.5,35698.5,...,0.916667,0.0,0.0,0.0,0.083333,0.0,0.000000,0.000000,0.0,8.333333
2,100004,0.0,1,1,1,1,0.0,67500.0,135000.0,6750.0,...,0.750000,0.0,0.0,0.0,0.250000,0.0,0.000000,0.000000,0.0,3.000000
3,100006,0.0,0,0,0,1,0.0,135000.0,312682.5,29686.5,...,0.805556,0.0,0.0,0.0,0.152778,0.0,0.041667,0.000000,0.0,6.000000
4,100007,0.0,0,1,0,1,0.0,121500.0,513000.0,21865.5,...,0.942735,0.0,0.0,0.0,0.041880,0.0,0.000000,0.015385,0.0,15.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,,0,0,0,1,0.0,121500.0,412560.0,17473.5,...,0.750000,0.0,0.0,0.0,0.250000,0.0,0.000000,0.000000,0.0,3.000000
48740,456222,,0,0,0,0,2.0,157500.0,622413.0,31909.5,...,0.938333,0.0,0.0,0.0,0.040833,0.0,0.000000,0.020833,0.0,17.500000
48741,456223,,0,0,1,1,1.0,202500.0,315000.0,33205.5,...,0.925000,0.0,0.0,0.0,0.075000,0.0,0.000000,0.000000,0.0,7.400000
48742,456224,,0,1,0,0,0.0,225000.0,450000.0,25128.0,...,0.916667,0.0,0.0,0.0,0.041667,0.0,0.000000,0.041667,0.0,17.000000


In [4]:
# TARGET을 기준으로 df을 각각 train, test 데이터프레임으로 나눔
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]

# 교차검증을 수행하기 위해 KFold모델 사용
folds = KFold(n_splits=5, shuffle=True, random_state=55)

# 결과값을 저장하기 위해 array 생성
# 교차검증에서 검증세트의 최적 예측값을 저장하기 위한 변수
oof_preds = np.zeros(train_df.shape[0])

# submit(제출)할 예측값을 저장하기 위한 변수
sub_preds = np.zeros(test_df.shape[0])

In [5]:
# 데이터프레임이 LGBMClassifier 모델에 적용될 수 있도록 데이터프레임의 컬럼이름을 재설정
train_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_df.columns]
test_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test_df.columns]

In [7]:
# 모델 성능 평가시 TARGET, SK_ID_CURR 변수를 제외
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR']]

# 교차검증을 진행하기 위해 KFold모델의 split옵션을 사용하여 train 데이터프레임을 훈련세트와 검증세트로 나눔 
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
    
    # 모델 생성
    clf = LGBMClassifier(
            objective= 'binary',
            n_estimators=10000,
            n_jobs=-1,
            silent= -1,
            verbose= -1,
            random_state=55     
    )
            
    # 모델에 데이터 적용 및 평가
    clf.fit(train_x, train_y, eval_set = [(train_x, train_y), (valid_x, valid_y)],
       eval_metric='auc', verbose=150, early_stopping_rounds=150)
    
    # 교차검증에서 검증세트의 최적 예측값의 index를 저장
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    
    # 제출할 예측값을 저장
    sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
    # 교차검증에서 검증세트의 최적 예측값의 AUC값을 출력
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.777024	training's binary_logloss: 0.244507	valid_1's auc: 0.750682	valid_1's binary_logloss: 0.248782
[400]	training's auc: 0.799387	training's binary_logloss: 0.234954	valid_1's auc: 0.764963	valid_1's binary_logloss: 0.243174
[600]	training's auc: 0.813372	training's binary_logloss: 0.229143	valid_1's auc: 0.772182	valid_1's binary_logloss: 0.240613
[800]	training's auc: 0.824015	training's binary_logloss: 0.224745	valid_1's auc: 0.776158	valid_1's binary_logloss: 0.239223
[1000]	training's auc: 0.83295	training's binary_logloss: 0.221075	valid_1's auc: 0.778354	valid_1's binary_logloss: 0.238469
[1200]	training's auc: 0.840926	training's binary_logloss: 0.217838	valid_1's auc: 0.779733	valid_1's binary_logloss: 0.237993
[1400]	training's auc: 0.84796	training's binary_logloss: 0.214918	valid_1's auc: 0.780613	valid_1's binary_logloss: 0.237691
[1600]	training's auc: 0.85451	training's binary_loglos

[200]	training's auc: 0.774819	training's binary_logloss: 0.244912	valid_1's auc: 0.762798	valid_1's binary_logloss: 0.247173
[400]	training's auc: 0.797316	training's binary_logloss: 0.2354	valid_1's auc: 0.777317	valid_1's binary_logloss: 0.241101
[600]	training's auc: 0.81171	training's binary_logloss: 0.229566	valid_1's auc: 0.784097	valid_1's binary_logloss: 0.238404
[800]	training's auc: 0.822461	training's binary_logloss: 0.225205	valid_1's auc: 0.78701	valid_1's binary_logloss: 0.237182
[1000]	training's auc: 0.831552	training's binary_logloss: 0.221566	valid_1's auc: 0.788658	valid_1's binary_logloss: 0.236505
[1200]	training's auc: 0.839577	training's binary_logloss: 0.218341	valid_1's auc: 0.789766	valid_1's binary_logloss: 0.236065
[1400]	training's auc: 0.846845	training's binary_logloss: 0.215371	valid_1's auc: 0.790357	valid_1's binary_logloss: 0.235845
[1600]	training's auc: 0.853628	training's binary_logloss: 0.212562	valid_1's auc: 0.790717	valid_1's binary_logloss: 0

In [8]:
# 제출용 파일 생성
submit = test_df[['SK_ID_CURR']]
submit['TARGET'] = sub_preds

# csv 파일로 저장
submit.to_csv('lgbm.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
