In [31]:
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats
from scipy import stats
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.stattools import omni_normtest

from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.api import qqplot, add_constant

from sklearn.preprocessing import scale, minmax_scale, robust_scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# %matplotlib inline
# plt.set(font="AppleGothic")
# plt.rcParams['axes.unicode_minus']=False

%matplotlib inline
matplotlib.rc('font', family='AppleGothic')
plt.rc('font',  family='AppleGothic')
plt.rcParams['axes.unicode_minus']=False

# 파일 불러오기

In [5]:
df = pd.read_csv('/Users/dasol/Documents/Job/포스코아카데미/강의자료/포스코/빅데이터프로젝트/A2_신용카드/CreDB_chaju.csv', engine='python', encoding='cp949')

# overdue 1개라도 있으면 1 , 전혀 없으면 0으로 계산된 데이터
df_target = pd.read_csv('/Users/dasol/Documents/Job/포스코아카데미/강의자료/포스코/빅데이터프로젝트/A2_신용카드/df_final.csv', engine='python')

In [10]:
df_target.columns

Index(['chaju_id', 'credit_class', 'score_risk', 'gender', 'age', 'agegroup',
       'job', 'edu', 'dwell_type', 'house_type', 'marriage', 'income_home_grp',
       'income_ind_grp', 'expenditure_grp', 'child_cd', 'no_child',
       'flag_priv_biz_cd', 'net_income', 'family_size', 'income_person',
       'card_count', 'loan_amt', 'loan_dur', 'loan_count', 'over_amt',
       'over_dur', 'over_count', 'is_overdue', 'sum_loan_at_time',
       'sum_cashservice_3y', 'sum_cardloan_3y', 'no_cashservice_3y',
       'no_cardloan_3y', 'no_creditcard_over', 'sum_creditcard_over',
       'max_loan_amt', 'max_over_amt', 'income_ind_test', 'expend_ind_test',
       'ind_net_income', 'loan_at_time_div_by_net', 'cardloan_no_diff_com',
       'cashservice_no_diff_com'],
      dtype='object')

In [11]:
# 1. 기존 모델과의 KS통계량 비교시 사용하는 데이터: 기존 모델에서 사용한 변수들과 비슷한 변수들만 사용하는 버전
# data=df_raw[['is_overdue','agegroup','job','edu','dwell_type','house_type','marriage',
#              'income_home_grp','expenditure_grp','card_count','loan_count',
#               'flag_priv_biz_cd', 'loan_amt',
#              'over_count','over_amt']]
# data['loan_amt']=np.log10(data['loan_amt']+1) # 금액의 범위가 너무 다양하므로 log로 변환 + 금액이0일 경우 log 에러 방지를 위해 +1
# data['over_amt']=np.log10(data['over_amt']+1)

# 2. 최종 모델에서 사용한 데이터: 기존 모델 변수들 - (대출금액, 연체금액) + 파생 변수
df_target = df_target[['gender','agegroup','job','edu','dwell_type', 'house_type', 'flag_priv_biz_cd', 'card_count', 'loan_dur','loan_count',
 'over_dur', 'over_count', 'is_overdue', 'loan_at_time_div_by_net', 'cardloan_no_diff_com', 'cashservice_no_diff_com']]

In [14]:
list(df_target.columns)

['gender',
 'agegroup',
 'job',
 'edu',
 'dwell_type',
 'house_type',
 'flag_priv_biz_cd',
 'card_count',
 'loan_dur',
 'loan_count',
 'over_dur',
 'over_count',
 'is_overdue',
 'loan_at_time_div_by_net',
 'cardloan_no_diff_com',
 'cashservice_no_diff_com']

# 모델


In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

import time

### 모델에 사용할 변수만 불러오기

In [16]:
df_raw = df_target

In [18]:
df_raw.columns

Index(['gender', 'agegroup', 'job', 'edu', 'dwell_type', 'house_type',
       'flag_priv_biz_cd', 'card_count', 'loan_dur', 'loan_count', 'over_dur',
       'over_count', 'is_overdue', 'loan_at_time_div_by_net',
       'cardloan_no_diff_com', 'cashservice_no_diff_com'],
      dtype='object')

In [20]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_raw, test_size=0.3, random_state=1234)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print('train data size: {}'.format(train.shape))
print('test data size: {}'.format(test.shape))

train data size: (1294, 16)
test data size: (555, 16)


## Numeric, Category 컬럼 분류

In [23]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('is_overdue')
print(numerical_feats)
print("Number of Numerical features: ", len(numerical_feats))
print('\n')

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print(categorical_feats)
print("Number of Categorical features: ", len(categorical_feats))

['card_count', 'loan_dur', 'loan_count', 'over_dur', 'over_count', 'loan_at_time_div_by_net', 'cardloan_no_diff_com', 'cashservice_no_diff_com']
Number of Numerical features:  8


['gender', 'agegroup', 'job', 'edu', 'dwell_type', 'house_type', 'flag_priv_biz_cd']
Number of Categorical features:  7


# OrdinalEncoder: 범주형을 숫자형으로 변환

In [25]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['is_overdue'])
test[categorical_feats] = encoder.transform(test[categorical_feats])


In [26]:
train

Unnamed: 0,gender,agegroup,job,edu,dwell_type,house_type,flag_priv_biz_cd,card_count,loan_dur,loan_count,over_dur,over_count,is_overdue,loan_at_time_div_by_net,cardloan_no_diff_com,cashservice_no_diff_com
0,1,1,1,1,1,1,1,2.0,4.00,2.0,0.0,0.0,0.0,1.880814,0.0,1.0
1,2,2,2,1,1,1,1,0.0,0.00,0.0,11.0,1.0,0.0,0.000000,0.0,0.0
2,1,3,1,2,1,1,1,1.0,13.00,1.0,0.0,0.0,0.0,4.380229,0.0,0.0
3,2,1,3,1,1,2,1,1.0,7.50,2.0,0.0,0.0,0.0,0.927508,0.0,0.0
4,1,1,1,1,2,1,1,2.0,8.00,2.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1289,2,6,3,1,1,1,2,6.0,13.00,2.0,0.0,0.0,0.0,1.652540,0.0,0.0
1290,1,6,3,1,1,1,2,3.0,2.50,2.0,0.0,0.0,0.0,0.000000,1.0,1.0
1291,1,4,1,1,1,1,1,2.0,13.00,1.0,0.0,0.0,0.0,4.041432,0.0,0.0
1292,2,7,4,7,1,1,1,2.0,10.86,7.0,0.0,0.0,0.0,3.969229,1.0,1.0


# train, test 데이터 생성

In [27]:
target = 'is_overdue'
train_x, train_y = train.drop(target, axis=1), train[target]
test_x, test_y = test.drop(target, axis=1), test[target]
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(1294, 15) (555, 15) (1294,) (555,)


# LigthGBM
# : Accuracy: 94.2 %, f1-score 0.807,Time 0.18s

In [35]:
import lightgbm as lgb
start = time.time() # 시작 시간 지정
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 LightGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees, 트리 생성 개수
            'objective': 'multiclass', # 목적 함수
            'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산
print(classification_report(test_y, lgb_model_predict,digits=3))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 1294, number of used features: 15
[LightGBM] [Info] Start training from score -0.183559
[LightGBM] [Info] Start training from score -1.785596
[LightGBM] [Info] Start training from score -34.538776
Accuracy: 94.23 %
Time: 0.18 seconds
              precision    recall  f1-score   support

         0.0      0.942     0.991     0.966       460
         1.0      0.944     0.705     0.807        95

    accuracy                          0.942       555
   macro avg      0.943     0.848     0.887       555
weighted avg      0.942     0.942     0.939       555



# XGboost
# : Accuracy: 93.3 %, f1-score 0.786,Time 0.06s

In [37]:
# !pip install xgboost
import xgboost as xgb
start = time.time() # 시작 시간 지정
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # 학습 데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data = test_x) # 평가 데이터를 XGBoost 모델에 맞게 변환
# 기본적인 hyperparameter
xgb_param = {'max_depth': 10, # 트리 깊이
         'learning_rate': 0.01, # Step Size
         'n_estimators': 100, # Number of trees, 트리 생성 개수
         'objective': 'multi:softmax', # 목적 함수
        'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # 학습 진행
xgb_model_predict = xgb_model.predict(xgb_dtest) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산
print(classification_report(test_y, xgb_model_predict,digits=3))

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 93.33 %
Time: 0.06 seconds
              precision    recall  f1-score   support

         0.0      0.943     0.978     0.961       460
         1.0      0.872     0.716     0.786        95

    accuracy                          0.933       555
   macro avg      0.908     0.847     0.873       555
weighted avg      0.931     0.933     0.931       555



# AdaBoost
# : Accuracy: 93.51 %, f1-score 0.798,Time 0.10s

In [40]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

start = time.time() # 시작 시간 지정
tree_model = DecisionTreeClassifier(max_depth = 5) # 트리 최대 깊이 5
Adaboost_model = AdaBoostClassifier(base_estimator = tree_model, # 트리모델을 기본으로 추정
                                     n_estimators = 20, # 20회 추정
                                     random_state = 42) # 시드값 고정
AdaBoost_model = Adaboost_model.fit(train_x, train_y) # 학습 진행
AdaBoost_predict = AdaBoost_model.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, AdaBoost_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산
print(classification_report(test_y, AdaBoost_predict,digits=3))

Accuracy: 93.51 %
Time: 0.10 seconds
              precision    recall  f1-score   support

         0.0      0.949     0.974     0.961       460
         1.0      0.855     0.747     0.798        95

    accuracy                          0.935       555
   macro avg      0.902     0.861     0.880       555
weighted avg      0.933     0.935     0.933       555



# RandomForest
# : Accuracy: 94.59 %, f1-score 0.840,Time 0.05s

In [41]:
from sklearn.ensemble import RandomForestClassifier

start = time.time() # 시작 시간 지정
RF_clf = RandomForestClassifier(n_estimators = 20, # 20번 추정
                                             max_depth = 5, # 트리 최대 깊이 5
                                             random_state = 42) # 시드값 고정
RF_model = RF_clf.fit(train_x, train_y) # 학습 진행
RF_predict = RF_model.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, RF_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산
print(classification_report(test_y, RF_predict,digits=3))

Accuracy: 94.59 %
Time: 0.05 seconds
              precision    recall  f1-score   support

         0.0      0.965     0.970     0.967       460
         1.0      0.849     0.832     0.840        95

    accuracy                          0.946       555
   macro avg      0.907     0.901     0.904       555
weighted avg      0.946     0.946     0.946       555



# 📌 모델 평가

###  f1-score

RF(0.840) > LightGBM(0.807) > AdaBoost(0.798)> XGB(0.786)


# 📌 모델 선택

### 기준: 연체 예측 성능이 높은 모델 개발을 목표로 했으므로 f1-score가 높은 모델을 선정

### 결과: RandomForest 모델을 선택
