In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats
from scipy import stats
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.stattools import omni_normtest

from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.api import qqplot, add_constant

from sklearn.preprocessing import scale, minmax_scale, robust_scale
from sklearn.model_selection import train_test_split


# %matplotlib inline
# plt.set(font="AppleGothic")
# plt.rcParams['axes.unicode_minus']=False

%matplotlib inline
matplotlib.rc('font', family='AppleGothic')
plt.rc('font',  family='AppleGothic')
plt.rcParams['axes.unicode_minus']=False

# 파일 불러오기

In [48]:
df = pd.read_csv('CreDB_chaju.csv', engine='python', encoding='cp949')

# overdue 1개라도 있으면 1 , 전혀 없으면 0으로 계산된 데이터
df_target = pd.read_csv('final_target.csv', engine='python', encoding='cp949')

Unnamed: 0,chaju_id,credit_class,score_risk,gender,age,agegroup,job,edu,dwell_type,house_type,marriage,income_home_grp,income_ind_grp,expenditure_grp,child_cd,no_child,flag_priv_biz_cd
0,C-1001,C05,478,남성,30,30대,기능/노무직,대학교 졸업,자가,아파트,미혼,900-999만원,300-399만원,100-199만원,Y,0,N


# Chaju 데이터에 overdue 여부 데이터 merge

In [49]:
df_final = pd.merge(df_target,df, on = 'chaju_id', how = 'inner')
df_final['agegroup'] = df_final['agegroup'].apply(lambda x: str(x[:2]) )

# 필요없는 변수 제거

In [50]:
df_final.drop(['credit_class','child_cd','age','income_ind_grp', 'flag_priv_biz_cd'], axis=1, inplace=True)

# 차주 가족 수입, 차주 개인 지출->구간 평균 점수(int)로 변환

In [51]:

# 정수형
df_final['income_home_grp_head'] = df_final['income_home_grp'].apply(lambda x: int(x.split('-')[0])+50 if '-' in x else x.split('/')[0])
df_final['income_home_grp_head'] = df_final['income_home_grp'].apply(lambda x: int(x.split('-')[0])+50 if '-' in x else x.split('/')[0])
df_final['income_home_grp_head'] = df_final['income_home_grp'].apply(lambda x: int(x.split('-')[0])+50 if '-' in x else x.split('/')[0])
df_final['income_home_grp_head'] = df_final['income_home_grp_head'].apply(lambda x: 1050 if x=='1000만원 이상' else x)
df_final['income_home_grp_head'] = df_final['income_home_grp_head'].apply(lambda x: 50 if x=='99만원 이하' in x else x)

# 정수형
df_final['expenditure_grp_head'] = df_final['expenditure_grp'].apply(lambda x: int(x.split('-')[0])+50 if '-' in x else x.split('/')[0])
df_final['expenditure_grp_head'] = df_final['expenditure_grp'].apply(lambda x: int(x.split('-')[0])+50 if '-' in x else x.split('/')[0])
df_final['expenditure_grp_head'] = df_final['expenditure_grp'].apply(lambda x: int(x.split('-')[0])+50 if '-' in x else x.split('/')[0])
df_final['expenditure_grp_head'] = df_final['expenditure_grp_head'].apply(lambda x: 1050 if x=='1000만원 이상' else x)
df_final['expenditure_grp_head'] = df_final['expenditure_grp_head'].apply(lambda x: 50 if x=='99만원 이하' in x else x)

In [52]:
df_final

Unnamed: 0,chaju_id,loan_yes,overdue_yes,score_risk,gender,agegroup,job,edu,dwell_type,house_type,marriage,income_home_grp,expenditure_grp,no_child,income_home_grp_head,expenditure_grp_head
0,C-1001,1,1,478,남성,30,기능/노무직,대학교 졸업,자가,아파트,미혼,900-999만원,100-199만원,0,950,150
1,C-1002,1,0,484,여성,60,전업주부,초등학교 졸업,자가,아파트,기혼,200-299만원,99만원 이하,3,250,50
2,C-1003,1,0,478,남성,40,판매/서비스직,대학교 졸업,자가,아파트,기혼,500-599만원,100-199만원,2,550,150
3,C-1004,1,0,445,남성,20,무직/기타,대학교 졸업,자가,아파트,미혼,500-599만원,99만원 이하,0,550,50
4,C-1005,1,0,421,남성,50,행정관리/전문직,대학교 졸업,자가,아파트,기혼,500-599만원,200-299만원,2,550,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,C-2993,1,0,398,여성,30,사무직,전문대학 졸업,전세,아파트,기혼,500-599만원,99만원 이하,1,550,50
1324,C-2994,1,1,468,남성,40,기능/노무직,전문대학 졸업,전세,아파트,기혼,300-399만원,100-199만원,2,350,150
1325,C-2995,1,0,528,여성,30,전업주부,전문대학 졸업,자가,아파트,기혼,300-399만원,100-199만원,1,350,150
1326,C-2996,1,0,448,남성,40,사무직,대학교 졸업,자가,단독주택,미혼,300-399만원,100-199만원,0,350,150


# 파생변수 생성: 가족 구성원의 총 수 (family_num)
## 가족 구성원의 총 수=  (미혼=1, 기혼=1, 기타(이혼/별거/사별)=1)+자녀수

# 파생변수 생성: 차주의 순수익 (pure_income)
## pure_income=  (차주의 가족 수입- 개인지출)/가족 구성원의 총 수

In [54]:
df_final['family_num'] = df_final['marriage']

df_final.replace({'family_num': '미혼'}, 1, inplace=True)
df_final.replace({'family_num': '기타(이혼/별거/사별)'}, 1, inplace=True)
df_final.replace({'family_num': '기혼'}, 2, inplace=True)

df_final['family_num'] = df_final['family_num']+df_final['no_child']

In [55]:
df_final['pure_income']=(df_final['income_home_grp_head']-df_final['expenditure_grp_head'])/df_final['family_num']

In [56]:
df_final

Unnamed: 0,chaju_id,loan_yes,overdue_yes,score_risk,gender,agegroup,job,edu,dwell_type,house_type,marriage,income_home_grp,expenditure_grp,no_child,income_home_grp_head,expenditure_grp_head,family_num,pure_income
0,C-1001,1,1,478,남성,30,기능/노무직,대학교 졸업,자가,아파트,미혼,900-999만원,100-199만원,0,950,150,1,800.000000
1,C-1002,1,0,484,여성,60,전업주부,초등학교 졸업,자가,아파트,기혼,200-299만원,99만원 이하,3,250,50,5,40.000000
2,C-1003,1,0,478,남성,40,판매/서비스직,대학교 졸업,자가,아파트,기혼,500-599만원,100-199만원,2,550,150,4,100.000000
3,C-1004,1,0,445,남성,20,무직/기타,대학교 졸업,자가,아파트,미혼,500-599만원,99만원 이하,0,550,50,1,500.000000
4,C-1005,1,0,421,남성,50,행정관리/전문직,대학교 졸업,자가,아파트,기혼,500-599만원,200-299만원,2,550,250,4,75.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,C-2993,1,0,398,여성,30,사무직,전문대학 졸업,전세,아파트,기혼,500-599만원,99만원 이하,1,550,50,3,166.666667
1324,C-2994,1,1,468,남성,40,기능/노무직,전문대학 졸업,전세,아파트,기혼,300-399만원,100-199만원,2,350,150,4,50.000000
1325,C-2995,1,0,528,여성,30,전업주부,전문대학 졸업,자가,아파트,기혼,300-399만원,100-199만원,1,350,150,3,66.666667
1326,C-2996,1,0,448,남성,40,사무직,대학교 졸업,자가,단독주택,미혼,300-399만원,100-199만원,0,350,150,1,200.000000


# 모델에 넣기 전 필요없는 변수 삭제

In [58]:
df_final.drop(['marriage', 'edu', 'dwell_type', 'house_type','loan_yes'], axis=1, inplace=True)

# 모델


In [347]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

### 모델에 사용할 변수만 불러오기

In [348]:
df_raw = df_final[['pure_income', 'overdue_yes','gender','agegroup']]

In [349]:
df_raw

Unnamed: 0,pure_income,overdue_yes,gender,agegroup
0,800.000000,1,남성,30
1,40.000000,0,여성,60
2,100.000000,0,남성,40
3,500.000000,0,남성,20
4,75.000000,0,남성,50
...,...,...,...,...
1323,166.666667,0,여성,30
1324,50.000000,1,남성,40
1325,66.666667,0,여성,30
1326,200.000000,0,남성,40


In [351]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_raw, test_size=0.3, random_state=1234)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print('train data size: {}'.format(df_train.shape))
print('test data size: {}'.format(df_test.shape))

train data size: (929, 6)
test data size: (399, 6)


## Numeric, Category 컬럼 분류

In [352]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('overdue_yes')
print(numerical_feats)
print("Number of Numerical features: ", len(numerical_feats))
print('\n')

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print(categorical_feats)
print("Number of Categorical features: ", len(categorical_feats))

['pure_income']
Number of Numerical features:  1


['gender', 'agegroup']
Number of Categorical features:  2


# OrdinalEncoder: 범주형을 숫자형으로 변환

In [353]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['overdue_yes'])
test[categorical_feats] = encoder.transform(test[categorical_feats])


In [354]:
train

Unnamed: 0,pure_income,overdue_yes,gender,agegroup
0,200.000000,0,1,1
1,125.000000,0,1,2
2,500.000000,0,2,1
3,125.000000,0,1,2
4,500.000000,0,2,1
...,...,...,...,...
924,133.333333,1,1,2
925,100.000000,1,2,2
926,100.000000,0,1,4
927,600.000000,0,2,5


# train, test 데이터 생성

In [359]:
target = 'overdue_yes'
train_x, train_y = train.drop(target, axis=1), train[target]
test_x, test_y = test.drop(target, axis=1), test[target]
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(929, 3) (399, 3) (929,) (399,)


# LigthGBM=>Accuracy: 77.94 %

In [362]:
import lightgbm as lgb
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 LightGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees, 트리 생성 개수
            'objective': 'multiclass', # 목적 함수
            'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # 정확도 % 계산

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39
[LightGBM] [Info] Number of data points in the train set: 929, number of used features: 3
[LightGBM] [Info] Start training from score -0.257639
[LightGBM] [Info] Start training from score -1.482251
[LightGBM] [Info] Start training from score -34.538776
Accuracy: 77.94 %


# XGboost=>Accuracy: 76.94 %

In [366]:

# !pip install xgboost
import xgboost as xgb
import time
start = time.time() # 시작 시간 지정
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # 학습 데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data = test_x) # 평가 데이터를 XGBoost 모델에 맞게 변환
# 기본적인 hyperparameter
xgb_param = {'max_depth': 10, # 트리 깊이
         'learning_rate': 0.01, # Step Size
         'n_estimators': 100, # Number of trees, 트리 생성 개수
         'objective': 'multi:softmax', # 목적 함수
        'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # 학습 진행
xgb_model_predict = xgb_model.predict(xgb_dtest) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 76.94 %
Time: 0.03 seconds


# AdaBoost=>Accuracy: 76.94 %

In [370]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree_model = DecisionTreeClassifier(max_depth = 5) # 트리 최대 깊이 5
Adaboost_model1 = AdaBoostClassifier(base_estimator = tree_model, # 트리모델을 기본으로 추정
                                     n_estimators = 20, # 20회 추정
                                     random_state = 42) # 시드값 고정
model1 = Adaboost_model1.fit(train_x, train_y) # 학습 진행
predict1 = model1.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict1) * 100), "%") # 정확도 % 계산

Accuracy: 76.94 %


# RandomForest=>Accuracy: 77.19 %

In [369]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
random_forest_model1 = RandomForestClassifier(n_estimators = 20, # 20번 추정
                                             max_depth = 5, # 트리 최대 깊이 5
                                             random_state = 42) # 시드값 고정
model1 = random_forest_model1.fit(train_x, train_y) # 학습 진행
predict1 = model1.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict1) * 100), "%") # 정확도 % 계산

Accuracy: 77.19 %
