# 1. credit_data
- 프로젝트 목적 : index와 Label값을 제외한 18개의 컬럼을 바탕으로 **Label_신용도** 0, 1, 2(낮을수록 좋은 신용도)예측할 수 있는 모델 구축

In [3]:
#데이터 분석 및 머신러닝 구축을 위한 모듈
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from factor_analyzer import FactorAnalyzer
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## 데이터 불러오기 및 전처리

In [11]:
url = "https://raw.githubusercontent.com/DataResolvere/Project/main/first_project/train.csv"

raw_df = pd.read_csv(url)
raw_df

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26454,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


In [19]:
# binary 형태의 데이터를 int형태로 변환

raw_df = raw_df.iloc[:, 1:]
raw_df["gender"] = raw_df["gender"].apply(lambda x: 0 if x == "F" else 1)
raw_df["car"] = raw_df["car"].apply(lambda x: 0 if x == "N" else 1)
raw_df["reality"] = raw_df["reality"].apply(lambda x: 0 if x == "N" else 1)

In [12]:
# DAYS_EMPLOYED 컬럼의 양수값은 고용된 일수가 없는 데이터이기 때문에 0 초과는 0으로 통일

raw_df.loc[raw_df['DAYS_EMPLOYED'] > 0, 'DAYS_EMPLOYED'] = 0

In [13]:
# 연속형 자료들의 경우 음수를 기준으로 일수가 카운터 되기 때문에 절대값으로 바꿔 양수로 표시

raw_df["DAYS_EMPLOYED"] = raw_df["DAYS_EMPLOYED"].abs()
raw_df["DAYS_BIRTH"] = raw_df["DAYS_BIRTH"].abs()
raw_df["begin_month"] = raw_df["begin_month"].abs()

In [20]:
raw_df

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,1,0,0,0,,2.0,6.0,1.0
1,0,0,1,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,1,0,0,1,Laborers,3.0,5.0,1.0
2,1,1,1,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,1,0,1,0,Managers,2.0,22.0,2.0
3,0,0,1,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,1,0,1,0,Sales staff,2.0,37.0,0.0
4,0,1,1,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,1,0,0,0,Managers,2.0,26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,12079,1984,1,0,0,0,Core staff,4.0,2.0,1.0
26453,0,0,1,1,180000.0,Working,Higher education,Separated,House / apartment,15291,2475,1,0,0,0,,2.0,47.0,2.0
26454,0,1,0,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,10082,2015,1,0,0,0,Core staff,2.0,25.0,2.0
26455,1,0,1,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,10145,107,1,0,0,0,Laborers,1.0,59.0,2.0


In [22]:
# 범주형 자료의 value_count 확인을 통해 이상치 값 확인

for name in raw_df.columns:
    print(name)
    print(raw_df[name].value_counts())
    print("=" * 50)

gender
0    17697
1     8760
Name: gender, dtype: int64
car
0    16410
1    10047
Name: car, dtype: int64
reality
1    17830
0     8627
Name: reality, dtype: int64
child_num
0     18340
1      5386
2      2362
3       306
4        47
5        10
14        3
7         2
19        1
Name: child_num, dtype: int64
income_total
135000.0    3164
157500.0    2233
180000.0    2225
112500.0    2178
225000.0    2170
            ... 
57150.0        1
51750.0        1
87448.5        1
227250.0       1
191700.0       1
Name: income_total, Length: 249, dtype: int64
income_type
Working                 13645
Commercial associate     6202
Pensioner                4449
State servant            2154
Student                     7
Name: income_type, dtype: int64
edu_type
Secondary / secondary special    17995
Higher education                  7162
Incomplete higher                 1020
Lower secondary                    257
Academic degree                     23
Name: edu_type, dtype: int64
family_type
Mar

In [23]:
# income_type과 child_num 데이터에서 이상치 값을 제거

drop_index = raw_df[raw_df["income_type"] == "Student"].index
raw_df.drop(index=drop_index, axis=0, inplace=True)

drop_index =  raw_df[raw_df["child_num"] >= 5].index
raw_df.drop(index=drop_index, axis=0, inplace=True)

In [25]:
# FLAG_MOBIL 컬럼의 경우 하나의 고유값만 가지고 있기에 데이터로서 유의미한 차이를 만들지 못해서 drop

raw_df.drop(["FLAG_MOBIL"], axis=1, inplace=True)

In [26]:
raw_df

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,6.0,1.0
1,0,0,1,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5.0,1.0
2,1,1,1,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22.0,2.0
3,0,0,1,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37.0,0.0
4,0,1,1,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,12079,1984,0,0,0,Core staff,4.0,2.0,1.0
26453,0,0,1,1,180000.0,Working,Higher education,Separated,House / apartment,15291,2475,0,0,0,,2.0,47.0,2.0
26454,0,1,0,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,10082,2015,0,0,0,Core staff,2.0,25.0,2.0
26455,1,0,1,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,10145,107,0,0,0,Laborers,1.0,59.0,2.0


## Feature Analysis

### begin_month의 컬럼만 다르고, 나머지 데이터가 같은 여러 데이터를 발견하고 두 가지 가정을 세움
- 1. 한 사람의 데이터가 중복적으로 중첩되서 쌓였다.
- 2. 한 사람이 자신의 정보를 토대로 복수의 신용카드를 발급했다.

### 해결방안
- 1. 여러 컬럼의 조합으로 한 사람의 데이터라는 것을 식별할 수 있는 고유 ID 값 생성
- 2. 중복 데이터를 삭제함으로써 고유 데이터만을 가지고 모델링 구축

In [29]:
raw_df[raw_df["DAYS_BIRTH"] == 13899]

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,6.0,1.0
1773,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,31.0,2.0
11402,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,46.0,2.0
17641,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,58.0,1.0
20389,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,25.0,2.0


### 또한, occupy_type에 NaN값이 존재해서 해당 데이터를 어떻게 처리할지 문제가 발생
- 1. 해결방안 : income_type과 연결해서 확인해본 결과 연금 수령가자가 Nan값인 경우가 많으나, 자영업자 및 공무원도 NaN값인 데이터가 발견됨 그래서 직업의 유형을 적지 않았거나, 분류가 어려울 때 NaN값으로 처리했다고 가정하고 두 컬럼을 합쳐서 고유한 새로운 컬럼을 만듦

In [31]:
raw_df[raw_df["occyp_type"].isna()]

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,6.0,1.0
8,1,1,1,1,180000.0,Commercial associate,Higher education,Married,House / apartment,15131,1466,0,0,1,,3.0,38.0,2.0
14,0,0,1,0,157500.0,Pensioner,Secondary / secondary special,Single / not married,House / apartment,22922,0,0,1,0,,1.0,41.0,2.0
18,0,0,1,0,216000.0,Pensioner,Secondary / secondary special,Married,House / apartment,23113,0,0,0,0,,2.0,37.0,2.0
19,0,0,1,0,180000.0,Working,Secondary / secondary special,Married,House / apartment,13727,6031,0,0,0,,2.0,7.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26439,0,0,1,0,67500.0,Pensioner,Secondary / secondary special,Married,House / apartment,19238,0,0,0,0,,2.0,57.0,2.0
26441,0,0,1,0,90000.0,Pensioner,Incomplete higher,Separated,House / apartment,24129,0,0,0,0,,1.0,43.0,2.0
26443,0,0,1,0,90000.0,Pensioner,Secondary / secondary special,Single / not married,House / apartment,22286,0,0,0,0,,1.0,21.0,1.0
26449,0,0,0,0,90000.0,Working,Secondary / secondary special,Married,House / apartment,10498,2418,1,1,0,,2.0,2.0,1.0


In [32]:
raw_df[raw_df["occyp_type"].isna()]["income_type"].value_counts()

Pensioner               4440
Working                 2311
Commercial associate    1026
State servant            392
Name: income_type, dtype: int64

In [33]:
raw_df["income_type"].value_counts()

Working                 13629
Commercial associate     6202
Pensioner                4449
State servant            2154
Name: income_type, dtype: int64

In [39]:
# feature 분석을 기반으로 새로운 feature 생성

# 개인의 식별번호 컬럼
raw_df["SSN"] = raw_df["gender"].astype("str") +  raw_df["car"].astype("str") + raw_df["child_num"].astype("str") +\
raw_df["income_total"].astype("str") + raw_df["income_type"].astype("str") + raw_df["edu_type"].astype("str") + \
raw_df["family_type"].astype("str") + raw_df["house_type"].astype("str") + raw_df["DAYS_BIRTH"].astype("str") + \
raw_df["DAYS_EMPLOYED"].astype("str") + raw_df["work_phone"].astype("str") + raw_df["phone"].astype("str") + \
raw_df["email"].astype("str") + raw_df["occyp_type"].astype("str") + raw_df["family_size"].astype("str")

# income_type과 occupy_type를 합친 수입과 직업의 새로운 컬럼과 자동차와 부동산 유무를 합친 새로운 컬럼 생성
raw_df.fillna("etc", inplace=True)
raw_df["income_occupy"] = raw_df["income_type"] + "_" + raw_df["occyp_type"]
raw_df["car_reality"] = raw_df["car"].astype("str") + "_" + raw_df["reality"].astype("str")

# 분석에 유의미한 차이를 만들지 못하는 데이터 drop
raw_df.drop(["work_phone", "phone", "email", "income_type", "occyp_type"], axis=1, inplace=True)
raw_df.drop(["gender", "car", "reality", "child_num"], axis=1, inplace=True)

# 카드 발급 날짜를 기준으로 음수일 경우 직장이 없는 개월 수이며, 양수일 경우 카드 발급 받기 전부터 직장을 다님
raw_df["card_begin_before_employed"] = (raw_df["DAYS_EMPLOYED"] // 30) - raw_df["begin_month"]

# 연속형 데이터의 단위를 조정
raw_df["income_total"] = raw_df["income_total"] / 10000
raw_df["DAYS_BIRTH"] = raw_df["DAYS_BIRTH"] / 365
raw_df["DAYS_EMPLOYED"] = raw_df["DAYS_EMPLOYED"] / 12
raw_df["credit"] = raw_df["credit"].astype('int')

KeyError: 'gender'

In [40]:
raw_df

Unnamed: 0,income_total,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month,credit,SSN,income_occupy,car_reality,card_begin_before_employed
0,20.25,Higher education,Married,Municipal apartment,38.079452,392.416667,2.0,6.0,1,000202500.0Commercial associateHigher educatio...,Commercial associate_etc,0_0,150.0
1,24.75,Secondary / secondary special,Civil marriage,House / apartment,31.178082,128.333333,3.0,5.0,1,001247500.0Commercial associateSecondary / sec...,Commercial associate_Laborers,0_1,46.0
2,45.00,Higher education,Married,House / apartment,52.293151,369.500000,2.0,22.0,2,110450000.0WorkingHigher educationMarriedHouse...,Working_Managers,1_1,125.0
3,20.25,Secondary / secondary special,Married,House / apartment,41.336986,174.333333,2.0,37.0,0,000202500.0Commercial associateSecondary / sec...,Commercial associate_Sales staff,0_1,32.0
4,15.75,Higher education,Married,House / apartment,41.197260,175.416667,2.0,26.0,2,010157500.0State servantHigher educationMarrie...,State servant_Managers,1_1,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,22.50,Secondary / secondary special,Married,House / apartment,33.093151,165.333333,4.0,2.0,1,002225000.0State servantSecondary / secondary ...,State servant_Core staff,0_0,64.0
26453,18.00,Higher education,Separated,House / apartment,41.893151,206.250000,2.0,47.0,2,001180000.0WorkingHigher educationSeparatedHou...,Working_etc,0_1,35.0
26454,29.25,Secondary / secondary special,Civil marriage,With parents,27.621918,167.916667,2.0,25.0,2,010292500.0WorkingSecondary / secondary specia...,Working_Core staff,1_0,42.0
26455,17.10,Incomplete higher,Single / not married,House / apartment,27.794521,8.916667,1.0,59.0,2,100171000.0WorkingIncomplete higherSingle / no...,Working_Laborers,0_1,-56.0


In [41]:
make_label = LabelEncoder()

raw_df["edu_type"] = make_label.fit_transform(raw_df["edu_type"])
raw_df["family_type"] = make_label.fit_transform(raw_df["family_type"])
raw_df["house_type"] = make_label.fit_transform(raw_df["house_type"])
raw_df["income_occupy"] = make_label.fit_transform(raw_df["income_occupy"])
raw_df["car_reality"] = make_label.fit_transform(raw_df["car_reality"])
raw_df["SSN"] = make_label.fit_transform(raw_df["SSN"])

In [45]:
raw_df

Unnamed: 0,income_total,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month,credit,SSN,income_occupy,car_reality,card_begin_before_employed
0,20.25,1,1,2,38.079452,392.416667,2.0,6.0,1,1676,18,0,150.0
1,24.75,4,0,1,31.178082,128.333333,3.0,5.0,1,3740,8,1,46.0
2,45.00,1,1,1,52.293151,369.500000,2.0,22.0,2,7982,54,3,125.0
3,20.25,4,1,1,41.336986,174.333333,2.0,37.0,0,1690,14,1,32.0
4,15.75,1,1,1,41.197260,175.416667,2.0,26.0,2,4586,36,3,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,22.50,4,1,1,33.093151,165.333333,4.0,2.0,1,4147,29,0,64.0
26453,18.00,1,2,1,41.893151,206.250000,2.0,47.0,2,3599,62,1,35.0
26454,29.25,4,0,5,27.621918,167.916667,2.0,25.0,2,4987,47,2,42.0
26455,17.10,2,3,1,27.794521,8.916667,1.0,59.0,2,6022,52,1,-56.0


## 고유ID 번호를 바탕으로 모델 성능 테스트

In [48]:
# 반복 작업을 없애기 위해 3가지(xgbcBoost, lgbm, CatBoostClassifier) 모델 성능 테스트 함수 생성

def log_loss_gridCv(x_train, y_train, x_test, y_test):
    xg = XGBClassifier()
    lgbm = LGBMClassifier()
    
    # xgbcBoost
    xg = XGBClassifier()
    params = {
        "n_estimators" : [1000, 2000],
        "max_depth" : [5, 9],
        "random_state" : [13]
    }
    clf_grid = GridSearchCV(xg, param_grid=params, cv=2, scoring="neg_log_loss")
    clf_grid.fit(x_train, y_train)

    df_importance = pd.DataFrame(columns=clf_grid.feature_names_in_)

    pred = clf_grid.predict_proba(x_test)
    print("xgbm_best : ",clf_grid.best_estimator_)
    print("xgbm_result_loss: ",log_loss(y_test, pred))
    pred = clf_grid.predict(x_test)
    print("xgbm_result_ac : ", accuracy_score(y_test, pred))
    df_importance.loc[0] = clf_grid.best_estimator_.feature_importances_.tolist()
    print("=" * 50)
    
    #lgbm
    params = {
    "n_estimators" : [1000, 1500, 2000],
    "random_state" : [13]
    }
    clf_grid = GridSearchCV(lgbm, param_grid=params, cv=2, scoring="neg_log_loss")
    clf_grid.fit(x_train, y_train)


    pred = clf_grid.predict_proba(x_test)
    print("lgbm_best : ",clf_grid.best_estimator_)
    print("lgbm_result_loss : ",log_loss(y_test, pred))
    pred = clf_grid.predict(x_test)
    print("lgbm_result_ac : ", accuracy_score(y_test, pred))
    df_importance.loc[1] = clf_grid.best_estimator_.feature_importances_.tolist()
    print("=" * 50)
    
    cat = CatBoostClassifier(n_estimators=3000, max_depth=10, random_seed=1000, learning_rate=0.04, bootstrap_type ='Bernoulli')
    cat.fit(x_train, y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=50,
    verbose=100)

    pred = cat.predict_proba(x_test)
    print("cat_log_loss : ",log_loss(y_test, pred))

    pred = cat.predict(x_test)
    print("cat_ac_score : ",accuracy_score(y_test, pred))
    
    df_importance.loc[2] = cat.feature_importances_.tolist()
    
    df_importance.rename(index={0 : "XGBC", 1 : "LGBMC", 2 : "Cat"}, inplace=True)
    return df_importance

In [49]:
x = raw_df.drop(["credit"], axis=1)
y = raw_df[["credit"]]

In [51]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=13)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((21147, 12), (5287, 12), (21147, 1), (5287, 1))

In [52]:
# 만들어둔 함수를 통해 모델 성능을 테스트

feature_df = log_loss_gridCv(x_train, y_train, x_test, y_test)
feature_df

xgbm_best :  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)
xgbm_result_loss:  0.8062094451587699
xgbm_result_ac :  0.712880650652544
lgbm_best :  LGBMClassifier(n_estimators=1000, random_state=13)
lgbm_result_loss :  0.7933091701301722
lgbm_result_ac :  0.7062606393039531
0:	learn: 1.07

Unnamed: 0,income_total,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month,SSN,income_occupy,car_reality,card_begin_before_employed
XGBC,0.080391,0.074186,0.07488,0.087024,0.084723,0.088426,0.082715,0.119787,0.081612,0.086795,0.080928,0.058532
LGBMC,9183.0,2019.0,1859.0,887.0,16524.0,9648.0,1672.0,13188.0,12344.0,6966.0,2465.0,13245.0
Cat,11.644687,8.325246,4.779809,1.581195,12.1652,7.176072,5.005735,14.558713,10.872497,10.152823,7.796243,5.94178


In [55]:
# catBoost의 Parameter 값들을 다시 조정해서 성능을 테스트 해봄, catBoost의 경우 데이터셋에 따라 feature값은 넣어야하기 때문에
함수로 한 번에 최적의 결과를 만들기 힘들기 때문에 별도로 돌려봄

cat = CatBoostClassifier(n_estimators=3000, max_depth=10, random_seed=1000, learning_rate=0.04, bootstrap_type ='Bernoulli')
cat.fit(x_train, y_train,
eval_set=[(x_train, y_train), (x_test, y_test)],
early_stopping_rounds=50, cat_features=["house_type","edu_type","income_occupy","SSN"],
verbose=100)

pred = cat.predict_proba(x_test)
print("log_loss : ",log_loss(y_test, pred))

pred = cat.predict(x_test)
print("ac_score : ", accuracy_score(y_test, pred))

0:	learn: 1.0754713	test: 1.0754411	test1: 1.0754517	best: 1.0754517 (0)	total: 96.6ms	remaining: 4m 49s
100:	learn: 0.7171938	test: 0.5219265	test1: 0.6795311	best: 0.6795311 (100)	total: 20.1s	remaining: 9m 37s
200:	learn: 0.6869387	test: 0.4909766	test1: 0.6708828	best: 0.6708828 (200)	total: 42.7s	remaining: 9m 54s
300:	learn: 0.6532160	test: 0.4765807	test1: 0.6690413	best: 0.6689954 (299)	total: 1m 7s	remaining: 10m 8s
400:	learn: 0.6190408	test: 0.4659101	test1: 0.6689250	best: 0.6684857 (350)	total: 1m 34s	remaining: 10m 13s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6684856538
bestIteration = 350

Shrink model to first 351 iterations.
log_loss :  0.6684856538442059
ac_score :  0.7359561187819179


## 중복 데이터 제거를 통한 모델 성능 테스트

In [64]:
# 팀원과 회의를 통해 우선적으로 전처리한 데이터셋 불러옴

url = "https://raw.githubusercontent.com/DataResolvere/Project/main/first_project/data_0407.csv"

raw_df = pd.read_csv(url)
raw_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,begin_month,credit
0,0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,6.0,1.0
1,1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,5.0,1.0
2,2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,22.0,2.0
3,3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,37.0,0.0
4,4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26448,26448,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,12079,1984,0,0,0,Core staff,2.0,1.0
26449,26449,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,15291,2475,0,0,0,,47.0,2.0
26450,26450,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,10082,2015,0,0,0,Core staff,25.0,2.0
26451,26451,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,10145,107,0,0,0,Laborers,59.0,2.0


In [65]:
# 필요없는 feature 값을 제거
raw_df = raw_df.iloc[:, 2:]

# 이상치 데이터 제거
drop_index = raw_df[raw_df["income_type"] == "Student"].index
raw_df.drop(index=drop_index, axis=0, inplace=True)
drop_index =  raw_df[raw_df["child_num"] >= 5].index
raw_df.drop(index=drop_index, axis=0, inplace=True)

# begin_month를 제외한 나머지 데이터의 중복이 있는 경우 데이터 삭제
df_index = raw_df[raw_df.drop(["begin_month"], axis=1).duplicated()].index
raw_df = raw_df.loc[df_index, :]

# binary 데이터를 int형으로 변환
raw_df["gender"] = raw_df["gender"].apply(lambda x: 0 if x == "F" else 1)
raw_df["car"] = raw_df["car"].apply(lambda x: 0 if x == "N" else 1)
raw_df["reality"] = raw_df["reality"].apply(lambda x: 0 if x == "N" else 1)

# 범주형 데이터 분석을 통한 새로운 컬럼 생성
raw_df.fillna("etc", inplace=True)
raw_df["income_occupy"] = raw_df["income_type"] + "_" + raw_df["occyp_type"]
raw_df["car_reality"] = raw_df["car"].astype("str") + "_" + raw_df["reality"].astype("str")

# 연속형 데이터 분석을 통한 새로운 컬럼 생성
raw_df["card_begin_before_employed"] = (raw_df["DAYS_EMPLOYED"] // 30) - raw_df["begin_month"]

# 불필요한 데이터 컬럼 삭제
raw_df.drop(["work_phone", "phone", "email", "income_type", "occyp_type"], axis=1, inplace=True)
raw_df.drop(["gender", "car", "reality", "child_num"], axis=1, inplace=True)

# 연속형 데이터의 단위 조정
raw_df["income_total"] = raw_df["income_total"] / 10000
raw_df["DAYS_BIRTH"] = raw_df["DAYS_BIRTH"] / 365
raw_df["DAYS_EMPLOYED"] = raw_df["DAYS_EMPLOYED"] / 12
raw_df["credit"] = raw_df["credit"].astype('int')

# 범주형 데이터에 대한 Label값 조정
make_label = LabelEncoder()

raw_df["edu_type"] = make_label.fit_transform(raw_df["edu_type"])
raw_df["family_type"] = make_label.fit_transform(raw_df["family_type"])
raw_df["house_type"] = make_label.fit_transform(raw_df["house_type"])
raw_df["income_occupy"] = make_label.fit_transform(raw_df["income_occupy"])
raw_df["car_reality"] = make_label.fit_transform(raw_df["car_reality"])

In [67]:
x = raw_df.drop(["credit"], axis=1)
y = raw_df[["credit"]]

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=13)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((11480, 10), (2871, 10), (11480, 1), (2871, 1))

In [70]:
x_train

Unnamed: 0,income_total,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,begin_month,income_occupy,car_reality,card_begin_before_employed
20035,31.500,4,1,2,44.095890,664.916667,42.0,11,3,223.0
20703,13.500,4,1,1,43.394521,127.083333,27.0,52,0,23.0
13278,25.200,1,1,1,41.717808,441.666667,53.0,46,2,123.0
10758,11.250,4,1,1,37.846575,19.916667,33.0,40,1,-26.0
421,18.000,4,1,1,28.358904,110.166667,16.0,56,2,28.0
...,...,...,...,...,...,...,...,...,...,...
4583,22.500,4,2,1,57.473973,0.000000,22.0,19,1,-22.0
21952,6.750,4,1,1,50.413699,179.083333,27.0,46,0,44.0
17269,12.375,4,1,1,45.715068,49.916667,17.0,39,1,2.0
1421,36.000,4,1,1,42.038356,26.166667,15.0,4,3,-5.0


In [71]:
feature_df = log_loss_gridCv(x_train, y_train, x_test, y_test)
feature_df

xgbm_best :  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)
xgbm_result_loss:  0.7335782866206809
xgbm_result_ac :  0.7662835249042146
lgbm_best :  LGBMClassifier(n_estimators=1000, random_state=13)
lgbm_result_loss :  0.734862324234023
lgbm_result_ac :  0.7701149425287356
0:	learn: 1.07

Unnamed: 0,income_total,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,begin_month,income_occupy,car_reality,card_begin_before_employed
XGBC,0.09554,0.101777,0.096183,0.114139,0.098376,0.118092,0.113657,0.101277,0.100852,0.060107
LGBMC,12982.0,2242.0,2436.0,830.0,19069.0,11170.0,14339.0,7906.0,3373.0,15653.0
Cat,15.637598,9.081993,6.751092,1.509386,14.829271,8.374724,12.650371,12.588595,12.052384,6.524586


In [73]:
cat = CatBoostClassifier(n_estimators=3000, max_depth=10, random_seed=1000, learning_rate=0.04, bootstrap_type ='Bernoulli')
cat.fit(x_train, y_train,
eval_set=[(x_train, y_train), (x_test, y_test)],
early_stopping_rounds=50, cat_features=["edu_type","family_type","house_type","income_occupy","car_reality"],
verbose=100)

pred = cat.predict_proba(x_test)
print("log_loss : ",log_loss(y_test, pred))

pred = cat.predict(x_test)
print("ac_score : ", accuracy_score(y_test, pred))

0:	learn: 1.0706244	test: 1.0706244	test1: 1.0708394	best: 1.0708394 (0)	total: 24.3ms	remaining: 1m 12s
100:	learn: 0.7150409	test: 0.7172132	test1: 0.7337177	best: 0.7337177 (100)	total: 14.6s	remaining: 6m 59s
200:	learn: 0.6688931	test: 0.6671273	test1: 0.7028888	best: 0.7028875 (199)	total: 35.2s	remaining: 8m 10s
300:	learn: 0.6084891	test: 0.6248272	test1: 0.6808349	best: 0.6808349 (300)	total: 59.8s	remaining: 8m 55s
400:	learn: 0.5504012	test: 0.5852738	test1: 0.6609717	best: 0.6609717 (400)	total: 1m 23s	remaining: 9m 3s
500:	learn: 0.5033817	test: 0.5574525	test1: 0.6499072	best: 0.6499072 (500)	total: 1m 49s	remaining: 9m 4s
600:	learn: 0.4638246	test: 0.5335504	test1: 0.6408465	best: 0.6408015 (597)	total: 2m 13s	remaining: 8m 52s
700:	learn: 0.4278778	test: 0.5138007	test1: 0.6336727	best: 0.6336727 (700)	total: 2m 38s	remaining: 8m 40s
800:	learn: 0.3954014	test: 0.4970184	test1: 0.6292577	best: 0.6292108 (799)	total: 3m 4s	remaining: 8m 26s
900:	learn: 0.3658461	test: 0