## Module Import

In [1]:
import pandas as pd
import numpy as np

# 시각화
import seaborn as sns

# 경고 메시지 무시
import warnings
warnings.filterwarnings(action='ignore') 

# 모델링 패키지
import sklearn
import matplotlib
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.metrics import mean_squared_log_error


# Report the versions of the main libraries used by this notebook.
print(
    'Pandas : %s' % pd.__version__,
    'seaborn : %s' % sns.__version__,
    'Scikit-Learn : %s' % sklearn.__version__,
    sep='\n',
)
!python --version

Pandas : 1.4.3
seaborn : 0.11.2
Scikit-Learn : 1.1.2
Python 3.8.8


## Load Data

In [2]:
# Load the dataset.
## The key used for prediction is the row index, so reset_index() exposes it as an
## 'index' column.
## NOTE(review): the head() output below also lists an 'Unnamed: 0' column, so the CSV
## appears to carry a saved index already - confirm whether both are needed.
data = (
    pd.read_csv('../data/data.csv')
    .reset_index()
)

In [3]:
data.head(3)

Unnamed: 0,index,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,month,user_id,...,income_type,company_enter_month,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt
0,0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,,6,430982,...,EARNEDINCOME,20220201.0,정규직,자가,25000000.0,대환대출,1,0.0,2.0,15000000.0
1,1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,,6,430982,...,EARNEDINCOME,20220201.0,정규직,자가,25000000.0,대환대출,1,0.0,2.0,15000000.0
2,2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,,6,430982,...,EARNEDINCOME,20220201.0,정규직,자가,25000000.0,대환대출,1,0.0,2.0,15000000.0


## ML Model

- 데이터 분리

In [4]:
# Rows with month == 6 become the test features; all other rows are the training set.
is_test_month = data['month'] == 6
is_train_month = data['month'] != 6

x_test = data.loc[is_test_month].drop(columns='is_applied').reset_index(drop=True)
x_train = data.loc[is_train_month].drop(columns='is_applied').reset_index(drop=True)
y_train = data.loc[is_train_month, 'is_applied'].reset_index(drop=True)

- Preprocessing

In [5]:
# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# scaler
from sklearn.preprocessing import StandardScaler
# eval
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score, accuracy_score
# model
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [6]:
# Split the feature columns into categorical (object dtype) and numeric groups.
#
# Identifier columns are excluded by name. Skipping only the first column by position
# (iloc[:, 1:]) removed 'Unnamed: 0' but left the 'index' row key in the numeric list,
# so 'index' was duplicated when the frames were reassembled and ended up among the
# model features.
def split_feature_columns(frame, id_columns=('Unnamed: 0', 'index')):
    """Return the categorical and numeric feature column names of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are classified.
    id_columns : iterable of str
        Column names that are identifiers rather than features. Names that are not
        present in ``frame`` are ignored.

    Returns
    -------
    tuple of (list, list)
        ``(categorical, numeric)``: object-dtype column names and all remaining
        column names, both in their original order.
    """
    # A boolean mask keeps the original column order.
    features = frame.loc[:, ~frame.columns.isin(list(id_columns))]
    categorical = features.select_dtypes(include=['object']).columns.to_list()
    numeric = features.select_dtypes(exclude='object').columns.to_list()
    return categorical, numeric


## train
cat_feature_tr, num_feature_tr = split_feature_columns(x_train)
## test
cat_feature_te, num_feature_te = split_feature_columns(x_test)

    - Scaling -> 주석 처리해놓고 어떤 것이 더 잘나오는지 확인하여 사용여부 결정

In [7]:
# # Scaling (disabled; enable to compare scores with and without standardisation)
# scaler = StandardScaler()
# x_train[num_feature_tr] = scaler.fit_transform(x_train[num_feature_tr])
# x_test[num_feature_te] = scaler.transform(x_test[num_feature_te])

    - Encoding

In [8]:
# #onehotencoder (disabled)
# ohe = OneHotEncoder(sparse=False)

# # The encoder is fitted once on the train and test rows combined so that it knows
# # every category; train and test are then only transformed with the fitted encoder.
# # NOTE(review): scikit-learn deprecated `get_feature_names` in 1.0 and removed it in
# # 1.2 in favour of `get_feature_names_out`; `sparse` was renamed to `sparse_output`
# # in 1.2. Update both before re-enabling this cell on a newer version.
# ohe_ = ohe.fit(pd.concat([x_train, x_test]).reset_index(drop=True)[cat_feature_tr])
# one_cat_tr = ohe_.transform(x_train[cat_feature_tr])
# one_cat_te = ohe_.transform(x_test[cat_feature_te])

# cat_tr = pd.DataFrame(one_cat_tr, columns = ohe.get_feature_names(cat_feature_tr)) # train
# cat_te = pd.DataFrame(one_cat_te, columns = ohe.get_feature_names(cat_feature_te)) # test

In [9]:
# Label encoding
# Each categorical column gets a fresh encoder fitted on the train and test values of
# that column together, so labels that occur only in the test rows can be transformed.
encoded_cat_tr = []
encoded_cat_te = []
for f in cat_feature_tr:
    # Concatenate only the column being encoded. Concatenating both full frames inside
    # the loop copied every column once per categorical feature.
    encoder = LabelEncoder().fit(pd.concat([x_train[f], x_test[f]], ignore_index=True))
    encoded_cat_tr.append(pd.DataFrame(encoder.transform(x_train[f]), columns=[f]))  # train
    encoded_cat_te.append(pd.DataFrame(encoder.transform(x_test[f]), columns=[f]))  # test

# Combine the per-column results. pd.concat raises on an empty list, so fall back to a
# column-less frame when there are no categorical features.
cat_tr = pd.concat(encoded_cat_tr, axis=1) if encoded_cat_tr else pd.DataFrame(index=x_train.index)  # train
cat_te = pd.concat(encoded_cat_te, axis=1) if encoded_cat_te else pd.DataFrame(index=x_test.index)  # test

In [10]:
# After encoding, put the pieces back together column-wise:
# the 'index' key first, then the encoded categorical columns, then the numeric columns.
train_numeric = x_train[num_feature_tr]
test_numeric = x_test[num_feature_te]

x_train_imp = pd.concat([x_train['index'], cat_tr], axis='columns')
x_test_imp = pd.concat([x_test['index'], cat_te], axis='columns')

x_train = pd.concat([x_train_imp, train_numeric], axis='columns')
x_test = pd.concat([x_test_imp, test_numeric], axis='columns')

- Modeling

    - catboost -> 학습 시간이 매우 오래 걸림

In [11]:
# Evaluate the model with 5-fold cross-validation on the training data (F1 score).
# The first column of x_train is skipped when building the feature matrix.
cat = CatBoostClassifier(random_state=42, verbose=0)

cat_cv_scores = cross_val_score(cat, x_train.iloc[:, 1:], y_train, cv=5, scoring='f1')
score = np.mean(cat_cv_scores)
score

0.17677563981612238

In [12]:
# # Predict on the test set (disabled)
# cat.fit(x_train.iloc[:,1:], y_train)
# pred_cat = cat.predict(x_test.iloc[:,1:])

    - randomforest -> 학습 시간이 지나치게 오래 걸려 사용하지 않음

In [14]:
# rf = RandomForestClassifier(random_state=42)
# score = np.mean(cross_val_score(rf, x_train.iloc[:,1:], y_train, cv=5, scoring = 'f1'));score

In [None]:
# # Predict on the test set (disabled)
# rf.fit(x_train.iloc[:,1:], y_train)
# pred_rf = rf.predict(x_test.iloc[:,1:])

    - LGBM

In [15]:
# 5-fold cross-validated F1 score of LightGBM on the training data.
# The first column of x_train is skipped when building the feature matrix.
lgbm = LGBMClassifier(random_state=42)
lgbm_cv_scores = cross_val_score(lgbm, x_train.iloc[:, 1:], y_train, cv=5, scoring='f1')
score = np.mean(lgbm_cv_scores)
score

0.07073166027277913

In [None]:
# # Predict on the test set (disabled)
# lgbm.fit(x_train.iloc[:,1:], y_train)
# pred_lgbm = lgbm.predict(x_test.iloc[:,1:])

In [None]:
# import win32api
# win32api.Beep(3000,3000)

## END