# 기본

In [None]:
!pip install chefboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting chefboost
  Downloading chefboost-0.0.17-py3-none-any.whl (26 kB)
Installing collected packages: chefboost
Successfully installed chefboost-0.0.17


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import statsmodels.api as sm
# from chefboost import Chefboost as chef
import seaborn as sns
from sklearn.feature_selection import f_regression, SelectKBest
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the dataset; it holds the predictor columns plus the `is_applied` label used below.
raw_csv_path = '/content/drive/Shareddrives/데캡디/Result/Raw_data/x_plus_y_final.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# One configuration dict per decision-tree algorithm; these are passed as
# `config=` to chef.fit further down.
config1, config2, config3, config4 = (
    {'algorithm': algorithm_name}
    for algorithm_name in ('ID3', 'CART', 'CHAID', 'C4.5')
)

# 변수 추출

In [None]:
# Display the pairwise Pearson correlation matrix of the columns of df.
df.corr(method = 'pearson')

In [None]:
# Annotated heatmap of the pairwise Pearson correlations between the columns of df.
corr_matrix = df.corr(method='pearson')
fig, ax = plt.subplots(figsize=(60, 60))
# `linewidths` is the keyword documented by seaborn.heatmap for the lines between
# cells; the `linewidth` spelling is not a heatmap parameter and is merely
# forwarded to matplotlib as an extra keyword.
sns.heatmap(corr_matrix, linewidths=0.5, annot=True, fmt='.2f', cmap='Blues', ax=ax)
# Trailing semicolon suppresses the text repr of the returned object.
ax.set_title('Pearson correlation matrix');

 0.8 ≤ |r| 일 때, 강한 상관이 있다. \\
 0.6 ≤ |r| < 0.8 일 때, 상관이 있다. \\
 0.4 ≤ |r| < 0.6 일 때, 약한 상관이 있다.

# Under Sampling

y_label 데이터 개수의 차이가 너무 크면 under sampling은 부적절하다 \\
 2:1 정도의 비율이었으니 under sampling 사용

In [3]:
# Split the frame into the label (`is_applied`) and the remaining predictor columns.
Y = df['is_applied']
X = df.drop(columns=['is_applied'])

In [4]:
# Plain split of the original (not yet resampled) data: 20% is held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Class counts of the training labels before any resampling.
y_train.value_counts()

0.0    24916
1.0    13712
Name: is_applied, dtype: int64

In [None]:
from imblearn.under_sampling import NearMiss

# NearMiss (version 2) under-sampler; the class ratio can be adjusted
# through its sampling_strategy argument.
NM_model = NearMiss(version=2)

In [None]:
# Under-sample the training split only, then wrap the resampled output back
# into pandas objects carrying the original feature column names.
resampled_features, resampled_labels = NM_model.fit_resample(x_train, y_train)
U_x_train = pd.DataFrame(resampled_features, columns=X.columns)
U_y_train = pd.Series(resampled_labels)

In [None]:
# Class counts of the training labels after under-sampling.
U_y_train.value_counts()

0.0    13712
1.0    13712
Name: is_applied, dtype: int64

#### XGBoost

In [None]:
# Keep the 30 best-scoring features under the univariate f_regression test.
# The selector is fitted on the under-sampled training data only and then
# applied unchanged to the test features.
selector = SelectKBest(score_func=f_regression, k=30)
selector.fit(U_x_train, U_y_train)
x_train_s = selector.transform(U_x_train)
x_test_s = selector.transform(x_test)

# Explicit hyper-parameters for the baseline classifier.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=0.8,
    gamma=0,
    learning_rate=0.01,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=300,
    n_jobs=1,
    nthread=-1,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=2019,
    silent=True,
    subsample=0.8,
)
# NOTE: this rebinds `xgb`, which the import cell bound to the xgboost module;
# the name is kept because the prediction cell below calls `xgb.predict`.
xgb = XGBClassifier(**xgb_params)

xgb.fit(x_train_s, U_y_train)

  correlation_coefficient /= X_norms


XGBClassifier(colsample_bytree=0.8, learning_rate=0.01, n_estimators=300,
              nthread=-1, seed=2019, silent=True, subsample=0.8)

In [None]:
 # Predict labels for the held-out test features with the fitted classifier.
 y_pred_xgb = xgb.predict(x_test_s)

In [None]:
# Per-class precision / recall / F1 of the XGBoost predictions on the test split.
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

         0.0       0.72      0.47      0.57      7884
         1.0       0.40      0.66      0.50      4188

    accuracy                           0.53     12072
   macro avg       0.56      0.56      0.53     12072
weighted avg       0.61      0.53      0.54     12072



In [None]:
# Confusion matrix of the XGBoost predictions against the true test labels.
confusion_matrix(y_test, y_pred_xgb)

array([[3668, 4216],
       [1409, 2779]])

#### Grid

In [None]:
# Hyper-parameter grid for the XGBoost search; the number of candidates is
# deliberately kept moderate.
xgb_param_grid = dict(
    n_estimators=[300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7, 10],
    gamma=[0, 1, 2],
    colsample_bytree=[0.8, 0.9],
)

In [None]:
# Base estimator for the grid search, constructed with no explicit arguments.
grid = XGBClassifier()

In [None]:
# Grid search over xgb_param_grid, scored by macro-averaged F1, fitted on the
# under-sampled training data restricted to the selected features.
xgb_grid = GridSearchCV(
    estimator=grid,
    param_grid=xgb_param_grid,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=2,
)
xgb_grid.fit(x_train_s, U_y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(estimator=XGBClassifier(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.8, 0.9], 'gamma': [0, 1, 2],
                         'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth': [3, 5, 7, 10],
                         'n_estimators': [300, 500]},
             scoring='f1_macro', verbose=2)

In [None]:
# Show the parameter combination that scored best in the grid search.
print(xgb_grid.best_params_)

{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}


확연히 precision이 떨어지고 recall 값이 조금 상승하는 것을 확인할 수 있음

#### CART, CHAID, C4.5

In [None]:
# Re-run the univariate feature selection, this time keeping only k features.
# `k` is reused by the following cells to name the label column.
k = 7
selector = SelectKBest(score_func=f_regression, k=k)
selector.fit(U_x_train, U_y_train)
x_train_s = selector.transform(U_x_train)
x_test_s = selector.transform(x_test)

  correlation_coefficient /= X_norms


In [None]:
# Continue with the selected subset of variables: wrap the selected feature
# arrays in DataFrames and append the labels as an 'is_applied' column.
y_list = U_y_train.to_list()
y_test_list = y_test.to_list()

x_train_sd = pd.DataFrame(x_train_s).assign(is_applied=y_list)
x_test_sd = pd.DataFrame(x_test_s).assign(is_applied=y_test_list)


In [None]:
# Rename the columns of both frames to the strings '0' .. str(k): the k selected
# features followed by the label column in position k.
# A range-based comprehension replaces the manual `while i != k + 1` counter,
# which would never terminate for k < -1.
list1 = [str(i) for i in range(k + 1)]
x_train_sd.columns = list1
x_test_sd.columns = list1

In [None]:
# Recode the numeric label in column str(k) to the strings "No" (0.0) and
# "YES" (1.0), first in the training frame and then in the test frame.
label_col = str(k)
for frame in (x_train_sd, x_test_sd):
    for numeric_value, text_label in ((0.0, "No"), (1.0, "YES")):
        frame.loc[frame[label_col] == numeric_value, label_col] = text_label

In [None]:
# Inspect the test frame after the label column has been recoded.
x_test_sd

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,2.0,2.0,0.0,5066.666667,1311.111111,64920.0,YES
1,1.0,4.0,4.0,0.0,11907.446809,2900.000000,85980.0,No
2,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,No
3,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,No
4,1.0,6.0,6.0,0.0,29489.795918,12663.265306,79590.0,YES
...,...,...,...,...,...,...,...,...
12067,1.0,4.0,4.0,0.0,14662.337662,4890.909091,35820.0,YES
12068,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,YES
12069,1.0,1.0,1.0,0.0,0.000000,0.000000,0.0,No
12070,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,No


In [None]:
# Rename the test frame's label column (str(k)) to 'Decision' -- presumably the
# target name chef.evaluate looks for, since it is later called on this frame
# without a target label; confirm against the chefboost docs.
# Reassigning instead of inplace=True avoids mutating the frame in place.
x_test_sd = x_test_sd.rename(columns={'{0}'.format(k): 'Decision'})

In [None]:
# Inspect the test frame again; the label column should now be named 'Decision'.
x_test_sd

Unnamed: 0,0,1,2,3,4,5,6,Decision
0,1.0,2.0,2.0,0.0,5066.666667,1311.111111,64920.0,YES
1,1.0,4.0,4.0,0.0,11907.446809,2900.000000,85980.0,No
2,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,No
3,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,No
4,1.0,6.0,6.0,0.0,29489.795918,12663.265306,79590.0,YES
...,...,...,...,...,...,...,...,...
12067,1.0,4.0,4.0,0.0,14662.337662,4890.909091,35820.0,YES
12068,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,YES
12069,1.0,1.0,1.0,0.0,0.000000,0.000000,0.0,No
12070,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,No


In [None]:
# The chefboost import in the notebook's import cell is commented out, so `chef`
# is bound here; without it this cell raises NameError on a fresh kernel.
from chefboost import Chefboost as chef

# Fit a CART tree (config2) on the training frame; the label is in column str(k).
model1 = chef.fit(x_train_sd, config = config2, target_label = '{0}'.format(k))

[INFO]:  1 CPU cores will be allocated in parallel running
CART  tree is going to be built...
-------------------------
finished in  21.311728715896606  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  67.49532418952619 % on  25664  instances
Labels:  ['No' 'YES']
Confusion matrix:  [[8494, 4004], [4338, 8828]]
Precision:  67.9629 %, Recall:  66.1939 %, F1:  67.0667 %


In [None]:
# Evaluate the CART model (model1) on the test frame.
chef.evaluate(model1, x_test_sd)

-------------------------
Evaluate  test set
-------------------------
Accuracy:  76.53247183565276 % on  12072  instances
Labels:  ['YES' 'No']
Confusion matrix:  [[2886, 1531], [1302, 6353]]
Precision:  65.3385 %, Recall:  68.9112 %, F1:  67.0773 %


In [None]:
# Fit a CHAID tree (config3) on the training frame; the label is in column str(k).
model2 = chef.fit(
    x_train_sd,
    config=config3,
    target_label=str(k),
)

[INFO]:  1 CPU cores will be allocated in parallel running
CHAID  tree is going to be built...
-------------------------
finished in  26.145038604736328  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  67.21867206982543 % on  25664  instances
Labels:  ['No' 'YES']
Confusion matrix:  [[8247, 3828], [4585, 9004]]
Precision:  68.2981 %, Recall:  64.269 %, F1:  66.2223 %


In [None]:
# Evaluate the CHAID model (model2) on the test frame.
chef.evaluate(model2, x_test_sd)

['0', '1', '2', '3', '4', '5', '6', 'Prediction', 'Decision']
-------------------------
Evaluate  test set
-------------------------
Accuracy:  76.39993373094765 % on  12072  instances
Labels:  ['YES' 'No']
Confusion matrix:  [[2951, 1612], [1237, 6272]]
Precision:  64.6724 %, Recall:  70.4632 %, F1:  67.4437 %


In [None]:
# Fit a C4.5 tree (config4) on the training frame; the label is in column str(k).
model3 = chef.fit(
    x_train_sd,
    config=config4,
    target_label=str(k),
)

[INFO]:  1 CPU cores will be allocated in parallel running
C4.5  tree is going to be built...
-------------------------
finished in  9.631849765777588  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  67.00436408977556 % on  25664  instances
Labels:  ['No' 'YES']
Confusion matrix:  [[8276, 3912], [4556, 8920]]
Precision:  67.9029 %, Recall:  64.495 %, F1:  66.1551 %


In [None]:
# Evaluate the C4.5 model (model3) on the test frame.
chef.evaluate(model3, x_test_sd)

['0', '1', '2', '3', '4', '5', '6', 'Prediction', 'Decision']
-------------------------
Evaluate  test set
-------------------------
Accuracy:  76.42478462557986 % on  12072  instances
Labels:  ['YES' 'No']
Confusion matrix:  [[2927, 1585], [1261, 6299]]
Precision:  64.8715 %, Recall:  69.8902 %, F1:  67.2874 %
