In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import statsmodels.api as sm

In [80]:
# Read the dataset from the CSV file at ./data/train.csv into a DataFrame.
df = pd.read_csv('./data/train.csv')

In [81]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [82]:
# Column dtypes and non-null counts (shows which columns contain missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43957 entries, 0 to 43956
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              43957 non-null  int64 
 1   workclass        41459 non-null  object
 2   fnlwgt           43957 non-null  int64 
 3   education        43957 non-null  object
 4   educational-num  43957 non-null  int64 
 5   marital-status   43957 non-null  object
 6   occupation       41451 non-null  object
 7   relationship     43957 non-null  object
 8   race             43957 non-null  object
 9   gender           43957 non-null  object
 10  capital-gain     43957 non-null  int64 
 11  capital-loss     43957 non-null  int64 
 12  hours-per-week   43957 non-null  int64 
 13  native-country   43194 non-null  object
 14  income_>50K      43957 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 5.0+ MB


In [83]:
# (rows, columns) of the frame.
df.shape

(43957, 15)

### 전처리

In [84]:
# Rename the target column 'income_>50K' to the shorter 'income'.
new_col = {'income_>50K': 'income'}
df = df.rename(new_col, axis='columns')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [85]:
# Handle missing values: each of the three columns that contain nulls gets an
# explicit placeholder category instead of NaN.
null_fill = {
    'workclass': 'None',
    'occupation': 'None',
    'native-country': 'Others',
}
df = df.fillna(value=null_fill)

# Remaining null count per column.
df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [86]:
# Keep only the rows whose native-country is 'United-States'; the column is
# constant afterwards, so it is dropped.
# Filtering and dropping in one expression yields a fresh frame instead of
# calling drop(inplace=True) on a filtered slice, which is the pattern pandas
# flags with SettingWithCopyWarning.
df = df[df['native-country'] == 'United-States'].drop(columns=['native-country'])

In [87]:
# Re-check row count, columns and dtypes after the country filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39429 entries, 0 to 43956
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              39429 non-null  int64 
 1   workclass        39429 non-null  object
 2   fnlwgt           39429 non-null  int64 
 3   education        39429 non-null  object
 4   educational-num  39429 non-null  int64 
 5   marital-status   39429 non-null  object
 6   occupation       39429 non-null  object
 7   relationship     39429 non-null  object
 8   race             39429 non-null  object
 9   gender           39429 non-null  object
 10  capital-gain     39429 non-null  int64 
 11  capital-loss     39429 non-null  int64 
 12  hours-per-week   39429 non-null  int64 
 13  income           39429 non-null  int64 
dtypes: int64(7), object(7)
memory usage: 4.5+ MB


In [88]:
# Recode educational-num onto a coarser 0-10 scale.
print(df.shape)
# Rows at level 1 are removed outright. .copy() makes the result an independent
# frame so the column assignment below does not write into a filtered slice.
df = df[df['educational-num'] != 1].copy()
print(df.shape)

# Old level -> new level. Levels 2-8 are merged into three groups and levels
# 9-16 are shifted down by 6. The mapping is applied in a single pass, so the
# result cannot depend on the order of individual replacements.
edu_recode = {2: 0, 3: 0, 4: 1, 5: 1, 6: 2, 7: 2, 8: 2}
edu_recode.update({level: level - 6 for level in range(9, 17)})

edu_levels = df['educational-num']
# Levels not listed in the mapping are kept unchanged; the original dtype is
# restored because map() yields floats when any value is unmapped.
df['educational-num'] = (
    edu_levels.map(edu_recode).fillna(edu_levels).astype(edu_levels.dtype)
)
# NOTE: this cell is not idempotent -- running it a second time would recode
# the already-recoded levels again.
print(df['educational-num'].value_counts())

(39429, 14)
(39407, 14)
3     13099
4      9079
7      6468
2      3121
8      2123
5      1702
6      1326
1      1201
9       655
10      432
0       201
Name: educational-num, dtype: int64


In [89]:
# Remove the columns that are not carried into the rest of the analysis.
# Reassigning the result (rather than drop(inplace=True)) avoids mutating a
# frame that was derived from a filter.
df = df.drop(columns=['education', 'race', 'gender'])
df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,income
0,67,Private,366425,10,Divorced,Exec-managerial,Not-in-family,99999,0,60,1
1,17,Private,244602,2,Never-married,Other-service,Own-child,0,0,15,0
2,31,Private,174201,7,Married-civ-spouse,Exec-managerial,Husband,0,0,40,1
3,58,State-gov,110199,1,Married-civ-spouse,Transport-moving,Husband,0,0,40,0
4,25,State-gov,149248,4,Never-married,Other-service,Not-in-family,0,0,40,0


In [90]:
# Confirm the remaining columns and their dtypes after the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39407 entries, 0 to 43956
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              39407 non-null  int64 
 1   workclass        39407 non-null  object
 2   fnlwgt           39407 non-null  int64 
 3   educational-num  39407 non-null  int64 
 4   marital-status   39407 non-null  object
 5   occupation       39407 non-null  object
 6   relationship     39407 non-null  object
 7   capital-gain     39407 non-null  int64 
 8   capital-loss     39407 non-null  int64 
 9   hours-per-week   39407 non-null  int64 
 10  income           39407 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 3.6+ MB


#### 범주형 변수 처리

In [91]:
# One-hot encode the remaining categorical (object-dtype) columns.
# NOTE(review): with the defaults every level of each categorical is kept
# (no drop_first), so each dummy group sums to 1 and the groups are linearly
# dependent on one another -- consistent with the very large condition number
# (~1e16) reported by the OLS summaries further down. Confirm this is intended.
df = pd.get_dummies(df)

In [92]:
# Inspect the columns produced by the one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39407 entries, 0 to 43956
Data columns (total 44 columns):
 #   Column                                Non-Null Count  Dtype
---  ------                                --------------  -----
 0   age                                   39407 non-null  int64
 1   fnlwgt                                39407 non-null  int64
 2   educational-num                       39407 non-null  int64
 3   capital-gain                          39407 non-null  int64
 4   capital-loss                          39407 non-null  int64
 5   hours-per-week                        39407 non-null  int64
 6   income                                39407 non-null  int64
 7   workclass_Federal-gov                 39407 non-null  uint8
 8   workclass_Local-gov                   39407 non-null  uint8
 9   workclass_Never-worked                39407 non-null  uint8
 10  workclass_None                        39407 non-null  uint8
 11  workclass_Private                     394

### OLS 회귀분석

In [93]:
# Regress educational-num on every other column of the encoded frame.
X = df.drop('educational-num', axis=1)
y = df[['educational-num']]

# No explicit intercept column is added (sm.add_constant is not called).

# OLS fit; both sides are cast to float before fitting.
multi_model = sm.OLS(endog=y.astype(float), exog=X.astype(float))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,educational-num,R-squared:,0.405
Model:,OLS,Adj. R-squared:,0.404
Method:,Least Squares,F-statistic:,705.4
Date:,"Sun, 12 Dec 2021",Prob (F-statistic):,0.0
Time:,14:32:45,Log-Likelihood:,-73957.0
No. Observations:,39407,AIC:,148000.0
Df Residuals:,39368,BIC:,148300.0
Df Model:,38,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0028,0.001,-3.730,0.000,-0.004,-0.001
fnlwgt,-1.581e-07,7.6e-08,-2.081,0.037,-3.07e-07,-9.2e-09
capital-gain,1.211e-05,1.09e-06,11.140,0.000,9.98e-06,1.42e-05
capital-loss,0.0001,1.98e-05,6.228,0.000,8.47e-05,0.000
hours-per-week,0.0089,0.001,12.514,0.000,0.008,0.010
income,0.9331,0.023,40.952,0.000,0.888,0.978
workclass_Federal-gov,0.8889,0.060,14.734,0.000,0.771,1.007
workclass_Local-gov,1.0743,0.053,20.272,0.000,0.970,1.178
workclass_Never-worked,-0.4670,0.366,-1.276,0.202,-1.184,0.250

0,1,2,3
Omnibus:,827.444,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,908.139
Skew:,0.335,Prob(JB):,6.31e-198
Kurtosis:,3.323,Cond. No.,1.24e+16


In [94]:
# Exclude the 'Other-service' occupation: remove its rows, then leave its
# dummy column out of the regressors.
drop_occupation = ['occupation_Other-service']
df = df[(df[drop_occupation] == 0).all(axis=1)]

X = df.drop(['educational-num', *drop_occupation], axis=1)
y = df[['educational-num']]

# OLS fit; both sides are cast to float before fitting.
multi_model = sm.OLS(endog=y.astype(float), exog=X.astype(float))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,educational-num,R-squared:,0.405
Model:,OLS,Adj. R-squared:,0.405
Method:,Least Squares,F-statistic:,655.6
Date:,"Sun, 12 Dec 2021",Prob (F-statistic):,0.0
Time:,14:32:45,Log-Likelihood:,-67317.0
No. Observations:,35617,AIC:,134700.0
Df Residuals:,35579,BIC:,135000.0
Df Model:,37,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0020,0.001,-2.450,0.014,-0.004,-0.000
fnlwgt,-1.618e-07,8.15e-08,-1.986,0.047,-3.22e-07,-2.1e-09
capital-gain,1.215e-05,1.11e-06,10.935,0.000,9.97e-06,1.43e-05
capital-loss,0.0001,2.06e-05,6.328,0.000,9e-05,0.000
hours-per-week,0.0094,0.001,12.326,0.000,0.008,0.011
income,0.9321,0.023,39.689,0.000,0.886,0.978
workclass_Federal-gov,0.8748,0.063,13.945,0.000,0.752,0.998
workclass_Local-gov,1.1065,0.056,19.839,0.000,0.997,1.216
workclass_Never-worked,-0.4690,0.371,-1.265,0.206,-1.196,0.258

0,1,2,3
Omnibus:,537.997,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,570.0
Skew:,0.29,Prob(JB):,1.6800000000000002e-124
Kurtosis:,3.217,Cond. No.,1.23e+16


In [95]:
# Exclude the 'Armed-Forces' and 'Other-service' occupations: remove their
# rows, then leave both dummy columns out of the regressors.
drop_occupation = ['occupation_Armed-Forces', 'occupation_Other-service']
df = df[(df[drop_occupation] == 0).all(axis=1)]

X = df.drop(['educational-num', *drop_occupation], axis=1)
y = df[['educational-num']]

# OLS fit; both sides are cast to float before fitting.
multi_model = sm.OLS(endog=y.astype(float), exog=X.astype(float))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,educational-num,R-squared:,0.405
Model:,OLS,Adj. R-squared:,0.405
Method:,Least Squares,F-statistic:,673.8
Date:,"Sun, 12 Dec 2021",Prob (F-statistic):,0.0
Time:,14:32:45,Log-Likelihood:,-67292.0
No. Observations:,35605,AIC:,134700.0
Df Residuals:,35568,BIC:,135000.0
Df Model:,36,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0020,0.001,-2.461,0.014,-0.004,-0.000
fnlwgt,-1.628e-07,8.15e-08,-1.998,0.046,-3.23e-07,-3.1e-09
capital-gain,1.215e-05,1.11e-06,10.933,0.000,9.97e-06,1.43e-05
capital-loss,0.0001,2.06e-05,6.334,0.000,9.02e-05,0.000
hours-per-week,0.0094,0.001,12.338,0.000,0.008,0.011
income,0.9312,0.023,39.649,0.000,0.885,0.977
workclass_Federal-gov,0.8584,0.063,13.713,0.000,0.736,0.981
workclass_Local-gov,1.0900,0.055,19.936,0.000,0.983,1.197
workclass_Never-worked,-0.4593,0.371,-1.239,0.215,-1.186,0.267

0,1,2,3
Omnibus:,537.065,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,569.036
Skew:,0.29,Prob(JB):,2.7200000000000002e-124
Kurtosis:,3.217,Cond. No.,1.23e+16


In [96]:
# Exclude three occupations ('Armed-Forces', 'Other-service',
# 'Priv-house-serv'): remove their rows, then leave the three dummy columns
# out of the regressors.
drop_occupation = [
    'occupation_Armed-Forces',
    'occupation_Other-service',
    'occupation_Priv-house-serv',
]
df = df[(df[drop_occupation] == 0).all(axis=1)]

X = df.drop(['educational-num', *drop_occupation], axis=1)
y = df[['educational-num']]

# OLS fit; both sides are cast to float before fitting.
multi_model = sm.OLS(endog=y.astype(float), exog=X.astype(float))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,educational-num,R-squared:,0.405
Model:,OLS,Adj. R-squared:,0.404
Method:,Least Squares,F-statistic:,688.0
Date:,"Sun, 12 Dec 2021",Prob (F-statistic):,0.0
Time:,14:32:46,Log-Likelihood:,-67037.0
No. Observations:,35464,AIC:,134100.0
Df Residuals:,35428,BIC:,134500.0
Df Model:,35,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0019,0.001,-2.369,0.018,-0.003,-0.000
fnlwgt,-1.527e-07,8.17e-08,-1.870,0.061,-3.13e-07,7.31e-09
capital-gain,1.212e-05,1.11e-06,10.908,0.000,9.95e-06,1.43e-05
capital-loss,0.0001,2.06e-05,6.365,0.000,9.09e-05,0.000
hours-per-week,0.0094,0.001,12.188,0.000,0.008,0.011
income,0.9299,0.024,39.564,0.000,0.884,0.976
workclass_Federal-gov,0.8649,0.063,13.821,0.000,0.742,0.988
workclass_Local-gov,1.0963,0.055,20.069,0.000,0.989,1.203
workclass_Never-worked,-0.4634,0.371,-1.250,0.211,-1.190,0.263

0,1,2,3
Omnibus:,529.927,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,561.059
Skew:,0.289,Prob(JB):,1.47e-122
Kurtosis:,3.215,Cond. No.,1.23e+16


In [97]:
# Exclude the 'Never-worked' workclass: remove its rows, then leave its dummy
# column out of the regressors together with the occupation dummies listed in
# drop_occupation (defined in an earlier cell).
drop_workclass = ['workclass_Never-worked']
df = df[df['workclass_Never-worked'].eq(0)]

X = df.drop(['educational-num', *drop_occupation, *drop_workclass], axis=1)
y = df[['educational-num']]

# OLS fit; both sides are cast to float before fitting.
multi_model = sm.OLS(endog=y.astype(float), exog=X.astype(float))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,educational-num,R-squared:,0.405
Model:,OLS,Adj. R-squared:,0.404
Method:,Least Squares,F-statistic:,707.7
Date:,"Sun, 12 Dec 2021",Prob (F-statistic):,0.0
Time:,14:32:46,Log-Likelihood:,-67025.0
No. Observations:,35456,AIC:,134100.0
Df Residuals:,35421,BIC:,134400.0
Df Model:,34,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0019,0.001,-2.375,0.018,-0.004,-0.000
fnlwgt,-1.54e-07,8.17e-08,-1.886,0.059,-3.14e-07,6.05e-09
capital-gain,1.212e-05,1.11e-06,10.907,0.000,9.95e-06,1.43e-05
capital-loss,0.0001,2.06e-05,6.365,0.000,9.09e-05,0.000
hours-per-week,0.0093,0.001,12.179,0.000,0.008,0.011
income,0.9300,0.024,39.564,0.000,0.884,0.976
workclass_Federal-gov,0.8472,0.061,13.840,0.000,0.727,0.967
workclass_Local-gov,1.0786,0.053,20.315,0.000,0.975,1.183
workclass_None,0.5676,0.025,22.286,0.000,0.518,0.617

0,1,2,3
Omnibus:,529.643,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,560.715
Skew:,0.289,Prob(JB):,1.75e-122
Kurtosis:,3.214,Cond. No.,1.23e+16


In [98]:
# Same regressors as the previous fit, additionally leaving out fnlwgt.
# drop_occupation and drop_workclass come from earlier cells.
excluded_columns = ['educational-num', 'fnlwgt', *drop_occupation, *drop_workclass]
X = df.drop(excluded_columns, axis=1)
y = df[['educational-num']]

# OLS fit; both sides are cast to float before fitting.
multi_model = sm.OLS(endog=y.astype(float), exog=X.astype(float))
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,educational-num,R-squared:,0.404
Model:,OLS,Adj. R-squared:,0.404
Method:,Least Squares,F-statistic:,729.0
Date:,"Sun, 12 Dec 2021",Prob (F-statistic):,0.0
Time:,14:32:46,Log-Likelihood:,-67027.0
No. Observations:,35456,AIC:,134100.0
Df Residuals:,35422,BIC:,134400.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0018,0.001,-2.286,0.022,-0.003,-0.000
capital-gain,1.212e-05,1.11e-06,10.899,0.000,9.94e-06,1.43e-05
capital-loss,0.0001,2.06e-05,6.369,0.000,9.1e-05,0.000
hours-per-week,0.0094,0.001,12.204,0.000,0.008,0.011
income,0.9288,0.023,39.527,0.000,0.883,0.975
workclass_Federal-gov,0.8396,0.061,13.745,0.000,0.720,0.959
workclass_Local-gov,1.0703,0.053,20.227,0.000,0.967,1.174
workclass_None,0.5611,0.025,22.234,0.000,0.512,0.611
workclass_Private,0.6936,0.044,15.739,0.000,0.607,0.780

0,1,2,3
Omnibus:,529.426,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,560.492
Skew:,0.289,Prob(JB):,1.9500000000000002e-122
Kurtosis:,3.215,Cond. No.,1.02e+16


In [99]:
# Hold out 1% of the rows as a test set, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.01,
    random_state=42,
)

### pycaret

In [24]:
# Put the training features and the target side by side in one frame, which
# is what the pycaret setup call below receives as `data`.
train = pd.concat(objs=[X_train, y_train], axis='columns')
train

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_None,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_Adm-clerical,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_None,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,educational-num
2330,37,0,0,40,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3
42464,44,0,0,40,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,10
10669,44,0,0,48,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7
9194,45,0,0,25,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2
8084,59,0,1579,60,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7333,29,0,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2
43644,33,0,0,35,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7
9906,78,0,0,36,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
10331,20,0,0,10,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,4


In [25]:
# Environment setup: register the training frame and its target column with
# pycaret, using a fixed session id.
clf = setup(
    data=train,
    target='educational-num',
    session_id=123,
    silent=True,
    use_gpu=True,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,educational-num
2,Target Type,Multiclass
3,Label Encoded,"0: 0, 1: 1, 10: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10"
4,Original Data,"(35101, 39)"
5,Missing Values,False
6,Numeric Features,37
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [26]:
# Compare the candidate models, ranked by F1; n_select=1 asks for the single
# best model.
compare_models(n_select=1, sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.4402,0.7395,0.2251,0.4089,0.3929,0.2515,0.2632,19.958
lightgbm,Light Gradient Boosting Machine,0.4381,0.7368,0.2254,0.413,0.3877,0.2469,0.2614,1.855
gbc,Gradient Boosting Classifier,0.4439,0.7428,0.2253,0.4213,0.3875,0.2505,0.2693,28.528
xgboost,Extreme Gradient Boosting,0.4331,0.7337,0.2216,0.3996,0.3846,0.2407,0.2533,17.888
lda,Linear Discriminant Analysis,0.4033,0.7183,0.2376,0.3675,0.3666,0.2175,0.2226,0.155
ada,Ada Boost Classifier,0.4108,0.6209,0.202,0.3791,0.3644,0.2174,0.2268,1.014
rf,Random Forest Classifier,0.3582,0.6741,0.2078,0.3369,0.3448,0.1754,0.1762,5.237
ridge,Ridge Classifier,0.417,0.0,0.1544,0.3345,0.3373,0.2061,0.2217,0.042
et,Extra Trees Classifier,0.3434,0.6402,0.2041,0.327,0.3334,0.1614,0.1619,7.152
knn,K Neighbors Classifier,0.3401,0.6186,0.1609,0.306,0.3155,0.137,0.1393,1.978


<catboost.core.CatBoostClassifier at 0x2bb284d13d0>

In [114]:
# Fit the three boosting classifiers with default settings and report their
# accuracy on the held-out test rows.
xgb = XGBClassifier()
lgb = LGBMClassifier()
cat = CatBoostClassifier(silent = True)

# y_train / y_test are single-column DataFrames (y = df[['educational-num']]).
# Estimators and metrics expect 1-D label arrays, so flatten them once here
# instead of relying on implicit conversion (whose warnings are suppressed by
# the global warnings filter).
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# XGBClassifier: fit, predict, print accuracy
xgb.fit(X_train, y_train_1d)
xgb_pred = xgb.predict(X_test)
print('XGBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test_1d, xgb_pred)))

# LGBMClassifier: fit, predict, print accuracy
lgb.fit(X_train, y_train_1d)
lgb_pred = lgb.predict(X_test)
print('LightGBMBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test_1d, lgb_pred)))

# CatBoostClassifier: fit, predict, print accuracy
cat.fit(X_train, y_train_1d)
cat_pred = cat.predict(X_test)
# np.ravel guards against predictions coming back as a column rather than a
# flat array (presumably the case for CatBoost multiclass -- harmless if not).
print('CatBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test_1d, np.ravel(cat_pred))))

GradientBoostingClassifier 정확도: 0.4648
XGBoostClassifier 정확도: 0.4817
LightGBMBoostClassifier 정확도: 0.4732
CatBoostClassifier 정확도: 0.4676


In [115]:
# Refit the three boosting classifiers and report their micro-averaged F1 on
# the held-out test rows.
# NOTE: for single-label multiclass targets micro-averaged F1 is numerically
# equal to accuracy, so these numbers match an accuracy report on the same
# predictions. Use average='macro' or 'weighted' if a different view is wanted.
xgb = XGBClassifier()
lgb = LGBMClassifier()
cat = CatBoostClassifier(silent = True)

# y_train / y_test are single-column DataFrames (y = df[['educational-num']]).
# Flatten them to the 1-D label arrays that estimators and metrics expect.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# XGBClassifier: fit, predict, print micro-F1
xgb.fit(X_train, y_train_1d)
xgb_pred = xgb.predict(X_test)
print('XGBoostClassifier F1(micro): {0:.4f}'.format(f1_score(y_test_1d, xgb_pred, average='micro')))

# LGBMClassifier: fit, predict, print micro-F1
lgb.fit(X_train, y_train_1d)
lgb_pred = lgb.predict(X_test)
print('LightGBMBoostClassifier F1(micro): {0:.4f}'.format(f1_score(y_test_1d, lgb_pred, average='micro')))

# CatBoostClassifier: fit, predict, print micro-F1
cat.fit(X_train, y_train_1d)
cat_pred = cat.predict(X_test)
# np.ravel guards against predictions coming back as a column rather than a
# flat array (presumably the case for CatBoost multiclass -- harmless if not).
print('CatBoostClassifier F1(micro): {0:.4f}'.format(f1_score(y_test_1d, np.ravel(cat_pred), average='micro')))

XGBoostClassifier 정확도: 0.4817
LightGBMBoostClassifier 정확도: 0.4732
CatBoostClassifier 정확도: 0.4676
