In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import statsmodels.api as sm

In [70]:
df = pd.read_csv('./data/train.csv')

In [71]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43957 entries, 0 to 43956
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              43957 non-null  int64 
 1   workclass        41459 non-null  object
 2   fnlwgt           43957 non-null  int64 
 3   education        43957 non-null  object
 4   educational-num  43957 non-null  int64 
 5   marital-status   43957 non-null  object
 6   occupation       41451 non-null  object
 7   relationship     43957 non-null  object
 8   race             43957 non-null  object
 9   gender           43957 non-null  object
 10  capital-gain     43957 non-null  int64 
 11  capital-loss     43957 non-null  int64 
 12  hours-per-week   43957 non-null  int64 
 13  native-country   43194 non-null  object
 14  income_>50K      43957 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 5.0+ MB


In [73]:
df.shape

(43957, 15)

### 전처리

In [74]:
new_col = {'income_>50K':'income'}
df = df.rename(columns=new_col)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [75]:
#NULL값 처리

df['workclass'] = df['workclass'].fillna('None')
df['occupation'] = df['occupation'].fillna('None')
df['native-country'] = df['native-country'].fillna('Others')
df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [76]:
df = df[df['native-country'] == 'United-States']
df.drop(columns = ['native-country'], inplace = True)

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39429 entries, 0 to 43956
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              39429 non-null  int64 
 1   workclass        39429 non-null  object
 2   fnlwgt           39429 non-null  int64 
 3   education        39429 non-null  object
 4   educational-num  39429 non-null  int64 
 5   marital-status   39429 non-null  object
 6   occupation       39429 non-null  object
 7   relationship     39429 non-null  object
 8   race             39429 non-null  object
 9   gender           39429 non-null  object
 10  capital-gain     39429 non-null  int64 
 11  capital-loss     39429 non-null  int64 
 12  hours-per-week   39429 non-null  int64 
 13  income           39429 non-null  int64 
dtypes: int64(7), object(7)
memory usage: 4.5+ MB


In [78]:
# educational-num 편집
print(df.shape)
df = df[df['educational-num'] != 1]
print(df.shape)
df.loc[df['educational-num'].isin([2, 3]), 'educational-num'] = 0
df.loc[df['educational-num'].isin([4, 5]), 'educational-num'] = 1
df.loc[df['educational-num'].isin([6, 7, 8]), 'educational-num'] = 2
for i in range(9, 17):
    df.loc[df['educational-num'].isin([i]), 'educational-num'] = i - 6
print(df['educational-num'].value_counts())

(39429, 14)
(39407, 14)
3     13099
4      9079
7      6468
2      3121
8      2123
5      1702
6      1326
1      1201
9       655
10      432
0       201
Name: educational-num, dtype: int64


In [79]:
df.drop(columns = ['education', 'race', 'gender'], inplace = True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,income
0,67,Private,366425,10,Divorced,Exec-managerial,Not-in-family,99999,0,60,1
1,17,Private,244602,2,Never-married,Other-service,Own-child,0,0,15,0
2,31,Private,174201,7,Married-civ-spouse,Exec-managerial,Husband,0,0,40,1
3,58,State-gov,110199,1,Married-civ-spouse,Transport-moving,Husband,0,0,40,0
4,25,State-gov,149248,4,Never-married,Other-service,Not-in-family,0,0,40,0


In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39407 entries, 0 to 43956
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              39407 non-null  int64 
 1   workclass        39407 non-null  object
 2   fnlwgt           39407 non-null  int64 
 3   educational-num  39407 non-null  int64 
 4   marital-status   39407 non-null  object
 5   occupation       39407 non-null  object
 6   relationship     39407 non-null  object
 7   capital-gain     39407 non-null  int64 
 8   capital-loss     39407 non-null  int64 
 9   hours-per-week   39407 non-null  int64 
 10  income           39407 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 3.6+ MB


In [81]:
df = df[df['occupation'].isin(['Armed-Forces', 'Other-service', 'Priv-house-serv']) == False]
df = df[df['workclass'].isin(['Never-worked']) == False]
df = df.drop(columns = ['fnlwgt'])

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35456 entries, 0 to 43956
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              35456 non-null  int64 
 1   workclass        35456 non-null  object
 2   educational-num  35456 non-null  int64 
 3   marital-status   35456 non-null  object
 4   occupation       35456 non-null  object
 5   relationship     35456 non-null  object
 6   capital-gain     35456 non-null  int64 
 7   capital-loss     35456 non-null  int64 
 8   hours-per-week   35456 non-null  int64 
 9   income           35456 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 3.0+ MB


In [83]:
cat_vars = [var for var in df.columns if df[var].dtype == "O"]
cat_vars

['workclass', 'marital-status', 'occupation', 'relationship']

In [84]:
for var in cat_vars:  
    df[var] = df[var].astype('category') 

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35456 entries, 0 to 43956
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   age              35456 non-null  int64   
 1   workclass        35456 non-null  category
 2   educational-num  35456 non-null  int64   
 3   marital-status   35456 non-null  category
 4   occupation       35456 non-null  category
 5   relationship     35456 non-null  category
 6   capital-gain     35456 non-null  int64   
 7   capital-loss     35456 non-null  int64   
 8   hours-per-week   35456 non-null  int64   
 9   income           35456 non-null  int64   
dtypes: category(4), int64(6)
memory usage: 2.0 MB


In [86]:
X = df.drop(columns = ['educational-num'])
y = df[['educational-num']]

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                  y, 
                                                  test_size=0.01, 
                                                  random_state=42,
                                                  stratify = y)

### pycaret

In [24]:
train = pd.concat([X_train, y_train], axis = 1)
train

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_None,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_Adm-clerical,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_None,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,educational-num
2330,37,0,0,40,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3
42464,44,0,0,40,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,10
10669,44,0,0,48,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7
9194,45,0,0,25,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2
8084,59,0,1579,60,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7333,29,0,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2
43644,33,0,0,35,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7
9906,78,0,0,36,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
10331,20,0,0,10,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,4


In [25]:
# 환경 설정
clf = setup(data = train, target = 'educational-num', session_id = 123, silent = True, use_gpu = True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,educational-num
2,Target Type,Multiclass
3,Label Encoded,"0: 0, 1: 1, 10: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10"
4,Original Data,"(35101, 39)"
5,Missing Values,False
6,Numeric Features,37
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [26]:
# 모델들의 정확도 출력
compare_models(sort = 'F1', n_select = 1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.4402,0.7395,0.2251,0.4089,0.3929,0.2515,0.2632,19.958
lightgbm,Light Gradient Boosting Machine,0.4381,0.7368,0.2254,0.413,0.3877,0.2469,0.2614,1.855
gbc,Gradient Boosting Classifier,0.4439,0.7428,0.2253,0.4213,0.3875,0.2505,0.2693,28.528
xgboost,Extreme Gradient Boosting,0.4331,0.7337,0.2216,0.3996,0.3846,0.2407,0.2533,17.888
lda,Linear Discriminant Analysis,0.4033,0.7183,0.2376,0.3675,0.3666,0.2175,0.2226,0.155
ada,Ada Boost Classifier,0.4108,0.6209,0.202,0.3791,0.3644,0.2174,0.2268,1.014
rf,Random Forest Classifier,0.3582,0.6741,0.2078,0.3369,0.3448,0.1754,0.1762,5.237
ridge,Ridge Classifier,0.417,0.0,0.1544,0.3345,0.3373,0.2061,0.2217,0.042
et,Extra Trees Classifier,0.3434,0.6402,0.2041,0.327,0.3334,0.1614,0.1619,7.152
knn,K Neighbors Classifier,0.3401,0.6186,0.1609,0.306,0.3155,0.137,0.1393,1.978


<catboost.core.CatBoostClassifier at 0x2bb284d13d0>

In [88]:
lgb = LGBMClassifier()
cat = CatBoostClassifier(cat_features=cat_vars,
                         task_type="GPU",
                         silent=True
                         )

# # LightGBMBoostClassifier 학습,예측,정확도 출력
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
print('LightGBMBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, lgb_pred)))

# LightGBMBoostClassifier 학습,예측,정확도 출력
cat.fit(X_train, y_train)
cat_pred = cat.predict(X_test)
print('CatBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, cat_pred)))

LightGBMBoostClassifier 정확도: 0.4676
CatBoostClassifier 정확도: 0.4704


In [89]:
lgb = LGBMClassifier()
cat = CatBoostClassifier(cat_features=cat_vars,
                         task_type="GPU",
                         silent=True
                         )

# # LightGBMBoostClassifier 학습,예측,정확도 출력
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
print('LightGBMBoostClassifier 정확도: {0:.4f}'.format(f1_score(y_test, lgb_pred, average='micro')))

# LightGBMBoostClassifier 학습,예측,정확도 출력
cat.fit(X_train, y_train)
cat_pred = cat.predict(X_test)
print('CatBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, cat_pred, average='micro')))

LightGBMBoostClassifier 정확도: 0.4676


TypeError: accuracy_score() got an unexpected keyword argument 'average'