# ADULT-ML-03-ML Modles 실습

- 데이터 처리
    - adult.csv : 원자료
    - adult4ml.csv : 원자료의 범주형변수 재그룹화한 자료
    - adult4ml-clean.csv : adult4ml에서 이상치 591개 제거한 자료

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
np.set_printoptions(edgeitems=30, precision=4, linewidth=120, sign=' ')
plt.rcParams['figure.figsize'] = [5, 4]  # [6.4, 4.8] default size

In [4]:
adultml = pd.read_csv('./adult4ml.csv')
adultml.head()

Unnamed: 0,age,wrkclsgrp,fnlwgt,edugrp,eduyr,maritalgrp,occ,rel,racegrp,sex,capgain,caploss,hr,countrygrp,y
0,25.0,Private,226802.0,1M,7.0,NotMarried,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,US,<=50K
1,38.0,Private,89814.0,2HS,9.0,Married,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,US,<=50K
2,28.0,Gov,336951.0,3C,12.0,Married,Protective-serv,Husband,White,Male,0.0,0.0,40.0,US,>50K
3,44.0,Private,160323.0,3C,10.0,Married,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,US,>50K
4,18.0,,103497.0,3C,10.0,NotMarried,,Own-child,White,Female,0.0,0.0,30.0,US,<=50K


In [5]:
adultml.shape

(48842, 15)

In [6]:
# 원본 파일
DF = adultml.copy()
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         48842 non-null  float64
 1   wrkclsgrp   46043 non-null  object 
 2   fnlwgt      48842 non-null  float64
 3   edugrp      48842 non-null  object 
 4   eduyr       48842 non-null  float64
 5   maritalgrp  48842 non-null  object 
 6   occ         46033 non-null  object 
 7   rel         48842 non-null  object 
 8   racegrp     48842 non-null  object 
 9   sex         48842 non-null  object 
 10  capgain     48842 non-null  float64
 11  caploss     48842 non-null  float64
 12  hr          48842 non-null  float64
 13  countrygrp  47985 non-null  object 
 14  y           48842 non-null  object 
dtypes: float64(6), object(9)
memory usage: 5.6+ MB


In [7]:
# fnlwgt, eddur 제거
DF.drop(['fnlwgt', 'eduyr'], axis=1, inplace=True)

vobj = list(DF.select_dtypes('object'))
vnum = list(DF.select_dtypes(np.number))
vord = ['edugrp']

vobj, vnum, vord

(['wrkclsgrp',
  'edugrp',
  'maritalgrp',
  'occ',
  'rel',
  'racegrp',
  'sex',
  'countrygrp',
  'y'],
 ['age', 'capgain', 'caploss', 'hr'],
 ['edugrp'])

In [8]:
# 문자변수 일괄 categorize
for vname in vobj:
    DF[vname] = DF[vname].astype('category')

DF['edugrp'] = pd.Categorical(DF['edugrp'], ordered=True)
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   age         48842 non-null  float64 
 1   wrkclsgrp   46043 non-null  category
 2   edugrp      48842 non-null  category
 3   maritalgrp  48842 non-null  category
 4   occ         46033 non-null  category
 5   rel         48842 non-null  category
 6   racegrp     48842 non-null  category
 7   sex         48842 non-null  category
 8   capgain     48842 non-null  float64 
 9   caploss     48842 non-null  float64 
 10  hr          48842 non-null  float64 
 11  countrygrp  47985 non-null  category
 12  y           48842 non-null  category
dtypes: category(9), float64(4)
memory usage: 1.9 MB


In [9]:
# 개별 모형에는 지장이 없으나, cross_val_score에서 score 계산 시 오류 발생
DF['y'] = DF['y'].map({'<=50K': 0, '>50K': 1})
DF.head()

Unnamed: 0,age,wrkclsgrp,edugrp,maritalgrp,occ,rel,racegrp,sex,capgain,caploss,hr,countrygrp,y
0,25.0,Private,1M,NotMarried,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,US,0
1,38.0,Private,2HS,Married,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,US,0
2,28.0,Gov,3C,Married,Protective-serv,Husband,White,Male,0.0,0.0,40.0,US,1
3,44.0,Private,3C,Married,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,US,1
4,18.0,,3C,NotMarried,,Own-child,White,Female,0.0,0.0,30.0,US,0


In [10]:
DF['y'].value_counts()

0    37155
1    11687
Name: y, dtype: int64

In [11]:
# 결측 확인
DF.isnull().sum()

age              0
wrkclsgrp     2799
edugrp           0
maritalgrp       0
occ           2809
rel              0
racegrp          0
sex              0
capgain          0
caploss          0
hr               0
countrygrp     857
y                0
dtype: int64

In [12]:
# from pandas import set_option
# from pandas.tools.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, balanced_accuracy_score, cohen_kappa_score
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier

In [13]:
TRX, TSX, TRy, TSy = train_test_split(DF.drop('y', axis=1), DF['y'], test_size=0.25, random_state=1234, stratify=DF['y'])

TRX.shape, TSX.shape

((36631, 12), (12211, 12))

StratifiedKFold : 층화 CV Splitter, 분류분석시 타겟 수준에 대한 층화 CV 분할
n_split : 폴드 수
shuffle : False(기본값)면 random_state 지정할 수 없음
random_state : 랜덤시드

In [14]:
SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# from sklearn import set_config
# set_config(display='diagram')

개별변수 변환용 파이프라인(예시)

In [16]:
# 수치형 변수 공통
Pnum = Pipeline([('impmed', SimpleImputer(strategy='median')),
                 ('normalize', StandardScaler())])

# 범주형 변수 공통
# LDA, NB는 sparse 안됨. drop='First'면 R
Pcat = Pipeline([('impmod', SimpleImputer(strategy='most_frequent'),
                  ('dummy', OneHotEncoder(handle_unknown='ignore', sparse=False)))])

In [17]:
PP = ColumnTransformer([('num', Pnum, make_column_selector(dtype_include=np.number)),
                        ('cat', Pcat, make_column_selector(dtype_include='category'))], remainder=Pnum)

PP.fit(TRX)

ValueError: too many values to unpack (expected 2)