In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

df_cancer = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df_cancer.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [2]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=10)
print(X_train.shape, X_test.shape)

(455, 30) (114, 30)


## 패키지 설치
```
pip install xgboost
```

In [3]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(X_train, y_train)

pred = xgb.predict(X_test)
accuracy_score(y_test, pred)

0.9649122807017544

In [4]:
xgb = XGBClassifier(n_estimators=400, leaning_rate=0.1, max_depth=3)
xgb.set_params(early_stopping_rounds=100, eval_metric='logloss')
xgb.fit(X_train, y_train, verbose=True, eval_set=[(X_test, y_test)])

pred = xgb.predict(X_test)

[0]	validation_0-logloss:0.46416
[1]	validation_0-logloss:0.34609
[2]	validation_0-logloss:0.27056
[3]	validation_0-logloss:0.21628
[4]	validation_0-logloss:0.17872
[5]	validation_0-logloss:0.14741
[6]	validation_0-logloss:0.13184
[7]	validation_0-logloss:0.11887
[8]	validation_0-logloss:0.10900
[9]	validation_0-logloss:0.10706
[10]	validation_0-logloss:0.10236
[11]	validation_0-logloss:0.09681
[12]	validation_0-logloss:0.09800
[13]	validation_0-logloss:0.09729
[14]	validation_0-logloss:0.09605
[15]	validation_0-logloss:0.09292
[16]	validation_0-logloss:0.09267
[17]	validation_0-logloss:0.09343
[18]	validation_0-logloss:0.09340
[19]	validation_0-logloss:0.09374
[20]	validation_0-logloss:0.09459
[21]	validation_0-logloss:0.09233
[22]	validation_0-logloss:0.09244
[23]	validation_0-logloss:0.09351
[24]	validation_0-logloss:0.09213
[25]	validation_0-logloss:0.09132
[26]	validation_0-logloss:0.09373
[27]	validation_0-logloss:0.09218
[28]	validation_0-logloss:0.08975
[29]	validation_0-loglos

> n_estimators는 생성할 트리의 개수로 400정도를 지정하자. 이 값은 높으면 높을 수록 성능이 향상되지만 그만큼 시간이 오래 걸린다.

> learning_rate는 GBM이 학습을 진행할 때마다 적용하는 학습률로 오류 값을 보정해 가는 데 적용되는 계수로 0.1을 사용했으며 작을 수록 오류를 많이 찾을 수 있지만 시간이 오래 걸린다.

> Max_depth는 트리의 최대 깊이로 3을 설정했으며 보통 3~10정도를 설정한다.

> early_stopping_rounds은 n_estimators가 400임으로 400번의 부스팅을 진행하는 동안 100번의 학습오류가 감소하지 않으면 더 이상 부스팅을 진행하지 않고 종료한다.

> eval_set은 evaluation 세트, 즉 검증 세트를 지정하는 것이다.  fit( ) 수행 시 반복적으로 예측 오류값을 줄일 수 있도록 학습이 진행되는데 이때 학습은 학습 데이터로 하되, 예측 오류값 평가는 eval_set로 지정된 검증 세트로 평가하는 방식이다. 학습 데이터로만 예측 오류값을 줄이게 되면 오버 피팅이 될 가능성이 높아 별도의 검증 세트를 지정하여 수행해야 한다.

> eval_metric은 검증 함수를 지정할 수 있으며 예측값 가중치 적용을 위해 logloss를 이용했다.

> 48번까지 수치가 줄다 다시 늘어 난 것을 볼 수 있으며 100회가 지나도 수치는 줄지 않아 멈추게 된다.

> 튜닝

In [5]:
from sklearn.model_selection import GridSearchCV
import numpy as np

param = {
    'max_depth':[3,5,7],
    'subsample':[0.6, 0.8, 1.0]
}

grid_cv = GridSearchCV(xgb, param_grid=param, scoring='accuracy', 
                       cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train, eval_set=[(X_test, y_test)])
print('최적 파라미터 : ', grid_cv.best_params_)
print('최고 예측 정확도 : ', grid_cv.best_score_)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[0]	validation_0-logloss:0.45978
[1]	validation_0-logloss:0.33925
[2]	validation_0-logloss:0.26945
[3]	validation_0-logloss:0.22036
[4]	validation_0-logloss:0.18764
[5]	validation_0-logloss:0.16258
[6]	validation_0-logloss:0.14326
[7]	validation_0-logloss:0.13021
[8]	validation_0-logloss:0.11730
[9]	validation_0-logloss:0.10958
[10]	validation_0-logloss:0.10174
[11]	validation_0-logloss:0.09870
[12]	validation_0-logloss:0.09278
[13]	validation_0-logloss:0.09215
[14]	validation_0-logloss:0.09547
[15]	validation_0-logloss:0.09638
[16]	validation_0-logloss:0.09335
[17]	validation_0-logloss:0.09046
[18]	validation_0-logloss:0.08567
[19]	validation_0-logloss:0.08914
[20]	validation_0-logloss:0.09162
[21]	validation_0-logloss:0.09071
[22]	validation_0-logloss:0.09169
[23]	validation_0-logloss:0.08993
[24]	validation_0-logloss:0.08705
[25]	validation_0-logloss:0.08675
[26]	validation_0-logloss:0.08629
[27]	validation_0-logloss:0.0863

In [6]:
param = {
    'max_depth':[3, 4],
    'subsample':[0.7, 0.8, 0.9]
}

grid_cv = GridSearchCV(xgb, param_grid=param, scoring='accuracy', 
                       cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train, eval_set=[(X_test, y_test)])
print('최적 파라미터 : ', grid_cv.best_params_)
print('최고 예측 정확도 : ', grid_cv.best_score_)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[0]	validation_0-logloss:0.44094
[1]	validation_0-logloss:0.33097
[2]	validation_0-logloss:0.25455
[3]	validation_0-logloss:0.21041
[4]	validation_0-logloss:0.17940
[5]	validation_0-logloss:0.15345
[6]	validation_0-logloss:0.13133
[7]	validation_0-logloss:0.11894
[8]	validation_0-logloss:0.10318
[9]	validation_0-logloss:0.09765
[10]	validation_0-logloss:0.09185
[11]	validation_0-logloss:0.08940
[12]	validation_0-logloss:0.08379
[13]	validation_0-logloss:0.08462
[14]	validation_0-logloss:0.08552
[15]	validation_0-logloss:0.09008
[16]	validation_0-logloss:0.08566
[17]	validation_0-logloss:0.08532
[18]	validation_0-logloss:0.08671
[19]	validation_0-logloss:0.08853
[20]	validation_0-logloss:0.09086
[21]	validation_0-logloss:0.09293
[22]	validation_0-logloss:0.09355
[23]	validation_0-logloss:0.09302
[24]	validation_0-logloss:0.09027
[25]	validation_0-logloss:0.09287
[26]	validation_0-logloss:0.09597
[27]	validation_0-logloss:0.0948