In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# xgboost 패키지 로딩
import xgboost as xgb
from xgboost import plot_importance

import warnings
warnings.filterwarnings("ignore")

### Native XGBoost 적용 – Wisconsin Breast Cancer 데이터 셋
데이터 셋 로딩하기

In [2]:
# Load the breast cancer dataset and pull out the feature matrix and the labels.
dataset = load_breast_cancer()
features, labels = dataset.data, dataset.target

# Build a DataFrame with one column per feature and append the labels
# as the last column, named "target".
cancer_df = pd.DataFrame(features, columns=dataset.feature_names).assign(target=labels)
cancer_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [3]:
# Print the class names of the dataset, then the number of rows per value
# of the "target" column.
print(dataset.target_names)
print(cancer_df["target"].value_counts())

['malignant' 'benign']
target
1    357
0    212
Name: count, dtype: int64


In [4]:
# Separate cancer_df into a feature DataFrame and a label Series.
# The label is the last column, so the features are every column except the last one.
X_features, y_label = cancer_df.iloc[:, :-1], cancer_df.iloc[:, -1]

# Hold out 20% of the full data as the test set; the remaining 80% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_label,
    test_size=0.2,
    random_state=156,
)

# Split the training set once more: 90% for fitting, 10% for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=156,
)

print(X_train.shape, X_test.shape)
print(X_tr.shape, X_val.shape)

(455, 30) (114, 30)
(409, 30) (46, 30)


### 학습과 예측 데이터 세트를 DMatrix로 변환
* DMatrix는 넘파이 array, DataFrame에서도 변환 가능

In [5]:
# If an older XGBoost release cannot build a DMatrix from a DataFrame,
# convert to NumPy first (e.g. X_tr.values).
# Build the training, validation and test DMatrix objects, in that order.
dtr, dval, dtest = (
    xgb.DMatrix(data=X_part, label=y_part)
    for X_part, y_part in ((X_tr, y_tr), (X_val, y_val), (X_test, y_test))
)

### 하이퍼 파라미터 설정

In [6]:
# Booster hyperparameters handed to xgb.train().
params = dict(
    max_depth=3,                  # maximum depth of each tree
    eta=0.05,                     # learning rate
    objective="binary:logistic",  # binary classification objective
    eval_metric="logloss",        # metric reported for each evaluation set
)
# Maximum number of boosting rounds.
num_rounds = 400

### 주어진 하이퍼 파라미터와 early stopping 파라미터를 train( ) 함수의 파라미터로 전달하고 학습

In [7]:
# Evaluation sets: the training data is reported as "train" and the
# validation data as "eval". Passing only [(dval, "eval")] also works.
eval_list = [(dtr, "train"), (dval, "eval")]

# Train with the hyperparameters and the early-stopping setting.
# Per the XGBoost documentation, when several evaluation sets are given the
# last one (here dval / "eval") is the one monitored for early stopping.
xgb_model = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=num_rounds,
    evals=eval_list,
    early_stopping_rounds=50,
)

[0]	train-logloss:0.62473	eval-logloss:0.63088
[1]	train-logloss:0.58662	eval-logloss:0.60461
[2]	train-logloss:0.55210	eval-logloss:0.58205
[3]	train-logloss:0.52068	eval-logloss:0.56165
[4]	train-logloss:0.49173	eval-logloss:0.54101
[5]	train-logloss:0.46517	eval-logloss:0.52206
[6]	train-logloss:0.44009	eval-logloss:0.50271
[7]	train-logloss:0.41646	eval-logloss:0.48606
[8]	train-logloss:0.39505	eval-logloss:0.46961
[9]	train-logloss:0.37523	eval-logloss:0.45485
[10]	train-logloss:0.35682	eval-logloss:0.44120
[11]	train-logloss:0.33964	eval-logloss:0.43123
[12]	train-logloss:0.32279	eval-logloss:0.41962
[13]	train-logloss:0.30783	eval-logloss:0.40848
[14]	train-logloss:0.29308	eval-logloss:0.39857
[15]	train-logloss:0.27928	eval-logloss:0.38945
[16]	train-logloss:0.26681	eval-logloss:0.38171
[17]	train-logloss:0.25498	eval-logloss:0.37381
[18]	train-logloss:0.24352	eval-logloss:0.36656
[19]	train-logloss:0.23307	eval-logloss:0.36014
[20]	train-logloss:0.22290	eval-logloss:0.35395
[2

### predict()를 통해 예측 확률값을 반환하고 예측 값으로 변환

In [8]:
# predict() returns one predicted probability per test row, not a class label.
# NOTE(review): whether predict() uses only the trees up to the best
# early-stopped iteration depends on the XGBoost version - confirm, and pass
# iteration_range explicitly if it matters.
pred_probs = xgb_model.predict(dtest)
print("predict( ) 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨")
print(np.round(pred_probs[:10], 3))

# Threshold at 0.5: a probability above 0.5 becomes class 1, anything else class 0.
# The result is stored in preds as a plain Python list.
preds = np.where(pred_probs > 0.5, 1, 0).tolist()
print("예측값 10개만 표시 : ",preds[:10])

predict( ) 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨
[0.938 0.004 0.776 0.058 0.975 1.    0.999 0.999 0.998 0.   ]
예측값 10개만 표시 :  [1, 0, 1, 0, 1, 1, 1, 1, 1, 0]


In [9]:
# Display the full array of predicted probabilities returned by predict().
pred_probs

array([9.37527478e-01, 3.71755939e-03, 7.76189387e-01, 5.79332076e-02,
       9.75099325e-01, 9.99556959e-01, 9.99142289e-01, 9.99223113e-01,
       9.97575939e-01, 4.94481821e-04, 9.95804323e-04, 6.93196373e-04,
       9.98852372e-01, 9.99322534e-01, 9.98479545e-01, 9.95870888e-01,
       9.94609833e-01, 9.99404788e-01, 9.98329818e-01, 9.98935163e-01,
       1.40492129e-03, 1.76025674e-01, 6.11953263e-04, 9.99618411e-01,
       8.04344541e-04, 8.17749083e-01, 5.60258515e-03, 9.79702570e-04,
       9.98273134e-01, 6.34756461e-02, 9.76984859e-01, 7.95303902e-04,
       9.82709050e-01, 8.89836013e-01, 7.72360852e-03, 4.90200007e-04,
       9.96063530e-01, 9.98581767e-01, 1.37578353e-01, 9.98345256e-01,
       3.41511995e-01, 9.97788668e-01, 9.97865021e-01, 9.97809947e-01,
       9.97716665e-01, 9.41117465e-01, 3.60030867e-02, 9.97942388e-01,
       9.96318698e-01, 9.99110639e-01, 9.98090565e-01, 1.26643653e-03,
       9.99124587e-01, 9.99040544e-01, 9.97892439e-01, 9.97332811e-01,
      

### get_clf_eval( )을 통해 예측 평가

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required in practice even though it
            defaults to None, because it is passed straight to the
            scikit-learn metric functions.
        pred_proba: Predicted scores/probabilities passed to roc_auc_score.
            When None, the ROC-AUC is skipped and only the label-based
            metrics are printed.

    Returns:
        None. The results are written to standard output.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores; compute it only when they were supplied so the
    # default pred_proba=None no longer crashes inside roc_auc_score.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print("오차 행렬")
    print(confusion)

    summary = "정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}".format(accuracy, precision, recall, f1)
    if roc_auc is not None:
        # Same text as the original single format string when AUC is available.
        summary += ", AUC: {0:.4f}".format(roc_auc)
    print(summary)

In [11]:
# Evaluate the thresholded labels and the raw probabilities against the test labels.
get_clf_eval(y_test, pred=preds, pred_proba=pred_probs)

오차 행렬
[[35  2]
 [ 2 75]]
정확도: 0.9649, 정밀도: 0.9740, 재현율: 0.9740, F1: 0.9740, AUC: 0.9965


### Feature Importance 시각화

In [12]:
import matplotlib.pyplot as plt
%matplotlib inline

# Create a 10 x 12 figure and draw the trained model's feature importances on its axes.
# No importance_type is passed, so plot_importance uses its default
# ("weight" per the XGBoost docs - confirm for the installed version).
fig, ax = plt.subplots(figsize = (10, 12))
plot_importance(xgb_model, ax = ax)

<Axes: title={'center': 'Feature importance'}, xlabel='Importance score', ylabel='Features'>