# 1. 데이터 읽기

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['class'] = cancer.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


# 2. 모델 만들기
> voting_classifier(분류기 리스트, voting="보팅기법")으로 선언해주어야 한다.

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], 
                                                    df.iloc[:,-1],test_size=0.2,random_state=10)
lr = LogisticRegression(max_iter=4000)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

vo = VotingClassifier([('lr', lr), ('knn', knn), ('dt', dt)], voting='soft')
vo.fit(X_train, y_train)
y_pred = vo.predict(X_test)

print('정확도 : ', accuracy_score(y_test, y_pred))

정확도 :  0.956140350877193


> 다음과 같은 경고가 나타날 경우 lr의 max_iter의 수치를 올린다.

>Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html

>Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

In [10]:
for model in [lr, knn, dt]:
    print('클래스 : ', model.__class__)
    model_name = model.__class__.__name__
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test, pred)
    print(f'{model_name} 의 정확도 : {acc}')

클래스 :  <class 'sklearn.linear_model._logistic.LogisticRegression'>
LogisticRegression 의 정확도 : 0.956140350877193
클래스 :  <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
KNeighborsClassifier 의 정확도 : 0.9210526315789473
클래스 :  <class 'sklearn.tree._classes.DecisionTreeClassifier'>
DecisionTreeClassifier 의 정확도 : 0.9210526315789473


In [11]:
vo = VotingClassifier([('lr', lr), ('knn', knn), ('dt', dt)], voting='hard')
vo.fit(X_train, y_train)
y_pred = vo.predict(X_test)

print('정확도 : ', accuracy_score(y_test, y_pred))

정확도 :  0.9736842105263158


# 3. 스케일링을 통한 모델 성능 높이기
> 로지스틱 회귀의 경우 데이터를 스케일링하게 되면 성능이 향상됨으로 StandardScaler를 통해 스케일링을 진행해 보자

In [12]:
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], 
                                                    df.iloc[:,-1], test_size=0.2, random_state=10)
lr = LogisticRegression(max_iter=4000)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

vo = VotingClassifier([('lr', lr), ('knn', knn), ('dt', dt)], voting='soft')
vo.fit(X_train, y_train)
y_pred = vo.predict(X_test)

print('정확도 : ', accuracy_score(y_test, y_pred))

정확도 :  0.9736842105263158


In [13]:
scaler = StandardScaler()

scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

vo = VotingClassifier([('lr',lr), ('knn',knn), ('dt', dt)], voting='soft')

vo.fit(X_train_std, y_train)
y_pred = vo.predict(X_test_std)

print('정확도 :', accuracy_score(y_test, y_pred))

정확도 : 0.9912280701754386


# 4. 교차검증

In [15]:
import numpy as np 
from sklearn.model_selection import cross_val_score

for model in [vo, lr, knn, dt]:
    scores = cross_val_score(model, np.concatenate((X_train_std, X_test_std)),
                             np.concatenate((y_train,y_test)), scoring='accuracy', cv=5)
    print('모델 : ',model.__class__.__name__)
    print('전체 정확도')
    print(scores)
    print('평균 정확도 :',np.mean(scores))
    print('=' * 30)

모델 :  VotingClassifier
전체 정확도
[0.98245614 0.96491228 0.95614035 0.94736842 0.96460177]
평균 정확도 : 0.9630957925787922
모델 :  LogisticRegression
전체 정확도
[0.99122807 0.97368421 0.98245614 0.96491228 0.96460177]
평균 정확도 : 0.9753764943331781
모델 :  KNeighborsClassifier
전체 정확도
[0.97368421 0.95614035 0.94736842 0.96491228 0.99115044]
평균 정확도 : 0.9666511411271541
모델 :  DecisionTreeClassifier
전체 정확도
[0.95614035 0.90350877 0.92105263 0.88596491 0.92035398]
평균 정확도 : 0.9174041297935103


# 5. 튜닝
> 하이퍼 파라미터 설정 시 이전 VotingClassifier에서 설정한 쌍따옴표 안의 문자와 언더바(_) 두 개를 이어서 파라미터를 설정할 수 있다.

In [16]:
from sklearn.model_selection import GridSearchCV

def get_best_params(cost_list, n_neighbors_list, n_depth_list, n_split_list):
    params = {
        'lr__C':cost_list,
        'knn__n_neighbors':n_neighbors_list,
        'knn__weights':['uniform', 'distance'],
        'knn__metric':['euclidean', 'manhattan', 'minkowski'],
        'dt__max_depth':n_depth_list,
        'dt__min_samples_split':n_split_list
    }

    grid_cv = GridSearchCV(vo, param_grid=params, scoring='accuracy', cv=5)
    grid_cv.fit(X_train_std, y_train)

    print('Best Score : ', grid_cv.best_score_)
    print('Best Hyper Parameters :', grid_cv.best_params_)

In [17]:
cost_list = [0.001, 0.01, 0.1, 1, 10]
n_neighbors_list = range(1, 100, 10)
n_depth_list = range(1, 21, 10)
n_split_list = range(2, 50, 10)

get_best_params(cost_list, n_neighbors_list, n_depth_list, n_split_list)

Best Score :  0.9736263736263737
Best Hyper Parameters : {'dt__max_depth': 11, 'dt__min_samples_split': 2, 'knn__metric': 'manhattan', 'knn__n_neighbors': 1, 'knn__weights': 'distance', 'lr__C': 10}


In [19]:
# max_depth : 11
# min_samples_split : 2
# metric : manhattan
# n_neighbors : 1
# weights : distance
# C : 10

n_depth_list = range(11, 22, 5)
n_split_list = range(2, 12, 3)
n_neighbors_list = range(1, 10, 3)
cost_list = range(2, 12, 3)

get_best_params(cost_list, n_neighbors_list, n_depth_list, n_split_list)

Best Score :  0.9736263736263737
Best Hyper Parameters : {'dt__max_depth': 11, 'dt__min_samples_split': 2, 'knn__metric': 'manhattan', 'knn__n_neighbors': 1, 'knn__weights': 'distance', 'lr__C': 5}


In [20]:
# max_depth : 11
# min_samples_split : 2
# metric : manhattan
# n_neighbors : 1
# weights : distance
# C : 10


# max_depth : 11
# sample_split : 2
# neighbor : 1
# cost : 5

n_depth_list = range(8, 14, 1)
n_split_list = range(2, 4, 1)
n_neighbors_list = range(1, 3, 1)
cost_list = range(2, 8, 1)

get_best_params(cost_list, n_neighbors_list, n_depth_list, n_split_list)

Best Score :  0.9736263736263737
Best Hyper Parameters : {'dt__max_depth': 9, 'dt__min_samples_split': 3, 'knn__metric': 'manhattan', 'knn__n_neighbors': 1, 'knn__weights': 'distance', 'lr__C': 6}


In [23]:
lr = LogisticRegression(max_iter=4000, C=6)
knn = KNeighborsClassifier(metric='manhattan', weights='distance', n_neighbors=1)
dt = DecisionTreeClassifier(max_depth=9, min_samples_split=3)

vo = VotingClassifier([('lr', lr), ('knn', knn), ('dt', dt)], voting='soft')

vo.fit(X_train_std, y_train)
pred = vo.predict(X_test_std)

print('정확도 : ', accuracy_score(y_test, pred))

정확도 :  0.9736842105263158


In [24]:
for model in [vo, lr, knn, dt]:
    scores = cross_val_score(model, np.concatenate((X_train_std, X_test_std)),
                             np.concatenate((y_train,y_test)), scoring='accuracy', cv=5)
    print('모델 : ',model.__class__.__name__)
    print('전체 정확도')
    print(scores)
    print('평균 정확도 :',np.mean(scores))
    print('=' * 30)

모델 :  VotingClassifier
전체 정확도
[0.99122807 0.97368421 0.96491228 0.95614035 0.95575221]
평균 정확도 : 0.9683434249340165
모델 :  LogisticRegression
전체 정확도
[0.99122807 0.96491228 0.98245614 0.95614035 0.9380531 ]
평균 정확도 : 0.9665579878900792
모델 :  KNeighborsClassifier
전체 정확도
[0.98245614 0.97368421 0.9122807  0.92982456 0.98230088]
평균 정확도 : 0.9561092997981678
모델 :  DecisionTreeClassifier
전체 정확도
[0.93859649 0.9122807  0.92982456 0.92982456 0.90265487]
평균 정확도 : 0.9226362366092221


> 파라미터는 알고리즘마다 다르게 나타나며 각 알고리즘에 맞는 파라미터를 지정하기 위해 3번에서 VotingClassifier에 적용된 값을 사용하여 lr__C 형식으로 만들게 된다.