<a href="https://colab.research.google.com/github/Eleemon98/Colaboratory/blob/main/0315_%EB%B6%84%EB%A5%98%EB%AA%A8%EB%8D%B82.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

cancer = load_breast_cancer()

data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [5]:
(X_train, X_test, y_train, y_test) = train_test_split(cancer.data, cancer.target,
                                                      test_size=0.4,
                                                      random_state = 2)

In [6]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(341, 30) (228, 30)
(341,) (228,)


In [9]:
# 개별 모델 생성
dtm = DecisionTreeClassifier(max_depth = 5)
knn = KNeighborsClassifier(n_neighbors=5)
lr = LogisticRegression(max_iter = 1000) # max_iter는 로지스틱 회귀가 반복하는 최대 숫자를 정해주는 옵션

# 앙상블 모델로 합치기
# estimators는 모델이름과 실제 모델값을 튜플 형식으로 넣어줌
voting_clf = VotingClassifier(estimators=[('DTM', dtm), ('KNN', knn), ('Logistic', lr)], voting = 'soft')

In [10]:
# 모델 학습
voting_clf.fit(X_train, y_train)

# 모델 결과 예측
pred = voting_clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
print('accracy :', round(accuracy_score(y_test, pred), 3))
print('F1 Score :', round(f1_score(y_test, pred), 3))

accracy : 0.934
F1 Score : 0.947


In [16]:
# 개별 모델의 학습/평가지표
for classifier in [dtm, knn, lr]:
  classifier.fit(X_train, y_train)
  pred = classifier.predict(X_test)
  class_name = classifier.__class__.__name__
  print(f'{class_name} accuray :', accuracy_score(y_test, pred))
  print(f'{class_name} f1 Scoure :', f1_score(y_test, pred))
  print('-----------------------------------------------')

DecisionTreeClassifier accuray : 0.8947368421052632
DecisionTreeClassifier f1 Scoure : 0.9117647058823529
-----------------------------------------------
KNeighborsClassifier accuray : 0.9122807017543859
KNeighborsClassifier f1 Scoure : 0.93006993006993
-----------------------------------------------
LogisticRegression accuray : 0.9385964912280702
LogisticRegression f1 Scoure : 0.9510489510489512
-----------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
pred = gbc.predict(X_test)
print('acc :', accuracy_score(y_test, pred))


acc : 0.9210526315789473


In [18]:
# gridsearchCV는 하이퍼 파라미터들을 값을 지정해주면 알아서 조절해줌
# 모델도 여러가지를 넣을 수 있음
params = {
    'min_samples_split': [5,7,9],
    'min_samples_leaf': [3,5,7],
    'max_depth': [5,7,11]}

dtm = DecisionTreeClassifier()
# 모델하고 param_grid는 하이퍼 파라미터 값하고
# cv는 몇 번 돌릴 건지(각각 2번씩 돌아감),
# verbose는 출력된 결과물을 간략하게 볼 건지 상세하게 볼 건지 정하는 것(1은 간략하게, 2는 상세하게)
grid_cv = GridSearchCV(dtm, param_grid=params, cv=2, verbose = 1)
grid_cv.fit(X_train, y_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


In [19]:
# 이렇게 하면 제일 좋은 파라미터 값을 알려줌
print(grid_cv.best_params_)

{'max_depth': 11, 'min_samples_leaf': 3, 'min_samples_split': 5}


In [20]:
# 가장 좋은 모델이 뭔지 알고 싶을 때 사용
print(grid_cv.best_estimator_)

DecisionTreeClassifier(max_depth=11, min_samples_leaf=3, min_samples_split=5)


In [21]:
# 가장 좋은 점수는 얼마인지 알 때
print(grid_cv.best_score_)

0.9559855521155831


In [22]:
print(grid_cv.best_index_)

18


In [23]:
# 결과 보고 싶을 때 사용
df = pd.DataFrame(grid_cv.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002793,0.000438,0.001055,3.85046e-05,5,3,5,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_s...",0.94152,0.947059,0.94429,0.002769,4
1,0.002716,0.000309,0.001015,2.82526e-05,5,3,7,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_s...",0.959064,0.947059,0.953062,0.006003,2
2,0.002657,0.000243,0.001078,1.382828e-05,5,3,9,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_s...",0.912281,0.947059,0.92967,0.017389,8
3,0.002641,0.000329,0.000941,9.298325e-05,5,5,5,"{'max_depth': 5, 'min_samples_leaf': 5, 'min_s...",0.906433,0.947059,0.926746,0.020313,10
4,0.002517,0.000196,0.000945,5.161762e-05,5,5,7,"{'max_depth': 5, 'min_samples_leaf': 5, 'min_s...",0.906433,0.947059,0.926746,0.020313,10
5,0.002618,0.000255,0.001006,2.622604e-05,5,5,9,"{'max_depth': 5, 'min_samples_leaf': 5, 'min_s...",0.906433,0.947059,0.926746,0.020313,10
6,0.002764,0.000275,0.001106,1.66893e-05,5,7,5,"{'max_depth': 5, 'min_samples_leaf': 7, 'min_s...",0.906433,0.947059,0.926746,0.020313,10
7,0.002679,0.000353,0.002212,0.00112915,5,7,7,"{'max_depth': 5, 'min_samples_leaf': 7, 'min_s...",0.906433,0.947059,0.926746,0.020313,10
8,0.002755,0.000369,0.001093,2.861023e-05,5,7,9,"{'max_depth': 5, 'min_samples_leaf': 7, 'min_s...",0.906433,0.947059,0.926746,0.020313,10
9,0.002804,0.000341,0.001066,4.768372e-05,7,3,5,"{'max_depth': 7, 'min_samples_leaf': 3, 'min_s...",0.929825,0.947059,0.938442,0.008617,6


In [24]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(accuracy_score(y_test, pred))

0.9385964912280702


In [25]:
params = {
    'n_estimators': [100,200],
    'min_samples_split': [5,7,9],
    'min_samples_leaf': [3,5,7],
    'max_depth': [5,7,11]}
# n_estimators는 의사결정나무를 몇 개 사용할 건지 정하는 것
rfc = RandomForestClassifier()
grid_cv = GridSearchCV(rfc, param_grid=params, cv=2, verbose = 2)
grid_cv.fit(X_train, y_train)

Fitting 2 folds for each of 54 candidates, totalling 108 fits
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=7, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=7, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=7, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=7, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=9, n_estimators=100; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf

In [28]:
from sklearn.svm import SVC
# C(cost) : 오차를 어느 정도 허용할 것인지에 대한 파라미터
# kernel : 어떤 커널함수(초평면을 생성하는 함수)를 사용할 것인가? : 'linear', 'sigmoid', 'rbf', 'poly(다항함수)'
# degree : 어느 차수까지 다항차수로 분류할지 결정(kernel 함수가 poly)
# gamma : 곡률경계를 어떻게 할 것인가?
# coef0 : 계수값(상수값) 함수가 어느 위치에서 시작할지 결정하는 것 'sigmoid', 'poly'일 때 사용

svm = SVC(C=0.5, gamma=0.5)
svm.fit(X_train, y_train)

pred = svm.predict(X_test)
print(accuracy_score(y_test, pred))

0.6140350877192983
