# Wine 데이터 분류

#### 1) 데이터 전처리

In [None]:
# Load the scikit-learn Wine dataset; the cells below read its .data, .target,
# .feature_names and .target_names attributes.
from sklearn.datasets import load_wine
wine = load_wine()

In [None]:
import pandas as pd

# Build a table of the features and append the class label as a `target` column.
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [None]:
# (rows, columns): the feature columns plus the added `target` column.
df.shape

(178, 14)

In [None]:
df.target.value_counts() # Check the class distribution (the Wine classes are not evenly balanced)

1    71
0    59
2    48
Name: target, dtype: int64

In [None]:
wine.target_names # Names of the target classes

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

#### 2) 훈련/테스트 데이터셋 분리

In [None]:
from sklearn.model_selection import train_test_split

# First attempt: split with the default test size and no stratification.
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, random_state=2021)
# Shapes of the four resulting arrays, in the order they were returned.
tuple(arr.shape for arr in (X_train, X_test, y_train, y_test))

((133, 13), (45, 13), (133,), (45,))

In [None]:
import numpy as np
# Distinct labels in the training split and how many samples each one has.
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([40, 54, 39]))

In [None]:
# To sample the labels proportionally, the stratify option must be set.
# This replaces the earlier split with a stratified one that holds out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.2,
    stratify=wine.target,
    random_state=2021,
)
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([47, 57, 38]))

#### 3) GridSearchCV를 통해서
- DecisionTreeClassifier 또는 SVC를 선택해서
- 어떤 파라미터일 때 최선의 모델이 되는지 파악하고
- 그때의 성능을 평가함

In [None]:
# The classifier and its hyperparameters
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [None]:
# Search grid for the decision tree: depths 2 through 6, split thresholds 2 through 4.
params = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': [2, 3, 4],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over `params`, scored by accuracy.
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, cv=5, scoring='accuracy')

In [None]:
# Run the grid search using the training split only.
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [None]:
# Best parameter combination found by the search
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [None]:
# Classifier trained with the best parameters
best_clf = grid_dt.best_estimator_

In [None]:
# Predict and evaluate on the test split with the best-parameter classifier
best_clf.score(X_test, y_test)

0.9722222222222222

In [None]:
from sklearn.svm import SVC
svc = SVC(random_state=2021)
# C is the hyperparameter tuned below (what C means is covered later).
# Unlike the integer-valued parameters, its default is reported as a float.
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2021,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [None]:
# Coarse grid for C, one value per decade from 0.01 to 100.
params = {
    'C': [0.01, 0.1, 1, 10, 100],
}

In [None]:
# First pass of the SVC search: 5-fold CV over the coarse C grid, scored by accuracy.
# NOTE(review): the features are passed to the RBF-kernel SVC unscaled, although the
# columns differ by orders of magnitude (see the table printed earlier). Consider
# putting a StandardScaler in front of the SVC in a Pipeline; confirm against the
# scikit-learn SVM guidance.
grid_sv = GridSearchCV(estimator=svc, param_grid=params, cv=5, scoring='accuracy')
grid_sv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=2021),
             param_grid={'C': [0.01, 0.1, 1, 10, 100]}, scoring='accuracy')

In [None]:
# Best C from the coarse grid.
grid_sv.best_params_

{'C': 100}

In [None]:
# The largest C tried so far (100) won, so extend the search to much larger values.
params = {'C': [1000, 2000, 3000, 4000, 5000]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 5000}

In [None]:
# The upper edge (5000) won again; search between 5000 and 10000.
params = {'C': [5000, 7000, 9000, 10000]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 9000}

In [None]:
# Refine around C=9000 in steps of 500.
params = {'C': [8000, 8500, 9000, 9500, 10000]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 9000}

In [None]:
# Refine further in steps of 100, from 8900 to 9300.
params = {'C': [8900, 9000, 9100, 9200, 9300]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 8900}

In [None]:
# The lower edge (8900) won, so look just below it.
params = {'C': [8700, 8800, 8900]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 8700}

In [None]:
# The lower edge (8700) won again; step down to the 8300-8600 range.
params = {'C': [8300, 8400, 8500, 8600]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 8600}

In [None]:
# Compare 8600 directly against its upper neighbours 8700 and 8800.
params = {'C': [8600, 8700, 8800]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 8600}

In [None]:
# Last refinement around C=8600, in steps of 50.
params = {'C': [8550, 8600, 8650]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 8600}

In [None]:
# Take the best estimator from the most recent search and measure its accuracy
# on the held-out test split.
best_svc = grid_sv.best_estimator_
best_svc.score(X_test, y_test)

1.0

In [None]:
# Separate search in a much smaller range: around C=82, in steps of 0.1.
params = {'C': [81.7,81.8,81.9,82,82.1]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 82.1}

In [None]:
# Shift the window up so the previous winner (82.1) sits in the middle.
params = {'C': [81.9,82,82.1,82.2,82.3]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 82.1}

In [None]:
# Finer spacing around 82.1.
params = {'C': [82.06, 82.08, 82.1, 82.12, 82.13]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 82.06}

In [None]:
# The lower edge (82.06) won, so search just below it, from 82.01 to 82.06.
params = {'C': [82.01, 82.03, 82.04, 82.05, 82.06]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 82.01}

In [None]:
# NOTE(review): this grid (82.01 to 82.06) is identical to the one in the
# preceding cell, and no output is recorded for this run.
params = {'C': [82.01, 82.03, 82.04, 82.05, 82.06]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

In [None]:
# Wider, unevenly spaced grid from 81.90 to 82.15.
params = {'C': [81.90, 81.95, 82, 82.05, 82.15]}
grid_sv = GridSearchCV(
    estimator=svc, param_grid=params, cv=5, scoring='accuracy'
).fit(X_train, y_train)
grid_sv.best_params_

{'C': 81.95}

In [None]:
# Show the best C of the most recent search again.
grid_sv.best_params_

{'C': 81.95}

In [None]:
# Test-split accuracy of the best estimator from the most recent (C around 82) search.
# Compare with the score recorded for the C=8600 model earlier in this notebook.
best_svc = grid_sv.best_estimator_
best_svc.score(X_test, y_test)

0.7222222222222222

In [None]:
# NOTE(review): the recorded output for this cell shows C=82.1, while best_params_
# above reports C=81.95. The output looks stale, presumably from running cells out
# of order; re-run the notebook top to bottom to confirm.
best_svc

SVC(C=82.1, random_state=2021)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Create the model (instantiate the estimator object)
dtc = DecisionTreeClassifier(random_state=2021)

In [None]:
# Hyperparameters
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [None]:
# Fit (train) the model
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

In [None]:
# Predict labels for the test split with the fitted decision tree.
pred_dt = dtc.predict(X_test)

In [None]:
# Display the predicted labels.
pred_dt

array([1, 2, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 1, 0, 1,
       2, 1, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2])

In [None]:
# Side-by-side table of the true labels (y) and the decision-tree predictions (DT).
res = pd.DataFrame({'y':y_test, 'DT':pred_dt})
res.head()

Unnamed: 0,y,DT
0,1,1
1,2,2
2,1,1
3,0,0
4,1,1


In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the decision-tree predictions against the true test labels.
acc = accuracy_score(y_test, pred_dt)
# Print the accuracy to four decimal places.
print(f'정확도(DT): {acc:.4f}')

정확도(DT): 0.9722


In [None]:
# Do steps 4 and 5 (the prediction and accuracy cells above) in a single call
dtc.score(X_test, y_test)

0.9722222222222222