# Wine 데이터 분류

## 1) 데이터 전처리

In [6]:
from sklearn.datasets import load_wine
# Load the wine dataset that ships with scikit-learn; later cells read its
# .data, .target, .feature_names and .target_names attributes.
wine = load_wine()


In [45]:
import pandas as pd

# Tabular view of the dataset: one column per feature, with the class label
# appended as a final 'target' column.
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# Cell output: shape of the label column.
df['target'].shape

(178,)

In [8]:
# Display the class names stored on the dataset object.
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

## 2) 훈련/테스트 데이터셋 분리

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Baseline: fit a tree on the full dataset and score it on that very same data.
# This is training accuracy, so it says nothing about generalization.
dtc = DecisionTreeClassifier(random_state=2021).fit(wine.data, wine.target)
dtc.score(wine.data, wine.target)

1.0

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.2,
    stratify=wine.target,
    random_state=2021,
)

In [29]:
import numpy as np
# Distinct labels in the training split together with how often each occurs.
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([47, 57, 38], dtype=int64))

## 3) GridSearchCV를 통해서
* DecisionTreeClassifier 또는 SVC를 선택해서
* 어떤 파라미터일 때 최선의 모델이 되는지 파악하고
* 그때의 성능을 파악함

In [30]:
import warnings
warnings.filterwarnings('ignore')

In [31]:
from sklearn.tree import DecisionTreeClassifier
# Rebind dtc to a fresh, unfitted tree; it is the base estimator handed to
# the grid search below.
dtc = DecisionTreeClassifier(random_state=2021)
# Show the estimator's hyperparameters and their current values.
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [14]:
# Candidate hyperparameter values for the decision-tree grid search.
params = {
    'max_depth': list(range(2, 7)),          # 2, 3, 4, 5, 6
    'min_samples_split': list(range(2, 5)),  # 2, 3, 4
}

In [15]:
from sklearn.model_selection import GridSearchCV

# Grid search over `params` for the tree, scored by accuracy with cv=5.
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, cv=5, scoring='accuracy')

In [16]:
# Run the search on the training split only; X_test / y_test are not touched here.
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [17]:
# Parameter combination the search selected.
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [18]:
# Keep a handle on the estimator the search selected.
best_clf = grid_dt.best_estimator_

In [19]:
# Accuracy of the selected tree on the held-out test split.
best_clf.score(X_test, y_test)

0.9722222222222222

In [20]:
from sklearn.svm import SVC
# Base SVC for the second grid search; only the seed is set explicitly.
svc = SVC(random_state=2021)
# Show the estimator's hyperparameters (the printed defaults include kernel='rbf').
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2021,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [21]:
# Candidate values of C for the SVC search, stepping by a factor of ten.
# This rebinds `params`, replacing the tree grid defined earlier.
params = {
    'C': [0.01, 0.1, 1, 10, 100],
}

In [22]:
# Same search setup as for the tree, now over the SVC's C values.
# NOTE(review): X_train comes straight from wine.data with no scaling step;
# SVC is generally sensitive to feature scale — consider standardizing the
# features (e.g. in a pipeline) and confirm the effect on the chosen C.
grid_sv = GridSearchCV(estimator=svc, param_grid=params, cv=5, scoring='accuracy')
grid_sv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=2021),
             param_grid={'C': [0.01, 0.1, 1, 10, 100]}, scoring='accuracy')

In [23]:
# C value the search selected from the current grid.
grid_sv.best_params_

{'C': 100}

In [42]:
# Second pass over much larger C values; `params` and `grid_sv` are rebound.
# NOTE(review): the first pass peaked at the top of its grid (C=100) and this
# pass peaks at the bottom of its grid (C=7000), so the values in between
# were never tried — the reported optimum may not be the real one.
params = {'C': [7000, 7500, 8000]}
grid_sv = GridSearchCV(estimator=svc, param_grid=params, cv=5, scoring='accuracy')
grid_sv.fit(X_train, y_train)
grid_sv.best_params_


{'C': 7000}

In [43]:
# Estimator selected by the most recent SVC search (grid_sv was rebound above).
best_svc = grid_sv.best_estimator_
# Accuracy of that SVC on the held-out test split.
best_svc.score(X_test, y_test)

0.9722222222222222