In [None]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

import pandas as pd
import numpy as np

In [None]:
# Load the wine dataset and wrap its feature matrix in a DataFrame whose
# columns are labelled with the dataset's own feature names.
data = datasets.load_wine()
df = pd.DataFrame(data=data["data"], columns=data["feature_names"])
df.head()

In [None]:
# Fetch the dataset again, this time unpacked directly into features and target.
X, y = datasets.load_wine(return_X_y=True)
(X.shape, y.shape)

### Dividindo o dataset em 60% de treino e 40% de teste.

In [None]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Dimensions of the training features and the training labels.
(X_train.shape, y_train.shape)

In [None]:
# Dimensions of the held-out features and the held-out labels.
(X_test.shape, y_test.shape)

### Utilizando SVC para treinar o modelo supervisionado. Em seguida, realiza-se um teste com o conjunto de teste estipulado.

In [None]:
# Train a linear-kernel support vector classifier on the training split,
# then report its score on the held-out split.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

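### Acontece que, ao se realizar esse tipo de teste, o modelo pode ter sofrido overfitting durante o seu treinamento, deixando "vazar alguma informação" dos dados de teste durante seu treinamento (por exemplo, ao se ajustar hiperparâmetros como `C` até obter a melhor pontuação no conjunto de teste). Sendo assim, o modelo estaria idealmente preparado apenas para o conjunto de teste, e a pontuação obtida nele deixaria de indicar a capacidade de generalização.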

### Uma alternativa é usar a cross-validation (validação cruzada), em que iremos realizar rodadas de validação nas quais os folds de validação variam de acordo com a rodada. Ou seja, iremos realizar a validação juntamente com os conjuntos de treinamento e, ao final, iremos avaliar com um conjunto de teste.

In [None]:
from sklearn.model_selection import cross_val_score

# Start from a fresh, unfitted linear SVC and score it with 5-fold
# cross-validation over the full X, y arrays.
clf = svm.SVC(C=1, kernel='linear', random_state=42)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
scores

In [None]:
# Summarise the fold scores as mean and standard deviation.
print(
    "%0.2f previsão com um desvio padrão de %0.2f"
    % (np.mean(scores), np.std(scores))
)

### - cross_validate nos permite especificar as métricas para avaliação
### - retorna um dicionário contendo os tempos de ajuste (fit_time), os tempos de pontuação (score_time), além das pontuações de teste (test_score)

In [None]:
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate

# Request two metrics per fold instead of a single score, then list the
# keys of the returned results in sorted order.
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(clf, X, y, cv=5, scoring=scoring)
sorted(scores)

In [None]:
# Macro-recall entries from the cross_validate results.
scores["test_recall_macro"]

In [None]:
# Tabulate the cross_validate results, one column per returned key.
# Note: this rebinds `df`, which previously held the wine feature table.
df = pd.DataFrame(scores, columns=list(scores))
df.head()

### Validando com Support Vector Classification

In [None]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score

# Custom scorers keyed by short names. Dict keys must be unique: a repeated
# 'rec_macro' entry would silently overwrite the earlier one and leave only
# three metrics, so the fourth slot holds an accuracy scorer instead (which
# also puts the imported accuracy_score to use).
scoring = {'prec_macro': make_scorer(precision_score, average='macro'),
           'rec_macro': make_scorer(recall_score, average='macro'),
           'f1_macro': make_scorer(f1_score, average='macro'),
           'accuracy': make_scorer(accuracy_score)}

# 5-fold cross-validation of the SVC on the full X, y arrays;
# return_train_score=True also records the scores on the training folds.
scores = cross_validate(clf, X, y, scoring=scoring,
                        cv=5, return_train_score=True)

In [None]:
# Show the per-fold SVC results as a table, one column per result key.
df = pd.DataFrame(scores, columns=list(scores))
df.head()

### Testando com Support Vector Classification

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# `clf` was re-created as a fresh SVC before the cross-validation cells, and
# cross_val_score / cross_validate only fit clones of the estimator they are
# given. `clf` itself is therefore still unfitted at this point and
# predict() would raise NotFittedError; fit it on the training split first.
clf.fit(X_train, y_train)

# Predict the held-out split with the freshly fitted classifier.
y_pred = clf.predict(X_test)

# Macro-averaged precision, recall and F1, plus plain accuracy.
prec = precision_score(y_test, y_pred, average='macro')
rec = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)

# Collect the metrics in one dict so the cell displays them together.
scores = {'precisao': prec,
           'recall': rec,
           'f1': f1,
           'accuracy': acc}

scores

### Validando com Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that Restart & Run All reproduces the same model and the
# same downstream scores; without random_state the fitted tree can differ
# between runs.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score

# Custom scorers keyed by short names. Dict keys must be unique: a repeated
# 'rec_macro' entry would silently overwrite the earlier one and leave only
# three metrics, so the fourth slot holds an accuracy scorer instead (which
# also puts the imported accuracy_score to use).
scoring = {'prec_macro': make_scorer(precision_score, average='macro'),
           'rec_macro': make_scorer(recall_score, average='macro'),
           'f1_macro': make_scorer(f1_score, average='macro'),
           'accuracy': make_scorer(accuracy_score)}

# 5-fold cross-validation of the decision tree on the full X, y arrays;
# return_train_score=True also records the scores on the training folds.
scores = cross_validate(clf, X, y, scoring=scoring,
                        cv=5, return_train_score=True)

In [None]:
# Show the per-fold decision-tree results as a table, one column per result key.
df = pd.DataFrame(scores, columns=list(scores))
df.head()

### Testando com Decision Tree

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Predict the held-out split with the current `clf`.
y_pred = clf.predict(X_test)

# Macro-averaged precision, recall and F1, evaluated in that order.
prec, rec, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
acc = accuracy_score(y_test, y_pred)

# Gather the four metrics under the same keys used for the SVC summary.
scores = dict(precisao=prec, recall=rec, f1=f1, accuracy=acc)

scores