In [46]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

import pandas as pd
import numpy as np

In [47]:
data = datasets.load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [48]:
X, y = datasets.load_wine(return_X_y=True)
X.shape, y.shape

((178, 13), (178,))

### Dividindo o dataset em 60% de treino e 40% de teste.

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

In [50]:
X_train.shape, y_train.shape

((106, 13), (106,))

In [51]:
X_test.shape, y_test.shape

((72, 13), (72,))

### Utilizando SVC para treinar o modelo suporvisionado. Em seguida realiza-se um teste com o conjunto teste estipulado

In [52]:
clf = svm.SVC(kernel = 'linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9583333333333334

### Acontece que, ao se realizar esse tipo de teste, o modelo pode ter sofrido overfiting durante o seu treinamento, deixando "vazar alguma informacão" do dos dados de teste durante seu treinamento. Sendo assim o modelo estaria idealmente preparado para validar com o conjunto de teste. 

### Uma alternativa é usar a cross-validation, em que iremos realizar rodadas de validacão, em que os folders de validacão irão variar de acordo com a rodada. Ou seja, iremos realizar a validacão juntamente com os conjuntos e treinamento, e ao final iremos validar com um conjunto de teste. 

In [53]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.88888889, 0.94444444, 0.97222222, 1.        , 1.        ])

In [None]:
print("%0.2f previsão com um desvio padrão de %0.2f" % (scores.mean(), scores.std()))

### - cross_validate nos permite especificar as métricas para avaliacão
### - retorna um dictionary contendo o tempo de juste (fit_time), tempos de pontuacão (score_time) além da pontuacão do teste (test_score) 

In [54]:
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(clf, X, y, scoring=scoring, cv=5)
sorted(scores.keys())

['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

In [56]:
scores['test_recall_macro']

array([0.9047619 , 0.95238095, 0.97222222, 1.        , 1.        ])

In [55]:
df = pd.DataFrame(scores, columns=scores.keys())
df.head()

Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro
0,0.070371,0.001884,0.897436,0.904762
1,0.066549,0.001849,0.944056,0.952381
2,0.077913,0.001463,0.977778,0.972222
3,0.069925,0.001455,1.0,1.0
4,0.061129,0.001637,1.0,1.0


### Validando com Support Vector Classification

In [57]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score
scoring = {'prec_macro': make_scorer(precision_score, average='macro'),
           'rec_macro': make_scorer(recall_score, average='macro'),
           'f1_macro': make_scorer(f1_score, average='macro'),
           'acc_macro': make_scorer(accuracy_score)}

scores = cross_validate(clf, X, y, scoring=scoring,
                        cv=5, return_train_score=True)

In [58]:
df = pd.DataFrame(scores, columns=scores.keys())
df.head()

Unnamed: 0,fit_time,score_time,test_prec_macro,train_prec_macro,test_rec_macro,train_rec_macro,test_f1_macro,train_f1_macro,test_acc_macro,train_acc_macro
0,0.071372,0.002714,0.897436,0.994253,0.904762,0.992908,0.887633,0.993517,0.888889,0.992958
1,0.064286,0.0023,0.944056,0.991453,0.952381,0.994152,0.945153,0.992721,0.944444,0.992958
2,0.079506,0.002341,0.977778,0.991453,0.972222,0.994152,0.974013,0.992721,0.972222,0.992958
3,0.07491,0.002367,1.0,0.991667,1.0,0.994152,1.0,0.992831,1.0,0.993007
4,0.063288,0.00211,1.0,0.991667,1.0,0.994048,1.0,0.992778,1.0,0.993007


### Validando com Decision Tree

In [59]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [61]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score
scoring = {'prec_macro': make_scorer(precision_score, average='macro'),
           'rec_macro': make_scorer(recall_score, average='macro'),
           'f1_macro': make_scorer(f1_score, average='macro'),
           'acc_macro': make_scorer(accuracy_score)}

scores = cross_validate(clf, X, y, scoring=scoring,
                        cv=5, return_train_score=True)

In [62]:
df = pd.DataFrame(scores, columns=scores.keys())
df.head()

Unnamed: 0,fit_time,score_time,test_prec_macro,train_prec_macro,test_rec_macro,train_rec_macro,test_f1_macro,train_f1_macro,test_acc_macro,train_acc_macro
0,0.000844,0.002454,0.945887,1.0,0.948413,1.0,0.945825,1.0,0.944444,1.0
1,0.000954,0.002524,0.795123,1.0,0.761905,1.0,0.7558,1.0,0.777778,1.0
2,0.001006,0.002211,0.891026,1.0,0.89127,1.0,0.887864,1.0,0.888889,1.0
3,0.001221,0.002082,0.918519,1.0,0.911376,1.0,0.913987,1.0,0.914286,1.0
4,0.000955,0.001905,0.877778,1.0,0.888889,1.0,0.864507,1.0,0.857143,1.0


### Testando com Decision Tree

In [63]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

y_pred = clf.predict(X_test)

prec = precision_score(y_test, y_pred, average='macro')
rec = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)

scores = {'precisao': prec,
           'recall': rec,
           'f1': f1,
           'accuracy': acc}

scores

{'precisao': 0.8999744572158365,
 'recall': 0.9043576683644595,
 'f1': 0.9003450258769408,
 'accuracy': 0.9027777777777778}