In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [None]:
import confusion_matrix_pretty_print as cmpp

In [None]:
# Feature matrices: the first CSV column becomes the row index.
X_train = pd.read_csv('../subset/X_winetrain.csv', index_col=0)
X_test = pd.read_csv('../subset/X_winetest.csv', index_col=0)
# Targets: header-less files with the first column as the index. The `squeeze=True`
# keyword of read_csv was deprecated in pandas 1.4 and removed in 2.0, so the single
# remaining column is collapsed to a Series with DataFrame.squeeze instead.
y_train = pd.read_csv('../subset/y_winetrain.csv', header=None, index_col=0).squeeze('columns')
y_test = pd.read_csv('../subset/y_winetest.csv', header=None, index_col=0).squeeze('columns')

In [None]:
# Sanity check: dimensions of each split, one per line (train X, train y, test X, test y).
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

In [None]:
# Unfitted StandardScaler with default settings; it is fitted on the training features only.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only, so the test set does not influence them.
scaler.fit(X_train)

In [None]:
# Standardise both splits with the statistics learned from the training set.
# Note: the names are rebound to the scaled arrays, so running this cell a second
# time would scale already-scaled data.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Support vector classifier using scikit-learn's default hyperparameters.
clf = SVC()

In [None]:
# Train on the standardised training split; the %time magic reports how long the fit takes.
%time clf.fit(X_train,y_train)

In [None]:
# Predicted class labels for the standardised test features.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Class distribution of the test labels, most frequent first.
y_test.value_counts()

# Null Accuracy

In [None]:
# Null accuracy: share of the test set taken by the most frequent class,
# i.e. the accuracy of always predicting that class.
y_test.value_counts().iloc[:1].div(y_test.count())

# Confusion Matrix

In [None]:
# Confusion matrix of the test predictions (scikit-learn layout: rows = actual, columns = predicted).
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Human-readable class names for the confusion-matrix axes.
# NOTE(review): assumes exactly three classes whose sorted label order corresponds to
# Ok, Good, Great — confirm against the target encoding.
labels = ['Ok','Good','Great']
# Same matrix wrapped in a DataFrame so rows and columns carry the class names.
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

In [None]:
# Draw the labelled confusion matrix with the external confusion_matrix_pretty_print helper.
# NOTE(review): pred_val_axis='x' presumably puts predicted classes on the x-axis — confirm in the helper.
cmpp.pretty_plot_confusion_matrix(df_cm, cmap="Blues_r", show_null_values=1, pred_val_axis='x')

In [None]:
# Row totals: number of test samples per actual class.
cm.sum(axis=1)

In [None]:
# Diagonal entries: correctly classified samples per class.
np.diag(cm)

In [None]:
# Row total minus the diagonal: samples of each actual class that were predicted as another class.
cm.sum(axis=1) - np.diag(cm)

## When Actual = Rows and Predicted = Columns
## TP: Diagonal top left to bottom right
## TN: Values in all columns and rows excluding given Class column and row
## FP: Values in column excluding TP
## FN: Values in row excluding TP

In [None]:
# Per-class one-vs-rest counts taken from the confusion matrix
# (rows = actual class, columns = predicted class).
TP = cm.diagonal()                 # correct predictions for each class
FP = cm.sum(axis=0) - TP           # rest of each column
FN = cm.sum(axis=1) - TP           # rest of each row
TN = cm.sum() - (TP + FP + FN)     # everything outside the class's row and column

print(TP, FP, FN, TN, sep='\n')

## Accuracy: How often correct?  _Diagonal / Total_

In [None]:
# Accuracy twice: scikit-learn's value, then the hand-computed diagonal / total.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
    np.sum(TP) / np.sum(cm),
    sep='\n',
)

## Classification Error: How often incorrect? _Non-Diagonal / Total_

In [None]:
# Classification error twice: 1 - accuracy, then off-diagonal count / total.
print(
    1 - metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
    (np.sum(cm) - np.trace(cm)) / np.sum(cm),
    sep='\n',
)

## Precision: When a positive value is predicted, how often is it correct? _TP / Column Total_

In [None]:
# Per-class precision twice: scikit-learn's value, then TP / (TP + FP) by hand.
print(
    metrics.precision_score(y_true=y_test, y_pred=y_pred, average=None),
    np.true_divide(TP, TP + FP),
    sep='\n',
)

## Sensitivity (Recall or TPR): When actual value is positive, how often is prediction correct? _TP / Row Total_

In [None]:
# Per-class recall twice: scikit-learn's value, then TP / (TP + FN) by hand.
print(
    metrics.recall_score(y_true=y_test, y_pred=y_pred, average=None),
    np.true_divide(TP, TP + FN),
    sep='\n',
)

## False Positive Rate: FP / float(TN + FP)
= 1 - Specificity

In [None]:
# Per-class false positive rate: share of actual negatives that were predicted as the class.
print(np.true_divide(FP, TN + FP))

## Specificity: TN / float(TN + FP)
When the actual value is negative, how often is the prediction correct?

In [None]:
# Per-class specificity computed directly: TN / (TN + FP).
print(TN / (TN + FP))
# Cross-check as 1 - false positive rate. A scalar 1 broadcasts over any number of
# classes; the previous list literal [1,1,1] only worked for exactly three.
print(1 - (FP / (TN + FP)))

In [None]:
# Matthews correlation coefficient of the test predictions: one summary score across all classes.
metrics.matthews_corrcoef(y_test,y_pred)

#### https://www.researchgate.net/post/Multiclass_Confusion_Matrix_Explanation

In [None]:
# Dimensions of the standardised training feature array.
X_train.shape

In [None]:
# Full (unsplit) feature matrix; the first CSV column becomes the row index.
X = pd.read_csv('../subset/X_wine.csv', index_col=0)
# Full target vector. read_csv's `squeeze=True` keyword was deprecated in pandas 1.4 and
# removed in 2.0, so the single column is collapsed to a Series with DataFrame.squeeze.
y = pd.read_csv('../subset/y_wine.csv', header=None, index_col=0).squeeze('columns')

In [None]:
# Class frequencies in the full target vector, most frequent first.
y.value_counts()

In [None]:
# Dimensions of the full feature matrix and target vector, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# Show the estimator (and its parameter settings) that is passed to learning_curve.
print(clf)

In [None]:
# Sorted distinct class values in the full target; rebinds `labels`, which previously held display names.
labels = np.unique(y)
print(labels)

In [None]:
# Absolute numbers of training samples at which the curve is evaluated.
train_sizes = [50,520,1300,2600,3900,5196]
# X holds the raw, unscaled features, whereas the SVC above was evaluated on standardised
# data. Putting the scaler and classifier in one pipeline makes learning_curve fit the
# scaler on each fold's training portion only, so results are comparable and leak-free.
scaled_clf = make_pipeline(StandardScaler(), clf)
# learning_curve returns the sizes actually used, which rebinds train_sizes to an array.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=scaled_clf,
    X=X,
    y=y,
    train_sizes=train_sizes,
    cv=5,
    shuffle=True,
    random_state=0,
    scoring='accuracy',
)

In [None]:
# Raw score matrices: one row per training-set size, one column per CV fold.
print(train_scores, validation_scores, sep='\n')

In [None]:
# Average the per-fold scores so there is one value per training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)

In [None]:
# Mean training and validation scores indexed by training-set size, separated by a blank line.
print(
    pd.Series(train_scores_mean, index=train_sizes),
    pd.Series(validation_scores_mean, index=train_sizes),
    sep='\n\n',
)

In [None]:
# Matplotlib 3.6 renamed the 'seaborn' style to 'seaborn-v0_8' and later removed the old
# name; use whichever one the installed version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Both curves are mean accuracy scores (scoring='accuracy'), so they are labelled as accuracy, not error.
plt.plot(train_sizes, train_scores_mean, label='Training accuracy')
plt.plot(train_sizes, validation_scores_mean, label='Validation accuracy')

plt.ylabel('Accuracy', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
# The estimator behind these curves is the SVC, not a decision tree.
plt.title('Learning curves for SVC', fontsize = 18, y=1.03)
plt.legend()

In [None]:
# Re-run of the learning curve, reusing the train_sizes array returned by the previous run.
# X is unscaled, so the scaler and SVC are combined in a pipeline and standardisation is
# fitted inside each CV fold, matching how the classifier was evaluated on the test split.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=make_pipeline(StandardScaler(), clf),
    X=X,
    y=y,
    train_sizes=train_sizes,
    cv=5,
    shuffle=True,
    random_state=0,
    scoring='accuracy',
)

In [None]:
# Collapse the fold dimension: one mean score per training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)

In [None]:
# Matplotlib 3.6 renamed the 'seaborn' style to 'seaborn-v0_8' and later removed the old
# name; use whichever one the installed version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Both curves are mean accuracy scores (scoring='accuracy'), so they are labelled as accuracy, not error.
plt.plot(train_sizes, train_scores_mean, label='Training accuracy')
plt.plot(train_sizes, validation_scores_mean, label='Validation accuracy')

plt.ylabel('Accuracy', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
# The estimator behind these curves is the SVC, not a decision tree.
plt.title('Learning curves for SVC', fontsize = 18, y=1.03)
plt.legend()

In [None]:
#kern = ['linear', 'poly']

#param_grid = dict(kernel=kern)

In [None]:
#grid = GridSearchCV(clf, param_grid, scoring='accuracy')

In [None]:
#grid.fit(X,y)

In [None]:
#pd.DataFrame(grid.cv_results_)

In [None]:
#grid.best_score_

In [None]:
#grid.best_params_