In [2]:
""" SVMs are applied to many different domains. They can classify non-linearly. What are the conditions on preprocessing?
Indeed, they have a solid mathematical foundation. 
Predicting the class of a given sample is a quick decision. 
The classification of a new example consists in seeing the sign of the decision function  ℎ𝜃(𝑥). 
Moreover, the test examples are compared just with the support vectors and not with all the training examples.

However, this classification model is basically binary, hence the need to use the one-versus-one approach. 
And the more the quantity of input examples is large, the more complex the matrix calculation is. 
The computation time is high when regularizing the parameters of the kernel function."""

import numpy as np
import pandas as pd

from sklearn import svm, model_selection, preprocessing
from sklearn.model_selection import train_test_split

df = pd.read_csv('source/wine.csv')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Wine                  178 non-null    int64  
 1   Alcohol               178 non-null    float64
 2   Malic.acid            178 non-null    float64
 3   Ash                   178 non-null    float64
 4   Acl                   178 non-null    float64
 5   Mg                    178 non-null    int64  
 6   Phenols               178 non-null    float64
 7   Flavanoids            178 non-null    float64
 8   Nonflavanoid.phenols  178 non-null    float64
 9   Proanth               178 non-null    float64
 10  Color.int             178 non-null    float64
 11  Hue                   178 non-null    float64
 12  OD                    178 non-null    float64
 13  Proline               178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB
None


Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
""" SVMs only work with numerical data
But they should have a simmilar scale -> Normalization / standardization
Note: The same normalization can not be used for new data -> influences the results
"""

data = df.iloc[:, 1:14]
target = df.iloc[:,0]

X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.2)

""" Normalization so that we have mean 0 and std 1 for all variables independet of original distribution 
TODO: Read the link of the notebook
"""
X_train_scale = preprocessing.scale(X_train)

print(X_train_scale.mean(axis = 0))
print(X_train_scale.std(axis = 0))

[-1.20538500e-16 -3.58443434e-16  6.84373193e-16  9.83340393e-17
  3.32273891e-16  1.90323947e-16  3.93336157e-16 -3.21964677e-16
  4.75809868e-17  3.17206578e-16  1.26882631e-16  2.41473508e-16
  6.34413157e-17]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [4]:
""" Using API Transformer """
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Same result but now it is safed and we can apply the same normalization for the test data too
print(X_train_scaled.mean(axis = 0))
print(X_train_scaled.std(axis = 0))

X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.mean(axis = 0))
print(X_test_scaled.std(axis = 0))


[-1.20538500e-16 -3.58443434e-16  6.84373193e-16  9.83340393e-17
  3.32273891e-16  1.90323947e-16  3.93336157e-16 -3.21964677e-16
  4.75809868e-17  3.17206578e-16  1.26882631e-16  2.41473508e-16
  6.34413157e-17]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[-0.33570847  0.25274741 -0.19338262  0.171952   -0.36531885 -0.3900448
 -0.29535973  0.03891923 -0.18404855 -0.2848332  -0.1881901  -0.25031226
 -0.5434083 ]
[0.96507936 1.11845622 0.92118837 0.82455595 0.86231226 0.90020053
 0.82388275 1.00835627 0.91870813 1.11317475 1.15410837 0.94336839
 0.9055857 ]


In [5]:
# Classification unsing SVMs
# Possible kernel functions are: 'linear', 'polynomial', 'rbf', 'sigmoid'

clf = svm.SVC(gamma=0.01,  kernel='poly')
clf.fit(X_train_scaled, y_train)

# Evaluation using a confusion matrix
y_pred = clf.predict(X_test_scaled)
pd.crosstab(y_test, y_pred, rownames=['Classe réelle'], colnames=['Classe prédite'])


Classe prédite,1
Classe réelle,Unnamed: 1_level_1
1,43
2,60
3,40


In [6]:
""" The hyper parameters seem not to work well. 
To find the optimal hyperparameters for a given problem we can use cross validation on a search grid. 
The grid is just the cross product of different values for the HP.
"""
parameter = {'C':[0.1,1,10], 
             'kernel': ['rbf', 'linear', 'poly'],
             'gamma' :[0.001, 0.1, 0.5]}

# The scoring argument allows us to select the measure that we want to optimize for. 
# Default is Accuracy
grid_clf = model_selection.GridSearchCV(estimator = clf, param_grid = parameter)

grid = grid_clf.fit(X_train_scaled,y_train)

#print(pd.DataFrame.from_dict(grid.cv_results_).loc[:,['params', 'mean_test_score']]) 
grid.best_params_

y_pred = grid_clf.predict(X_test_scaled)
pd.crosstab(y_test, y_pred, rownames=['Classe réelle'], colnames=['Classe prédite'])

Classe prédite,1,2,3
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41,2,0
2,0,58,2
3,0,1,39
