# Mushroom dataset analysis and SVM

### Importing all the libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

### Read dataset and show 10 rows from head

In [2]:
data = pd.read_csv("./datasets/mushrooms.csv")
data.head(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


### check for null values

In [3]:
data.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
data.shape

(8124, 23)

### Two class clasiffication , Either the mushroom is poisonous or edible

In [5]:
data['class'].unique()

array(['p', 'e'], dtype=object)

###  Converting categorical attributes, into numeric data

In [6]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])
 
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


### Check wether the dataset is balanced ( equal % of samples from each class)

In [7]:
print(data.groupby('class').size())


class
0    4208
1    3916
dtype: int64


### Separating features and label

In [8]:
X = data.iloc[:,1:23]  # all rows, all the features and no labels
y = data.iloc[:, 0]  # all rows, label only
X.head()
y.head()

0    1
1    0
2    0
3    1
4    0
Name: class, dtype: int32

In [9]:
y

0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int32

In [10]:
X.describe() # Show features statistic

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,...,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0
mean,3.348104,1.827671,4.504677,0.415559,4.144756,0.974151,0.161497,0.309207,4.810684,0.567208,...,1.603644,5.816347,5.794682,0.0,1.965534,1.069424,2.291974,3.59675,3.644018,1.508616
std,1.604329,1.229873,2.545821,0.492848,2.103729,0.158695,0.368011,0.462195,3.540359,0.495493,...,0.675974,1.901747,1.907291,0.0,0.242669,0.271064,1.801672,2.382663,1.252082,1.719975
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,3.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,...,1.0,6.0,6.0,0.0,2.0,1.0,0.0,2.0,3.0,0.0
50%,3.0,2.0,4.0,0.0,5.0,1.0,0.0,0.0,5.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,2.0,3.0,4.0,1.0
75%,5.0,3.0,8.0,1.0,5.0,1.0,0.0,1.0,7.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,7.0,4.0,2.0
max,5.0,3.0,9.0,1.0,8.0,1.0,1.0,1.0,11.0,1.0,...,3.0,8.0,8.0,0.0,3.0,2.0,4.0,8.0,5.0,6.0


### Scaling

The main advantage of scaling is to avoid attributes in greater numeric ranges dominating those in smaller numeric ranges. Another advantage is to avoid numerical difficulties during the calculation

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(X)
X

array([[ 1.02971224,  0.14012794, -0.19824983, ..., -0.67019486,
        -0.5143892 ,  2.03002809],
       [ 1.02971224,  0.14012794,  1.76587407, ..., -0.2504706 ,
        -1.31310821, -0.29572966],
       [-2.08704716,  0.14012794,  1.37304929, ..., -0.2504706 ,
        -1.31310821,  0.86714922],
       ...,
       [-0.8403434 ,  0.14012794, -0.19824983, ..., -1.50964337,
        -2.11182722,  0.28570978],
       [-0.21699152,  0.95327039, -0.19824983, ...,  1.42842641,
         0.28432981,  0.28570978],
       [ 1.02971224,  0.14012794, -0.19824983, ...,  0.16925365,
        -2.11182722,  0.28570978]])

### Export the preprocessed data to a SCV file for the C# SVM code

In [12]:
df = pd.DataFrame(X)
df.to_csv('mashrooms.txt', sep = ',', index = False)

### Splitting The Data into Training And Testing Dataset


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)

### Support Vector Machine (SVM)

In [14]:
from sklearn.svm import SVC
svm_model= SVC(gamma="auto") 

 The gamma parameter defines how far the influence of a single training example reaches, 
 with low values meaning ‘far’ and high values meaning ‘close’. 
 The gamma parameters can be seen as the inverse of the radius of influence of samples selected by the model as support vectors. 

## Find best SVM parametrs , but use linear or polynomial kernel only

RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search: 1)A budget can be chosen independent of the number of parameters and possible values. 2)Adding parameters that do not influence the performance does not decrease efficiency.

In [15]:
tuned_parameters = {
 'C': [1, 10, 100,500, 1000], 'kernel': ['linear'],
 'degree': [2,3,4,5,6] , 'C':[1,10,100,500,1000] , 'kernel':['poly']
    }

The C parameter trades off misclassification of training examples against simplicity of the decision surface. A low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly by giving the model freedom to select more samples as support vectors.

In [16]:

from sklearn.model_selection  import GridSearchCV, RandomizedSearchCV

model_svm = RandomizedSearchCV(svm_model, tuned_parameters,cv=10,scoring='accuracy',n_iter=20)

In [17]:
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)

1.0


In [18]:
print(model_svm.best_params_)

{'kernel': 'poly', 'degree': 4, 'C': 500}


In [19]:
svm_model= SVC(kernel="poly",C=500,degree=2,gamma="auto")

In [20]:
svm_model.fit(X_train, y_train)  # we train the algorithm with the training data and the training output

SVC(C=500, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
svm_model.score(X_test, y_test) # score on tesitng data

1.0

### Get SVM parameters

In [22]:
svm_model.dual_coef_ # model y_i α_i

array([[-1.37985977e+00, -4.88360796e+00, -5.54816579e+00,
        -6.17573138e-02, -1.08657190e+00, -1.42642414e-02,
        -6.16393609e-01, -4.26015393e+00, -1.25690418e+00,
        -2.14660276e+00, -2.26876526e+00, -6.11846930e+00,
        -4.92435728e-01, -1.98669665e+01, -3.45936906e-01,
        -1.05684729e+00, -1.40688918e-01, -9.77416653e-01,
        -7.13053311e+00, -1.48406760e+00, -1.76393844e+00,
        -1.28194418e+00, -5.40688130e+00, -9.42895740e-01,
        -2.32245493e+00, -1.25606202e+00, -2.86020866e+00,
        -8.66607892e+00, -4.25969623e-01, -4.73397648e-01,
        -5.73640258e-01, -1.60740591e-01, -6.26190522e+00,
        -2.04933257e-01, -1.42490390e+00, -1.24795875e-01,
        -8.95885286e+00, -1.21896676e-01, -2.42805125e-01,
        -8.55358762e+00, -3.29245646e+00, -1.29007993e-01,
        -1.56241259e+00, -1.41627517e+00, -9.55913629e+00,
        -5.65162244e+00, -9.15489961e+00, -6.59974554e-01,
        -7.99040211e-02, -5.26057863e+00, -2.72483635e+0

In [23]:
svm_model.support_vectors_ # get spport vectors

array([[-0.8403434 ,  0.95327039, -0.98389939, ..., -0.2504706 ,
         0.28432981, -0.8771691 ],
       [-0.8403434 ,  0.14012794, -1.37672417, ...,  1.42842641,
         0.28432981,  1.44858865],
       [ 1.02971224,  0.95327039, -0.19824983, ..., -0.67019486,
         0.28432981, -0.8771691 ],
       ...,
       [ 1.02971224,  0.14012794, -0.19824983, ..., -0.2504706 ,
         0.28432981,  2.03002809],
       [-0.8403434 ,  0.95327039,  1.76587407, ..., -1.08991911,
         0.28432981, -0.8771691 ],
       [ 1.02971224, -1.48615695,  1.76587407, ..., -1.08991911,
         0.28432981,  1.44858865]])

In [24]:
svm_model.intercept_ # independent term

array([0.32094812])

In [25]:
svm_model.gamma

'auto'

### Export model code with porter

In [26]:
from sklearn_porter import Porter

In [27]:
porter = Porter(svm_model, language='java')
output = porter.export(embed_data=True)
print(output)

class SVC {

    private enum Kernel { LINEAR, POLY, RBF, SIGMOID }

    private int nClasses;
    private int nRows;
    private int[] classes;
    private double[][] vectors;
    private double[][] coefficients;
    private double[] intercepts;
    private int[] weights;
    private Kernel kernel;
    private double gamma;
    private double coef0;
    private double degree;

    public SVC (int nClasses, int nRows, double[][] vectors, double[][] coefficients, double[] intercepts, int[] weights, String kernel, double gamma, double coef0, double degree) {
        this.nClasses = nClasses;
        this.classes = new int[nClasses];
        for (int i = 0; i < nClasses; i++) {
            this.classes[i] = i;
        }
        this.nRows = nRows;

        this.vectors = vectors;
        this.coefficients = coefficients;
        this.intercepts = intercepts;
        this.weights = weights;

        this.kernel = Kernel.valueOf(kernel.toUpperCase());
        this.gamma = gamma;
        thi

In [28]:
porter = Porter(svm_model, language='c')
output = porter.export(embed_data=True)
print(output)

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define N_FEATURES 22
#define N_CLASSES 2
#define N_VECTORS 96
#define N_ROWS 2
#define N_COEFFICIENTS 1
#define N_INTERCEPTS 1
#define KERNEL_TYPE 'p'
#define KERNEL_GAMMA 0.045454545454545456
#define KERNEL_COEF 0.0
#define KERNEL_DEGREE 2

double vectors[96][22] = {{-0.8403433999584713, 0.9532703900465632, -0.9838993878642219, 1.185916567160356, 0.40656202865404567, 0.16289645171177966, -0.4388636369510842, -0.6690383093994308, 0.05347685426934428, 0.8735106372181927, -0.10348153740396093, 0.6837776537937139, 0.5863846591895536, 0.622441390325499, 0.6319913825853262, 0.0, 0.14203663498716684, -0.2561317410190009, 0.9480808566164142, -0.250470603921817, 0.2843298100961381, -0.8771690980243297}, {-0.8403433999584713, 0.14012794477794924, -1.376724168039617, 1.185916567160356, 0.40656202865404567, 0.16289645171177966, -0.4388636369510842, -0.6690383093994308, 1.4658499494714023, -1.1448057498013176, -0.10348153740396093, 0.68377