# SVM - Classification (Binary)

# Libraries

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn import preprocessing, svm

In [4]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [5]:
from sklearn.metrics import classification_report,accuracy_score

# Read the file

In [6]:
path = "C:/Users/Ansari Aaquib/Downloads/drive-download-20210519T143124Z-001/SVM/diab.csv"

In [7]:
db = pd.read_csv(path)

In [8]:
db.head()

Unnamed: 0,no_preg,pg_conc,dia_bp,tri_sf_th,ser_ins,bmi,diab_ped,age,class_val,class
0,6,148,72,35,0,33.6,0.627,50,positive,1
1,1,85,66,29,0,26.6,0.351,31,negative,0
2,8,183,64,0,0,23.3,0.672,32,positive,1
3,1,89,66,23,94,28.1,0.167,21,negative,0
4,0,137,40,35,168,43.1,2.288,33,positive,1


In [9]:
db.tail()

Unnamed: 0,no_preg,pg_conc,dia_bp,tri_sf_th,ser_ins,bmi,diab_ped,age,class_val,class
763,10,101,76,48,180,32.9,0.171,63,negative,0
764,2,122,70,27,0,36.8,0.34,27,negative,0
765,5,121,72,23,112,26.2,0.245,30,negative,0
766,1,126,60,0,0,30.1,0.349,47,positive,1
767,1,93,70,31,0,30.4,0.315,23,negative,0


In [10]:
db.dtypes
db[['class_val','class']].head(20)

no_preg        int64
pg_conc        int64
dia_bp         int64
tri_sf_th      int64
ser_ins        int64
bmi          float64
diab_ped     float64
age            int64
class_val     object
class          int64
dtype: object

In [11]:
db.drop(columns='class_val',inplace=True)

In [12]:
db.dtypes

no_preg        int64
pg_conc        int64
dia_bp         int64
tri_sf_th      int64
ser_ins        int64
bmi          float64
diab_ped     float64
age            int64
class          int64
dtype: object

In [13]:
db.head()

Unnamed: 0,no_preg,pg_conc,dia_bp,tri_sf_th,ser_ins,bmi,diab_ped,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
db.isnull().sum()

no_preg      0
pg_conc      0
dia_bp       0
tri_sf_th    0
ser_ins      0
bmi          0
diab_ped     0
age          0
class        0
dtype: int64

# check the y-distribution

In [15]:
db['class'].value_counts()

0    500
1    268
Name: class, dtype: int64

# Standarize the dataset

In [17]:
db_std = db.copy()

In [18]:
mm = preprocessing.MinMaxScaler()

In [19]:
db_std.iloc[:,:] = mm.fit_transform(db_std.iloc[:,:])

# reset the y - values

In [20]:
db_std['class'] = db['class']

# compare the two data

In [21]:
db.head()

Unnamed: 0,no_preg,pg_conc,dia_bp,tri_sf_th,ser_ins,bmi,diab_ped,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
db_std.head()

Unnamed: 0,no_preg,pg_conc,dia_bp,tri_sf_th,ser_ins,bmi,diab_ped,age,class
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1


# Split the data into train and test

In [23]:
trainx,testx,trainy,testy = train_test_split(db_std.drop('class',1),db_std['class'],test_size=0.25) 

In [24]:
trainx.shape,trainy.shape

((576, 8), (576,))

In [25]:
testx.shape,testy.shape

((192, 8), (192,))

# List of kernels supported by svm

# linear (C), sigmoid(C + Gamma), polynomial(C + Gamma), rbf(radial-basis-function)(C + Gamma)

# tuning the C and Gamma parameter with initial values

# values can be tunned later on depending upon the model accuracy

In [26]:
list_c = np.logspace(-5,4,10)

In [27]:
print(list_c)

[1.e-05 1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04]


In [28]:
list_G = np.random.uniform(0,1,10)

In [29]:
print(list_G)

[0.56594568 0.88403882 0.24962521 0.27287426 0.9946176  0.76242694
 0.21526744 0.01271798 0.42250516 0.19414984]


# build the parameters 

In [45]:
params = [{'kernel':['linear'],'C':list_c,
           'kernel':['sigmoid'],'C':list_c, 'gamma':list_G,
           'kernel':['poly'],'C':list_c, 'gamma':list_G, 
           'kernel':['rbf'],'C':list_c,'gamma':list_G}]

In [46]:
model = svm.SVC()

In [47]:
grid = GridSearchCV(model, param_grid=params, scoring='accuracy', cv=10, n_jobs=-1).fit(trainx,trainy)

# Best parameters

In [48]:
bp = grid.best_params_

# build the SVM classifier using the best params from cv

In [49]:
m1 = svm.SVC(kernel=bp['kernel'], C=bp['C'], gamma=bp['gamma']).fit(trainx,trainy)

# Prediction

In [50]:
p1 = m1.predict(testx)

# model accuracy

In [51]:
accuracy_score(testy,p1)

0.7708333333333334

# function to print the confusion matrix / classification report 

In [52]:
def cm(actual,pred):
    df = pd.DataFrame({'actual':actual,'pred':pred})
    print(pd.crosstab(df.actual,df.pred,margins=True))
    
    print(classification_report(actual,pred))
    return('success')

# print confusion matrix

In [53]:
cm(testy,p1)

pred      0   1  All
actual              
0       112  12  124
1        32  36   68
All     144  48  192
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       124
           1       0.75      0.53      0.62        68

    accuracy                           0.77       192
   macro avg       0.76      0.72      0.73       192
weighted avg       0.77      0.77      0.76       192



'success'

# model 2 = using the svm.LinearSVC model

In [54]:
m2 = svm.LinearSVC(C=bp['C'],max_iter=10000).fit(trainx,trainy)

In [55]:
p2 = m2.predict(testx)

# model accuracy

In [56]:
accuracy_score(testy,p2)

0.78125

# Confusion matrix

In [57]:
cm(testy,p2)

pred      0   1  All
actual              
0       109  15  124
1        27  41   68
All     136  56  192
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       124
           1       0.73      0.60      0.66        68

    accuracy                           0.78       192
   macro avg       0.77      0.74      0.75       192
weighted avg       0.78      0.78      0.78       192



'success'

# try and fix the 0 in data and rebuild the model
# try Logistics, DT, RF, KNN, SVM