# Medical Diagnosis with Support Vector Machines


In [2]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
column_names = ["pregnancies", "glucose", "bpressure", "skinfold", "insulin", "bmi", "pedigree", "age", "class"]
df = pd.read_csv("data.csv" , names = column_names)

## Extract Features

In [4]:
X = df.iloc[:,:8]
print(X)
df

     pregnancies  glucose  bpressure  skinfold  insulin   bmi  pedigree  age
0              6      148         72        35        0  33.6     0.627   50
1              1       85         66        29        0  26.6     0.351   31
2              8      183         64         0        0  23.3     0.672   32
3              1       89         66        23       94  28.1     0.167   21
4              0      137         40        35      168  43.1     2.288   33
..           ...      ...        ...       ...      ...   ...       ...  ...
763           10      101         76        48      180  32.9     0.171   63
764            2      122         70        27        0  36.8     0.340   27
765            5      121         72        23      112  26.2     0.245   30
766            1      126         60         0        0  30.1     0.349   47
767            1       93         70        31        0  30.4     0.315   23

[768 rows x 8 columns]


Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Task 1: Extract Class Labels

In [5]:
y = df['class']
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 768, dtype: int64


## Task 2: Split Dataset

In [17]:
X_train, X_test , y_train , y_test = train_test_split(X , y , test_size = 0.25)
print(X_train.shape)

(576, 8)


## Task 2: Normalize Features

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_train[:5,:]

array([[ 2.45571805, -0.44785756,  0.57059472, -1.29127968, -0.66795448,
        -1.1347182 , -1.01094261,  0.92676062],
       [-0.53791919,  0.419359  ,  0.0467462 , -1.29127968, -0.66795448,
        -0.44626942,  0.15750039, -0.86427572],
       [-0.83728292, -1.16021402, -0.0580235 , -1.29127968, -0.66795448,
        -1.83615658,  0.39407403, -0.52312594],
       [-1.13664664,  0.79102324,  0.0467462 , -1.29127968, -0.66795448,
         0.72279454, -0.44258886, -0.4378385 ],
       [ 0.06080826,  0.07866678, -0.37233261, -1.29127968, -0.66795448,
        -0.04359183, -0.75417365,  0.15917362]])

## Task 3: Training a Support Vector Machine

In [8]:
clf = svm.SVC(kernel ='sigmoid')
clf.fit(X_train , y_train)

SVC(kernel='sigmoid')

## Task 3: Decision Boundary

In [9]:
y_pred = clf.predict(X_train)
print(y_pred)
print(accuracy_score(y_train , y_pred))

[0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1
 0 0 1 0 1 0 1 1 0 0 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 1
 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0
 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0
 0 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1
 1 1 1 1 0 1 1 0 0 1 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 1
 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 1 1 1 0 1 1 1 1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1
 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1
 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0
 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 1 0 0 

## Task 3: SVM Kernels

Trying different vtypes of kernels avialable in SVM 

In [10]:
for k in ('linear' , 'poly' , 'rbf' , 'sigmoid'):
    clf = svm.SVC(kernel = k)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    print(k)
    print(accuracy_score(y_train , y_pred))

linear
0.7795138888888888
poly
0.8055555555555556
rbf
0.8385416666666666
sigmoid
0.65625


## Task 4: Instantiating the Best Model

In [11]:
clf = svm.SVC(kernel = 'rbf')
clf.fit(X_train, y_train)

SVC()

## Task 4: Making a single prediction

In [14]:
# "pregnancies", "glucose", "bpressure", 
# "skinfold", "insulin", "bmi", 
# "pedigree", "age", "class"


patient = np.array([[1 , 200 , 75,40 , 0, 45, 1.5 , 20]])
patient = scaler.transform(patient)
clf.predict(patient)

array([1])

## Task 4: Testing Set Prediction

In [23]:
for i in range(1,20, 4):
    patient = np.array([X_test.iloc[i]])
    patient = scaler.transform(patient)
    print('pred' , clf.predict(patient) , 'gt' , y_test.iloc[i])

pred [0] gt 0
pred [0] gt 1
pred [0] gt 0
pred [0] gt 0
pred [1] gt 1


## Task 5: Accuracy on Testing Set

In [24]:
X_test = scaler.transform(X_test)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test , y_pred))

0.8489583333333334


## Task 5: Comparison to All-Zero Prediction

In [28]:
y_zero =np.zeros(y_test.shape)
print(accuracy_score(y_test , y_zero))

0.6927083333333334


## Task 5: Precision and Recall

In [29]:
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90       133
           1       0.84      0.63      0.72        59

    accuracy                           0.85       192
   macro avg       0.85      0.79      0.81       192
weighted avg       0.85      0.85      0.84       192

