# Medical Diagnosis with Support Vector Machines

## Task 1: Import Libraries



In [32]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Task 1: Get Data

In [34]:
column_names = ["pregnancies", "glucose", "bpressure", "skinfold", "insulin", "bmi", "pedigree", "age", "class"]
df=pd.read_csv('/content/data.csv', names=column_names)
print(df.shape)
df.head()

(768, 9)


Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Task 1: Extract Features

In [35]:
x=df.iloc[:,:8]
x.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


## Task 1: Extract Class Labels

In [36]:
y=df['class']
y.head()

0    1
1    0
2    1
3    0
4    1
Name: class, dtype: int64

## Task 2: Split Dataset

In [37]:
X_train,X_test,y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=0)

## Task 2: Normalize Features

In [38]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)

## Task 3: Training a Support Vector Machine

In [40]:
clf=svm.SVC(kernel='sigmoid')
clf.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

so svm basically makes a decision boundary with which to divide the two status by taking into consideration all the features

## Task 3: Decision Boundary

In [41]:
y_pred=clf.predict(X_train)
print(accuracy_score(y_train, y_pred))

0.6510416666666666


the accuracy is low 
- One reason which comes to mind is that the data provided is insufficient to make the prediction
- or perhaps the model could be improved

## Task 3: SVM Kernels

In [42]:
#hyper parameter optimization
for k in ('linear','poly','rbf','sigmoid'):
  clf=svm.SVC(kernel=k)
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_train)
  print(k)
  print(accuracy_score(y_train,y_pred))

linear
0.7638888888888888
poly
0.7934027777777778
rbf
0.8246527777777778
sigmoid
0.6510416666666666


## Task 4: Instantiating the Best Model

In [43]:
clf=svm.SVC(kernel='rbf')
clf.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Task 4: Making a single prediction

In [44]:
# "pregnancies", "glucose", "bpressure", 
# "skinfold", "insulin", "bmi", 
# "pedigree", "age", "class"
new_patient=np.array([[0., 100.,75.,40.,0.,45.,1.5,20]])
new_patient=scaler.transform(new_patient)
clf.predict(new_patient)

array([0])

## Task 4: Testing Set Prediction

In [45]:
new_patient=np.array([X_test.iloc[8] ])
new_patient=scaler.transform(new_patient)
print(clf.predict(new_patient))
print(y_test.iloc[8] )

[1]
0


## Task 5: Accuracy on Testing Set

In [48]:
X_test=scaler.transform(X_test)
y_pred=clf.predict(X_test)
print(accuracy_score(y_test, y_pred))#which is not similar to the training accuracy of 83 as the model?? coz model hasent seen the test set maybe ??

0.6770833333333334


## Task 5: Comparison to All-Zero Prediction

In [49]:
y_zero=np.zeros(y_test.shape)
print(accuracy_score(y_test,y_zero))

0.6770833333333334


- so the data is imbalanced as there are more number of patients with no diabities
- in such cases we can look for various ther performance factors


## Task 5: Precision and Recall

In [51]:
print(classification_report(y_test, y_pred))
#status 1 classification reports are all 0's :(
#some buggy code somewhere

              precision    recall  f1-score   support

           0       0.68      1.00      0.81       130
           1       0.00      0.00      0.00        62

    accuracy                           0.68       192
   macro avg       0.34      0.50      0.40       192
weighted avg       0.46      0.68      0.55       192



  _warn_prf(average, modifier, msg_start, len(result))
