In [1]:
# SVM Classification
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_diabetes

In [3]:
print(load_diabetes()["DESCR"])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [4]:
# filename = 'pima-indians-diabetes.data (2).csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv('pima-indians-diabetes.data (2).csv',names=names)
array = dataframe
X = array.iloc[:,0:8]
Y = array.iloc[:,8]


In [5]:
dataframe.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [6]:
dataframe.head(6)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0


In [7]:
dataframe["class"].value_counts()

0    500
1    268
Name: class, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3,random_state=15)

In [9]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((537, 8), (537,), (231, 8), (231,))

In [10]:
clf = SVC(kernel="rbf")# gamma is nothing but "a "

clf.fit(X_train , y_train)

y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred) * 100

print("Accuracy =", acc)

print(classification_report(y_test, y_pred))

Accuracy = 77.92207792207793
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       160
           1       0.67      0.55      0.60        71

    accuracy                           0.78       231
   macro avg       0.74      0.72      0.73       231
weighted avg       0.77      0.78      0.77       231



### Grid Search CV

In [22]:
# whatever we have called as "a" they are calling as "gamma"
# C=regularization hyper parameter , changing in C we can address underfitting and overfitting issues
# gsv finds the best hyper parameter

clf = SVC()
param_grid = [{'kernel':["rbf"],'gamma':[50,5,10,0.5,1,0.001,0.0001,0.00001,0.000001],'C':[1,15,14,13,12,11,10,0.1] }]
gsv = GridSearchCV(clf,param_grid,cv=10)
gsv.fit(X_train,y_train)



GridSearchCV(cv=10, estimator=SVC(),
             param_grid=[{'C': [1, 15, 14, 13, 12, 11, 10, 0.1],
                          'gamma': [50, 5, 10, 0.5, 1, 0.001, 0.0001, 1e-05,
                                    1e-06],
                          'kernel': ['rbf']}])

In [26]:
gsv.best_params_ , gsv.best_score_ 

({'C': 14, 'gamma': 1e-05, 'kernel': 'rbf'}, 0.756114605171209)

In [27]:
clf = SVC(C= 14, gamma = 0.00001,kernel="rbf")
clf.fit(X_train , y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", acc)
confusion_matrix(y_test, y_pred)

Accuracy = 77.05627705627705


array([[139,  21],
       [ 32,  39]], dtype=int64)

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       160
           1       0.65      0.55      0.60        71

    accuracy                           0.77       231
   macro avg       0.73      0.71      0.72       231
weighted avg       0.76      0.77      0.76       231



# another model

In [15]:
#overfitting 

#higher gamma means data will be projected to very high dimension so overfitting will occur 

# 

In [16]:
clf = SVC(C= 1, gamma = 100,kernel="rbf")
clf.fit(X_train , y_train)
y_pred_train= clf.predict(X_train)
acc = accuracy_score(y_train, y_pred_train) * 100
print("Accuracy =", acc)


Accuracy = 100.0


In [17]:
y_pred_test= clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test) * 100
print("Accuracy =", acc)


Accuracy = 69.26406926406926
