# Import libraries

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Import dataset into pandas dataframe

In [41]:
df = pd.read_csv("diabetes.csv")

## Checking data

In [42]:
df.head(-1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
762,9,89,62,0,0,22.5,0.142,33,0
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


In [43]:
df.shape

(768, 9)

In [44]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [45]:
# Get statistical summary of the data
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Data preparing:

In [46]:
# Checking for missing values:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [47]:
# Selecting features:
x = df.iloc[:, 0:8]
# print(x)

In [48]:
# X = df.drop("Outcome", axis=1)   # This line does the same as above
# print(X)

In [49]:
# Selecting target value:
y = df["Outcome"]
# print(y)

In [50]:
# y = df.iloc[:, -1]   # This line does the exact same thing as above
# print(y)

In [51]:
# df.drop_duplicates()    # NO DUPLICATE FOUND

# Scaling the dataset

In [52]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_scaled = scaler.fit_transform(x)

In [53]:
print(x_scaled)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [54]:
x_scaled.max(axis=0)

array([3.90657835, 2.44447821, 2.73452825, 4.92186584, 6.65283938,
       4.45580749, 5.88356477, 4.06371575])

# spliting the dataset

In [55]:
from sklearn.model_selection import train_test_split

x_train , x_test, y_train , y_test = train_test_split(x_scaled, y, test_size=0.20, random_state=42)

In [56]:
print(x_train)

[[-0.54791859 -1.15469351 -3.57259724 ... -4.06047387 -0.50700636
  -1.04154944]
 [ 1.53084665 -0.27837344  0.66661825 ... -0.48135115  2.44666971
   1.4259954 ]
 [-0.84488505  0.56664949 -1.19450074 ... -0.41789153  0.55003518
  -0.95646168]
 ...
 [ 1.82781311 -0.62264204  0.87340925 ...  1.72704372  2.00573238
   0.40494237]
 [-1.14185152  0.62924378 -3.57259724 ...  1.32090213 -0.8059981
  -0.36084741]
 [-1.14185152  0.12848945  1.39038675 ... -1.20479085 -0.63385134
  -1.04154944]]


In [57]:
print(y_train)

60     0
618    1
346    0
294    0
231    1
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 614, dtype: int64


## Train the model using SVM with multiple kernal for better results

In [61]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

kernal = ['linear', 'rbf', 'poly', 'sigmoid']

for k in kernal:
    model = SVC(C=1, kernel=k, gamma="scale")
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    train_acc = accuracy_score(y_test, y_pred)
    print(f"Kernel={k}, train Accuracy={train_acc}")

Kernel=linear, train Accuracy=0.7597402597402597
Kernel=rbf, train Accuracy=0.7272727272727273
Kernel=poly, train Accuracy=0.7467532467532467
Kernel=sigmoid, train Accuracy=0.6558441558441559


## Cross validation

In [67]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(model, x_scaled, y, cv=10)
print(f"Cross validation score:{score.mean():.4f}")

Cross validation score:0.7032
