Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm

In [None]:
# Read the diabetes records into a DataFrame. The absolute '/content/...' path is
# hard-coded, so the notebook only runs where the CSV exists at that exact location.
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

In [None]:
# (Interactive `pd.read_csv?` help lookup removed: it was a development aid and
# produces nothing that the rest of the notebook uses.)

In [None]:
# Preview the first 10 rows to check the column layout and value types.
diabetes_dataset.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# (rows, columns) of the loaded frame; the column count includes the Outcome label.
diabetes_dataset.shape

(768, 9)

In [None]:
# Summary statistics per column.
# NOTE(review): Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a
# minimum of 0, which presumably encodes missing measurements rather than real
# readings — confirm, and consider imputing them before modelling.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Class balance of the target. The classes are uneven (roughly 2:1), which is why
# the train/test split later stratifies on Outcome.
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

0 --> Represents Non Diabetic Patients
1 --> Represents Diabetic Patients

In [None]:
# Mean of every feature per class: the Outcome == 1 group averages higher on each
# feature, so all of them carry at least some signal.
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [None]:
# Feature matrix: every column except the Outcome label.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
X = diabetes_dataset.drop(columns='Outcome')

In [None]:
# Plain-text dump of the feature frame (pandas truncates the middle rows).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [None]:
# Target vector: the 0/1 Outcome label for every row.
Y = diabetes_dataset.loc[:, 'Outcome']

In [None]:
# Plain-text dump of the label Series (index, value) with its name, length and dtype.
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Data Standardization

In [None]:
# Create a StandardScaler instance; it holds no statistics until fit() is called.
scaler = StandardScaler()

In [None]:
# Learn each feature's mean and standard deviation from X; the data itself is not
# changed by this call.
# NOTE(review): this fits on every row, i.e. before the train/test split, so the
# test rows influence the preprocessing — consider fitting on the training rows only.
scaler.fit(X)

StandardScaler()

In [None]:
# Apply the learned statistics so every feature ends up with zero mean and unit
# standard deviation; the raw columns are on very different scales
# (compare Insulin with DiabetesPedigreeFunction).
# scaler.fit_transform(X) would perform the fit and the transform in a single call.
standardized_data = scaler.transform(X)

Feature scaling maps the feature values of a dataset onto a comparable scale. It is crucial for machine learning algorithms that rely on distances between observations, because the distance between two observations differs between the non-scaled and the scaled case.
As we already know, an SVM's decision boundary maximises the distance to the nearest points from different classes. Hence, the distances between data points affect the decision boundary the SVM chooses.
* In other words, training an SVM over the scaled and non-scaled data leads to the generation of different models.

The two most widely adopted approaches for feature scaling are Normalization and Standardization

Normalization maps the values into the [0, 1] interval.

Standardization shifts the feature values to have mean zero, then rescales them so that they have a standard deviation of 1. It centers the data, and it is more flexible with new values that are not yet seen in the dataset (they do not have to fall inside a fixed interval). That is why standardization is normally preferred in general.

In [None]:
# From here on X is the standardized NumPy array (the raw feature DataFrame is no
# longer bound to X); Y remains the unscaled 0/1 Outcome column.
X, Y = standardized_data, diabetes_dataset['Outcome']

In [None]:
# Show the standardized feature array followed by the label Series, one after the other.
print(X, Y, sep='\n')

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [None]:
# Split into training and test sets (80/20). stratify=Y keeps the class ratio the
# same in both parts, and random_state fixes the shuffle so the split is reproducible.
# The split is made on the *raw* features so that the scaler below can be fitted on
# the training rows only; fitting it on every row (as the earlier cells do) lets
# test-set statistics leak into the preprocessing.
raw_features = diabetes_dataset.drop(columns='Outcome')
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    raw_features, Y, test_size=0.2, random_state=1, stratify=Y
)

# Re-fit the scaler on the training rows and apply that same transform to both parts.
# `scaler` is rebound on purpose: the prediction cells must scale new inputs exactly
# the way the training data was scaled.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Sanity check: full data vs. the two split parts (row counts differ, feature count must match).
print(*(part.shape for part in (X, X_train, X_test)))

(768, 8) (614, 8) (154, 8)


Training the Model

In [None]:
# Support-vector classifier with a linear kernel; every other hyperparameter is
# left at scikit-learn's default.
classifier = svm.SVC(kernel='linear')

In [None]:
# Train on the training portion only; the test rows stay unseen until evaluation.
classifier.fit(X_train, Y_train)

SVC(kernel='linear')

Model Evaluation

In [None]:
# Accuracy on the rows the model was trained on.
# accuracy_score is declared as (y_true, y_pred), so the ground-truth labels go first.
X_train_pred = classifier.predict(X_train)
training_data_acc = accuracy_score(Y_train, X_train_pred)

In [None]:
# Training accuracy as a fraction between 0 and 1.
print(training_data_acc)

0.7833876221498371


In [None]:
# Accuracy on the held-out rows the model never saw during training.
# accuracy_score is declared as (y_true, y_pred), so the ground-truth labels go first.
X_test_pred = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_pred)

In [None]:
# Held-out accuracy; a value close to the training accuracy indicates the model
# is not badly overfitting.
print(testing_data_accuracy)

0.7792207792207793


In [None]:
# One patient's feature values, in the same order as the training feature columns.
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to (1, n_features): scikit-learn estimators expect a 2-D array with one
# row per sample, so a single patient has to be passed as one row. The -1 lets
# NumPy work out the number of columns from the data.
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The scaler was fitted on a DataFrame, so give the single row the same column
# labels; passing a bare array is what triggers scikit-learn's
# "X does not have valid feature names" warning.
feature_names = diabetes_dataset.columns.drop('Outcome')
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)

# Standardize with the already-fitted scaler: transform only, never re-fit on new input.
std_data = scaler.transform(input_frame)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

# predict() returns one label per input row; there is exactly one row here.
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[[ 0.3429808   1.41167241  0.14964075 -0.09637905  0.82661621 -0.78595734
   0.34768723  1.51108316]]
[1]
The person is diabetic


  "X does not have valid feature names, but"


In [None]:
# Quick look at what reshape(1, -1) does to the 8-value sample.
data = np.array(input_data)
print(data.shape)               # 1-D array: (8,)
reshaped_data = data.reshape(1, data.size)
print(reshaped_data[0, 7])      # last feature of the single row
print(reshaped_data.shape)      # 2-D array: one row, eight columns

(8,)
51.0
(1, 8)


In [None]:
def diabetic(input_data, fitted_scaler=None, fitted_classifier=None):
  """Predict whether one patient is diabetic, print the verdict and return the label.

  Args:
    input_data: Sequence of the patient's feature values, in the same order as
      the feature columns used for training.
    fitted_scaler: Optional already-fitted scaler exposing ``transform``.
      Defaults to the notebook-level ``scaler``.
    fitted_classifier: Optional already-fitted model exposing ``predict``.
      Defaults to the notebook-level ``classifier``.

  Returns:
    The predicted label for the patient (0 means not diabetic; any other
    label is reported as diabetic).
  """
  active_scaler = scaler if fitted_scaler is None else fitted_scaler
  active_classifier = classifier if fitted_classifier is None else fitted_classifier

  # A single sample must be a 2-D array of shape (1, n_features); the -1 lets
  # NumPy infer the number of columns.
  sample = np.asarray(input_data).reshape(1, -1)

  # If the scaler remembers the column names it was fitted with, label the row
  # the same way to avoid the "X does not have valid feature names" warning.
  feature_names = getattr(active_scaler, 'feature_names_in_', None)
  if feature_names is not None:
    sample = pd.DataFrame(sample, columns=feature_names)

  # Transform only: the scaler has already been fitted and must not be re-fitted here.
  std_data = active_scaler.transform(sample)
  print(std_data)

  prediction = active_classifier.predict(std_data)
  print(prediction)

  if prediction[0] == 0:
    print('The person is not diabetic')
  else:
    print('The person is diabetic')

  return prediction[0]