Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection 

In [2]:
#load the diabetes records from 'diabetes.csv' (path is relative to the notebook's working directory) into a DataFrame
diabetes_dataset = pd.read_csv('diabetes.csv')

In [3]:
#preview the first rows to check that the columns were parsed as expected
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Outcome: 1 = diabetic and 0 = not diabetic

In [4]:
#summary statistics (count, mean, std, min, quartiles, max) for every numeric column
#NOTE(review): Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a minimum of 0,
#which presumably stands for a missing measurement rather than a real value - TODO confirm
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
#number of rows in each Outcome class, to see how balanced the two classes are
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
#mean of every feature, computed separately for each Outcome class (0 = not diabetic, 1 = diabetic)
#for ex: non-diabetic patients have had about 3.3 pregnancies on average, versus about 4.9 for diabetic patients
diabetes_dataset.groupby('Outcome').mean()


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


###### Separating features (X) and labels (Y) in preparation for the train/test split

In [7]:
#features: every column except the label; labels: the Outcome column on its own
X = diabetes_dataset.drop(columns=['Outcome'])
Y = diabetes_dataset.loc[:, 'Outcome']

In [8]:
#show the feature frame X followed by the label Series Y
print(X, Y)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


###### Data Standardization

In [9]:
#create the scaler with default settings; it learns nothing until fit() is called
scaler = StandardScaler()

In [10]:
#learn the per-feature mean and standard deviation from X
#NOTE(review): X here is the full dataset, and the train/test split only happens later,
#so the rows that end up in the test set also influence these statistics
scaler.fit(X)

StandardScaler()

In [11]:
#apply the learned mean/std to every feature column of X
standardized_data = scaler.transform(X)

In [12]:
#inspect the scaled values (one row per record, one column per feature)
print(standardized_data) 

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


#we are standardizing the data because the features are on very different scales (e.g. Insulin goes up to 846 while DiabetesPedigreeFunction stays below 2.5).
#StandardScaler rescales every feature so that they all end up on a comparable scale
#note how each feature is now centered on 0 with a standard deviation of 1; the values are NOT confined to the range 0 to 1 (the output above contains values such as -1.12 and 1.94)
#It is used to standardize features by removing the mean and scaling them to unit variance. This process is also known as z-score normalization: z = (x - mean) / std
#how it does this is by first calculating the mean of each feature and subtracting it from every value of that feature, making the mean of every feature equal 0
#next it scales the features by dividing each centered value by the standard deviation of the corresponding feature, making the variance of each feature equal 1
#Standardizing features is often beneficial in machine learning algorithms, especially those that rely on distance measures, such as k-nearest neighbors or support vector machines. It can help prevent features with larger scales from dominating the learning process.

In [13]:
#from here on X refers to the standardized array, no longer to the original feature DataFrame
X = standardized_data
Y = diabetes_dataset['Outcome'] #this part was already done but redoing it just for clarity purposes

In [14]:
#show the standardized features followed by the labels
print(X,Y)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]] 0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


###### Train/test split

In [15]:
#hold out 20% of the rows for testing
#stratify=Y keeps the diabetic / non-diabetic ratio the same in the training and test parts
#random_state pins the shuffle so the split, and every accuracy computed from it, is the same on each run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [16]:
#sanity check: shapes of the full feature array, the training part and the test part
print(X.shape,X_train.shape,X_test.shape)

(768, 8) (614, 8) (154, 8)


In [17]:
#support vector classifier; the linear kernel is the one in use, the other kernels are kept below for comparison
#the accuracies quoted in these comments come from earlier runs and depend on which rows land in the train/test split
classifier = svm.SVC(kernel='linear') # accuracy score with linear on training is 78% and test is 75%
#classifier = svm.SVC(kernel='rbf', C=1.0, gamma='scale') # accuracy score with rbf on training is 82% and test is 80%
#classifier = svm.SVC(kernel='poly', degree=3, C=1.0) # accuracy score with polynomial on training is 80% and test is 74%

In [18]:
#training the support vector machine classifier on the training split only
classifier.fit(X_train, Y_train)

SVC(kernel='linear')

###### Evaluating the model

In [19]:
#accuracy score of training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [20]:
#fraction of training rows the classifier labels correctly
print("Accuracy score of the training data:",training_data_accuracy )

Accuracy score of the training data: 0.7736156351791531


In [21]:
#accuracy on test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [22]:
#fraction of held-out test rows the classifier labels correctly
print("Accuracy score of test data: ",test_data_accuracy )

Accuracy score of test data:  0.7727272727272727


###### Making a Predictive System based on data

In [23]:
#one new record, with the values in the same order as the feature columns of the dataset
input_data = (0,162,76,56,100,53.2,0.759,25)

#change input data to an np array
input_data_as_np_array = np.asarray(input_data)

#reshape the array to shape (1, n_features) because we are predicting for one instance
#scikit-learn estimators expect a 2D input with one row per sample,
#so a flat 1D array would be rejected instead of being treated as a single record
input_data_reshaped = input_data_as_np_array.reshape(1,-1)

#VERY IMPORTANT STEP: the new record must be standardized with the SAME fitted scaler that was used on the training data
#the scaler was fitted on a DataFrame, so give it a DataFrame with the same feature columns in the same order;
#a bare array makes recent scikit-learn versions warn that the feature names are missing,
#and building the frame also fails loudly if the number of values does not match the number of features
feature_names = diabetes_dataset.columns.drop('Outcome')
input_data_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)
std_data = scaler.transform(input_data_frame)
print(std_data)

#predict the Outcome class for the standardized record: 1 = diabetic, 0 = not diabetic
prediction = classifier.predict(std_data)
print(prediction)

[[-1.14185152  1.28648383  0.35643175  2.22455921  0.17539902  2.69162998
   0.86714764 -0.70119842]]
[1]


