## **Pima Indian Diabetes Prediction** 

*Predicting diabetes using machine learning*

In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

  import pandas.util.testing as tm


*Loading and Reviewing the Data*

In [None]:
# Load the dataset; the path is relative, so the file must sit in the notebook's working directory.
data_frame=pd.read_csv("pima-data.txt")

In [None]:
# (rows, columns) of the loaded frame.
data_frame.shape

(768, 10)

*Target column `diabetes`: True / 1 => diabetes*
*False / 0 => no diabetes*

In [None]:
# Preview the first rows to sanity-check column names and value formats.
data_frame.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.379,True


In [None]:
# Per-column dtype and non-null count.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   num_preg      768 non-null    int64  
 1   glucose_conc  768 non-null    int64  
 2   diastolic_bp  768 non-null    int64  
 3   thickness     768 non-null    int64  
 4   insulin       768 non-null    int64  
 5   bmi           768 non-null    float64
 6   diab_pred     768 non-null    float64
 7   age           768 non-null    int64  
 8   skin          768 non-null    float64
 9   diabetes      768 non-null    bool   
dtypes: bool(1), float64(3), int64(6)
memory usage: 54.9 KB


*Checking for null values*

In [None]:
# True if any cell anywhere in the frame is null.
# Note: this only detects real nulls; zeros used as placeholders are not caught here.
data_frame.isnull().values.any()

False

*Correlation*

In [None]:
# Pairwise correlation between all columns, including the target.
data_frame.corr()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
num_preg,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,-0.081672,0.221898
glucose_conc,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.057328,0.466581
diastolic_bp,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.207371,0.065068
thickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.074752
insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.436783,0.130548
bmi,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.392573,0.292695
diab_pred,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.183928,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,-0.11397,0.238356
skin,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.074752
diabetes,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,0.074752,1.0


*Changing the data type to numeric*

In [None]:
# Lookup table used to recode the boolean target as integers:
# 1 means diabetes, 0 means no diabetes.
diabetes_map = {
    True: 1,
    False: 0,
}

In [None]:
# Recode the target column in place by looking each value up in diabetes_map.
data_frame['diabetes']=data_frame['diabetes'].map(diabetes_map)

*The diabetes column has been changed from boolean to numeric*

In [None]:
# Confirm the target column now holds 1/0 instead of True/False.
data_frame.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,1
1,1,85,66,29,0,26.6,0.351,31,1.1426,0
2,8,183,64,0,0,23.3,0.672,32,0.0,1
3,1,89,66,23,94,28.1,0.167,21,0.9062,0
4,0,137,40,35,168,43.1,2.288,33,1.379,1


In [None]:
# Class balance: count the rows in each outcome class.
# The target is 1/0 at this point; in Python 1 == True and 0 == False,
# so the boolean comparisons below still select the intended rows.
positive_rows = data_frame['diabetes'] == True
negative_rows = data_frame['diabetes'] == False

diabetes_true_count = len(data_frame.loc[positive_rows])
diabetes_false_count = len(data_frame.loc[negative_rows])

In [None]:
# Display (diabetic, non-diabetic) row counts.
(diabetes_true_count,diabetes_false_count)

(268, 500)

# Train Test Split
The data set is split into a training set and a test set so the model can be evaluated on data it was not fitted on.
70% of the rows are used for training and 30% for testing.


In [None]:
from sklearn.model_selection import train_test_split

# Columns fed to the model as inputs.
# NOTE(review): the correlation table earlier in this notebook shows 'skin'
# correlating 1.0 with 'thickness', so 'skin' looks redundant — confirm
# whether it should stay in the feature set.
feature_columns = [
    'num_preg',
    'glucose_conc',
    'diastolic_bp',
    'thickness',
    'insulin',
    'bmi',
    'diab_pred',
    'age',
    'skin',
]

# Target column, kept as a one-element list.
predicted_class = ['diabetes']

In [None]:
# Pull plain arrays out of the frame: the feature matrix and the target.
# The target is selected with a list of column names, so y is 2-D (n_rows, 1).
x = data_frame[feature_columns].values
y = data_frame[predicted_class].values

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=10,
)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,1
1,1,85,66,29,0,26.6,0.351,31,1.1426,0
2,8,183,64,0,0,23.3,0.672,32,0.0,1
3,1,89,66,23,94,28.1,0.167,21,0.9062,0
4,0,137,40,35,168,43.1,2.288,33,1.379,1


*Impute with the mean*

In [None]:
from sklearn.impute import SimpleImputer

# Treat 0 as "missing" and replace it with the column mean.
# NOTE(review): missing_values=0 applies to every feature column, including
# 'num_preg', where 0 is presumably a valid count — confirm whether that
# column should be excluded from imputation.
fill_values = SimpleImputer(missing_values=0, strategy="mean")

# Learn the per-column means from the training split only...
x_train = fill_values.fit_transform(x_train)

# ...and reuse those same means on the test split. Calling fit_transform here
# would re-fit the imputer on the test data, leaking test-set statistics into
# preprocessing and filling train and test with different values.
x_test = fill_values.transform(x_test)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; random_state is fixed for
# repeatable results.
svm_model = SVC(
    kernel='linear',
    C=1,
    random_state=42,
)

# y_train is 2-D (n_rows, 1); ravel() flattens it to the 1-D shape fit() is
# given here. fit() is left as the last expression so the model is displayed.
svm_model.fit(x_train, y_train.ravel())

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
# Predict on the same rows the model was fitted on (used for the training-accuracy check).
prediction_from_trained_data=svm_model.predict(x_train)

*Accuracy on the training data*

In [None]:
from sklearn import metrics

# Fraction of training rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(
    y_train,
    prediction_from_trained_data,
)
accuracy

0.7914338919925512

*Accuracy on the test data*

In [None]:
# Predict on the held-out rows, then score those predictions against the
# true test labels.
svm_predict_test = svm_model.predict(x_test)
svm_accuracy_testdata = metrics.accuracy_score(
    y_test,
    svm_predict_test,
)
svm_accuracy_testdata

0.7489177489177489

In [None]:
# Summarise test-set performance for the SVM.
# labels=[1, 0] puts the positive (diabetes) class first in both the
# confusion matrix and the report.
# The redundant matplotlib re-import was dropped: nothing in this cell plots,
# and pyplot is already imported in the first cell.
print("Confusion Matrix for Support Vector machine")

# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_test, svm_predict_test, labels=[1, 0]))

print()

print("Classification Report\n")
print(metrics.classification_report(y_test, svm_predict_test, labels=[1, 0]))

Confusion Matrix for Support Vector machine
[[ 43  44]
 [ 14 130]]

Classification Report

              precision    recall  f1-score   support

           1       0.75      0.49      0.60        87
           0       0.75      0.90      0.82       144

    accuracy                           0.75       231
   macro avg       0.75      0.70      0.71       231
weighted avg       0.75      0.75      0.73       231

