In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [4]:
# Load the diabetes dataset from the CSV file into a DataFrame so it can be analysed.
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [5]:
# Preview the first five rows to check that the columns loaded as expected.
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple.
diabetes_dataset.shape

(768, 9)

In [7]:
# Class balance: how many rows carry each Outcome label.
diabetes_dataset.loc[:, 'Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [8]:
# Outcome labels: 0 = non-diabetic, 1 = diabetic.
# Compare the mean of every feature between the two classes.
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [9]:
# The per-class means above show that the diabetic group has a higher average
# age and a higher average blood glucose level than the non-diabetic group.
# Split the dataset into the target labels (Y) and the feature matrix (X).
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns=['Outcome'])



In [10]:
# Show the feature matrix (every column except Outcome).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [11]:
# Show the target labels (the Outcome column).
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [12]:
# Standardise the features: fit the scaler on X, then transform X with the
# fitted scaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics influence the scaling -- consider
# fitting on the training split only.
scaler = StandardScaler().fit(X)
standardized_data = scaler.transform(X)

In [13]:
# From here on X holds the standardised features; Y is still the Outcome column.
X, Y = standardized_data, diabetes_dataset['Outcome']

In [14]:
# Inspect the standardised feature array, followed by the label Series.
print(X)
print(Y)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [15]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions
# of Y in both splits, and random_state=2 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

In [16]:
# Sanity-check the split sizes: full data, training split, test split.
print(X.shape,X_train.shape,X_test.shape)

(768, 8) (614, 8) (154, 8)


In [17]:
# Fit a support vector classifier with a linear kernel on the training split.
classifier = svm.SVC(kernel='linear')
classifier.fit(X=X_train, y=Y_train)

SVC(kernel='linear')

In [18]:
# Accuracy on the training data (the same rows the model was fitted on).
# accuracy_score expects (y_true, y_pred); pass both by keyword so the true
# labels and the predictions cannot be swapped.
# The variable name is kept as-is because the following cell prints it.
XtrainPrediction = classifier.predict(X_train)
Xtrain_accuract_score = accuracy_score(y_true=Y_train, y_pred=XtrainPrediction)

In [19]:
# Report the accuracy measured on the training split.
print('accuracy score for training data: ',Xtrain_accuract_score)

accuracy score for training data:  0.7866449511400652


In [20]:
# Accuracy on the held-out test data, which the model did not see during fitting.
# accuracy_score expects (y_true, y_pred); pass both by keyword so the true
# labels and the predictions cannot be swapped.
XtestPrediction = classifier.predict(X_test)
Xtest_accuracy_score = accuracy_score(y_true=Y_test, y_pred=XtestPrediction)
print("Accuracy score for testing data : ",Xtest_accuracy_score )

Accuracy score for testing data :  0.7727272727272727


In [22]:
# Predict the outcome for a single new patient record.
# The values follow the feature column order of the dataset: Pregnancies,
# Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age.
give_input = (1,103,30,38,83,43.3,0.183,33)
# Convert the input tuple into a NumPy array.
numpy_array = np.asarray(give_input)
# The model was trained on 2-D data with 8 feature columns, so reshape the
# single record from shape (8,) to (1, 8).
reshaped_input = numpy_array.reshape(1,-1)
# A scaler fitted on a DataFrame remembers the column names (scikit-learn 1.0+)
# and warns when it is later given a bare array. Re-attach the stored names
# when they exist; otherwise pass the array through unchanged.
feature_names = getattr(scaler, 'feature_names_in_', None)
if feature_names is not None:
    model_input = pd.DataFrame(reshaped_input, columns=feature_names)
else:
    model_input = reshaped_input
# Standardise the record with the same scaler that was fitted on the dataset.
std_input_data = scaler.transform(model_input)
print(std_input_data)

final_prediction = classifier.predict(std_input_data)
print(final_prediction)


[[-0.84488505 -0.56004775 -2.02166474  1.09545411  0.02778979  1.43512945
  -0.87244072 -0.0204964 ]]
[0]




In [23]:
# Translate the predicted label into a readable result (1 = diabetic).
print("Diabetic" if final_prediction[0] == 1 else "Non_diabetic")

Non_diabetic
