#Diabetes Prediction Model

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


##Data Collection and Preprocessing:
Extractrd data contains tests results for various components.
                          

In [4]:
#Loading the Diabetes Dataset to Pandas Dataframe
diabetes_data= pd.read_csv('/content/Diabetes Data.csv')

In [5]:
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
diabetes_data.shape

(768, 9)

In [8]:
#Extracting statistical measures from the data
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
diabetes_data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [10]:
#Proportion of Non Diabetic[0] people is more in our dataset

In [13]:
diabetes_data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


This helps us understand that:


1.   People with higher levels of Glucose, Blood Pressure, Insulin Index and BMI are susseptible to being Diabetic.
2.   With increasing age, there are elevating chances of people being diagnosed with Diabetes.





In [16]:
#Seperating data and labels
x= diabetes_data.drop(columns='Outcome', axis=1)
y= diabetes_data['Outcome']

#axis= 1 for dropping a coloumn & axis=0 for dropping a row

In [17]:
print(x)
print(y)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [18]:
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Data Standardization

In [19]:
scaler= StandardScaler()


In [20]:
scaler.fit(x)

In [21]:
standardized_data= scaler.transform(x)

In [22]:
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [23]:
x= standardized_data
y= diabetes_data['Outcome']

Splitting Data into Test and Training Data

In [25]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [26]:
print(x_train.shape, x_test.shape)

(614, 8) (154, 8)


Training the SVM Model

In [28]:
classifier= svm.SVC(kernel='linear')

In [29]:
#Training SVM Classifier
classifier.fit(x_train, y_train)

#Model Evaluation after training

In [31]:
#Accuracy Score

In [32]:
x_train_acc= classifier.predict(x_train)

In [35]:
training_data_accuracy= accuracy_score(x_train_acc, y_train)

In [36]:
print('Accuracy score of Training Data: ', training_data_accuracy)

Accuracy score of Training Data:  0.7866449511400652


#Using Test Data

In [37]:
x_test_acc= classifier.predict(x_test)
test_data_accuracy= accuracy_score(x_test_acc, y_test)
print('Accuracy score of Test Data: ', test_data_accuracy)

Accuracy score of Test Data:  0.7727272727272727


In [38]:
#Model has performed well over the training data and test data

#Making a Predictive System

In [49]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

user_input = input("Please enter the values separated by spaces (e.g., 8 183 64 0 0 23.3 0.672 32): ")

# Split the user input string into a list of values
user_input_values = user_input.split()

# Check if the user entered exactly 8 values
if len(user_input_values) != 8:
    print("Please enter exactly 8 values separated by spaces.")
else:
    # Convert the input values to floats
    input_data = [float(value) for value in user_input_values]

    # Changing data to a NumPy array
    input_data_to_array = np.asarray(input_data)

    # Reshape the array to predict for a single value
    input_data_reshape = input_data_to_array.reshape(1, -1)

    # Standardize the input data (assuming you have a 'scaler' defined elsewhere)
    std_data = scaler.transform(input_data_reshape)

    print("Standardized input data:", std_data)

    # Assuming 'classifier' is defined elsewhere, you can now make a prediction
    prediction = classifier.predict(std_data)

    if(prediction== '0'):
      print('The person is not diagnosed with Diabetes')
    else:
      print('The person is diagnosed with Diabetes')

Please enter the values separated by spaces (e.g., 8 183 64 0 0 23.3 0.672 32): 30 150 70 1 1 25.5 0.6 45
Standardized input data: [[ 7.76714237  0.91091809  0.04624525 -1.22548415 -0.68420768 -0.82403312
   0.38694877  1.00055664]]
The person is diagnosed with Diabetes
