### **The objective of this project is to develop a model that can accurately classify whether an individual is likely to have diabetes.**

In [None]:
#importing the required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the diabetes prediction dataset from Google Drive into a DataFrame.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount call is visible in this notebook - confirm.
csv_path = '/content/drive/MyDrive/Datasets/diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to confirm the file loaded and to see the column layout
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Summarise each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Count missing values per column
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [None]:
# (rows, columns) of the dataset before any cleaning
df.shape

(100000, 9)

In [None]:
# Remove fully duplicated rows, modifying df in place (the first occurrence of each row is kept by default).
# The shapes displayed before and after this cell show the row count going from 100000 to 96146.
df.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) after de-duplication, to see how many rows were removed
df.shape

(96146, 9)

In [None]:
# Encode gender as a binary flag: 1 where the value is 'Male', 0 for every other value.
# NOTE(review): any category other than 'Male'/'Female' would also be folded into 0 - confirm that is intended.
# NOTE: re-running this cell after the conversion sets every row to 0, because the integer codes never equal 'Male'.
is_male = df['gender'].eq('Male')
df['gender'] = is_male.astype('int64')

In [None]:
# Confirm that gender now holds 0/1 codes instead of strings
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,never,25.19,6.6,140,0
1,0,54.0,0,0,No Info,27.32,6.6,80,0
2,1,28.0,0,0,never,27.32,5.7,158,0
3,0,36.0,0,0,current,23.45,5.0,155,0
4,1,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# List the distinct smoking_history categories before encoding them
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [None]:
# Number of rows in each smoking_history category
df['smoking_history'].value_counts()

never          34398
No Info        32887
former          9299
current         9197
not current     6367
ever            3998
Name: smoking_history, dtype: int64

In [None]:
# Replace the smoking_history strings with integer codes.
# LabelEncoder numbers the classes in sorted order; the fitted mapping is available as le.classes_.
# NOTE(review): the codes impose an arbitrary ordering on a nominal feature - consider one-hot encoding.
le = LabelEncoder()
df['smoking_history'] = le.fit_transform(df['smoking_history'])

In [None]:
# Confirm that smoking_history is now integer-coded
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [None]:
# Pairwise correlation matrix of all columns (every column is numeric after the encoding steps above);
# the `diabetes` row/column shows how strongly each feature tracks the label.
df.corr()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
gender,1.0,-0.02858,0.01424,0.078539,-0.079621,-0.023488,0.019944,0.017824,0.037883
age,-0.02858,1.0,0.257305,0.238451,0.241128,0.344797,0.106719,0.114317,0.264927
hypertension,0.01424,0.257305,1.0,0.119982,0.087892,0.148111,0.081441,0.084834,0.19571
heart_disease,0.078539,0.238451,0.119982,1.0,0.023608,0.061376,0.06814,0.070832,0.170711
smoking_history,-0.079621,0.241128,0.087892,0.023608,1.0,0.18384,0.035518,0.038259,0.088471
bmi,-0.023488,0.344797,0.148111,0.061376,0.18384,1.0,0.084423,0.092543,0.214932
HbA1c_level,0.019944,0.106719,0.081441,0.06814,0.035518,0.084423,1.0,0.171615,0.406408
blood_glucose_level,0.017824,0.114317,0.084834,0.070832,0.038259,0.092543,0.171615,1.0,0.424336
diabetes,0.037883,0.264927,0.19571,0.170711,0.088471,0.214932,0.406408,0.424336,1.0


In [None]:
# Separate the label from the predictors: X keeps every column except the label, y is the label itself.
target_column = 'diabetes'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the label is imbalanced (mean of `diabetes` is about 0.085 in the describe() output);
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### **1. LOGISTIC REGRESSION MODEL**

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is set to 500 - presumably raised so the solver converges on the unscaled features; confirm.
model=LogisticRegression(max_iter=500)
model.fit(X_train,y_train)

In [None]:
# Predict the class label for every row of the held-out test split with the logistic-regression model
pred=model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the logistic-regression predictions on the test split
cr = classification_report(y_true=y_test, y_pred=pred)
print(cr)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     26387
           1       0.84      0.63      0.72      2457

    accuracy                           0.96     28844
   macro avg       0.91      0.81      0.85     28844
weighted avg       0.96      0.96      0.96     28844



### **2. SVC MODEL**

In [None]:
#fitting the SVC Model
# SVMs are sensitive to feature scale, and the features here span very different ranges
# (0/1 flags such as hypertension next to blood_glucose_level values up to 300), so the
# inputs are standardised first. Putting the scaler and the classifier in one pipeline means
# model2.predict() re-applies the scaling learned from the training split automatically,
# and nothing from the test split leaks into the scaler.
model2=make_pipeline(StandardScaler(),SVC())
model2.fit(X_train,y_train)

In [None]:
# Predict the class label for every row of the held-out test split with the SVC model
predict2=model2.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the SVC predictions on the test split
cr2 = classification_report(y_true=y_test, y_pred=predict2)

In [None]:
# Show the SVC classification report
print(cr2)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     26387
           1       1.00      0.39      0.56      2457

    accuracy                           0.95     28844
   macro avg       0.97      0.70      0.77     28844
weighted avg       0.95      0.95      0.94     28844



### **3. RANDOM FOREST CLASSIFIER MODEL**

In [None]:
# Fit a random-forest classifier on the training split.
# random_state is fixed so the forest is built the same way on every run.
model3=RandomForestClassifier(random_state=2525)
model3.fit(X_train,y_train)

In [None]:
# Predict the class label for every row of the held-out test split with the random forest
predict3=model3.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the random-forest predictions on the test split
cr3 = classification_report(y_true=y_test, y_pred=predict3)
print(cr3)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     26387
           1       0.94      0.69      0.80      2457

    accuracy                           0.97     28844
   macro avg       0.96      0.84      0.89     28844
weighted avg       0.97      0.97      0.97     28844



### **4. DECISION TREE CLASSIFIER MODEL**

In [None]:
#Fitting the DTC Model
# Fix random_state so the tree (and therefore the report below) is reproducible across runs,
# matching the seeded train/test split and the seeded random forest.
model4=DecisionTreeClassifier(random_state=0)
model4.fit(X_train,y_train)

In [None]:
# Predict the class label for every row of the held-out test split with the decision tree
pred4=model4.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the decision-tree predictions on the test split
cr4 = classification_report(y_true=y_test, y_pred=pred4)
print(cr4)

              precision    recall  f1-score   support

           0       0.98      0.97      0.97     26387
           1       0.69      0.74      0.71      2457

    accuracy                           0.95     28844
   macro avg       0.83      0.85      0.84     28844
weighted avg       0.95      0.95      0.95     28844



In [None]:
# Re-display the encoded frame as a reference for the column order used by the manual input below
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [None]:
# One hand-built sample, listed in the same column order as X:
# gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level
input_data=(0,55,1,0,2,22.5,7.2,120)

# Wrap the sample in a one-row DataFrame labelled with the training columns.
# model3 was fitted on a DataFrame, so passing named columns keeps the input consistent with
# training (avoiding scikit-learn's feature-name warning) and fails loudly if the tuple has
# the wrong number of values.
input_frame=pd.DataFrame([input_data],columns=X.columns)

prediction=model3.predict(input_frame)
print(prediction)

# Class 0 is "no diabetes"; any other predicted label is reported as diabetic.
if (prediction[0] == 0):
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[1]
The person is diabetic




In [None]:
import pickle

In [None]:

# Serialise the fitted random forest (model3) to trained_model.pkl in the working directory
with open("trained_model.pkl", mode="wb") as model_file:
  pickle.dump(model3, model_file)


In [None]:
# Reload the classifier that was pickled to trained_model.pkl, so this cell checks the saved
# artefact rather than the in-memory model. Without this step `loaded_model` is undefined.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook.
with open("trained_model.pkl", "rb") as file:
  loaded_model=pickle.load(file)

# One hand-built sample, listed in the same column order as X:
# gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level
input_data=(0,55,1,0,2,22.5,7.2,120)

# Wrap the sample in a one-row DataFrame labelled with the training columns, so the input
# matches what the model was fitted on and a wrong-length tuple fails loudly.
input_frame=pd.DataFrame([input_data],columns=X.columns)

prediction=loaded_model.predict(input_frame)
print(prediction)

# Class 0 is "no diabetes"; any other predicted label is reported as diabetic.
if (prediction[0] == 0):
  print("The person is not diabetic")
else:
  print("The person is diabetic")