# **DIABETES PREDICTION USING MACHINE LEARNING**

In [None]:
# Import all required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the diabetes dataset from its CSV file into a DataFrame, then display it.
df = pd.read_csv(filepath_or_buffer="/content/synthetic_healthcare_diabetes.csv")
df

Unnamed: 0,PatientID,Age,Gender,BMI,BloodPressure,Cholesterol,Glucose,Insulin,DiabetesPedigreeFunction,PhysicalActivity,SmokingStatus,DiabetesOutcome
0,1,69,Female,24.43,155,159,109,19.82,0.80,Low,Former smoker,0
1,2,32,Female,17.17,107,247,178,17.49,0.30,Medium,Non-smoker,0
2,3,89,Male,26.74,110,251,170,25.78,1.85,Medium,Former smoker,1
3,4,78,Male,33.80,146,285,141,10.73,0.83,Medium,Current smoker,1
4,5,38,Female,24.01,143,229,139,9.95,0.84,Low,Former smoker,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14996,71,Male,29.21,94,258,163,22.51,0.13,High,Current smoker,1
14996,14997,32,Female,34.44,98,184,142,21.32,2.45,High,Non-smoker,1
14997,14998,67,Female,32.88,105,165,88,21.93,0.75,Medium,Former smoker,0
14998,14999,73,Male,28.50,158,286,92,4.86,0.11,Low,Former smoker,1


In [None]:
# Summarise each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PatientID                 15000 non-null  int64  
 1   Age                       15000 non-null  int64  
 2   Gender                    15000 non-null  object 
 3   BMI                       15000 non-null  float64
 4   BloodPressure             15000 non-null  int64  
 5   Cholesterol               15000 non-null  int64  
 6   Glucose                   15000 non-null  int64  
 7   Insulin                   15000 non-null  float64
 8   DiabetesPedigreeFunction  15000 non-null  float64
 9   PhysicalActivity          15000 non-null  object 
 10  SmokingStatus             15000 non-null  object 
 11  DiabetesOutcome           15000 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.4+ MB


In [None]:
# Feature matrix: the ten predictor columns, selected explicitly by name
# (the target column DiabetesOutcome is not included).
X = df.loc[
    :,
    [
        "Age",
        "Gender",
        "BMI",
        "BloodPressure",
        "Cholesterol",
        "Glucose",
        "Insulin",
        "DiabetesPedigreeFunction",
        "PhysicalActivity",
        "SmokingStatus",
    ],
]
X

Unnamed: 0,Age,Gender,BMI,BloodPressure,Cholesterol,Glucose,Insulin,DiabetesPedigreeFunction,PhysicalActivity,SmokingStatus
0,69,Female,24.43,155,159,109,19.82,0.80,Low,Former smoker
1,32,Female,17.17,107,247,178,17.49,0.30,Medium,Non-smoker
2,89,Male,26.74,110,251,170,25.78,1.85,Medium,Former smoker
3,78,Male,33.80,146,285,141,10.73,0.83,Medium,Current smoker
4,38,Female,24.01,143,229,139,9.95,0.84,Low,Former smoker
...,...,...,...,...,...,...,...,...,...,...
14995,71,Male,29.21,94,258,163,22.51,0.13,High,Current smoker
14996,32,Female,34.44,98,184,142,21.32,2.45,High,Non-smoker
14997,67,Female,32.88,105,165,88,21.93,0.75,Medium,Former smoker
14998,73,Male,28.50,158,286,92,4.86,0.11,Low,Former smoker


In [None]:
# Target vector: the DiabetesOutcome label for every row.
Y = df["DiabetesOutcome"]
Y

Unnamed: 0,DiabetesOutcome
0,0
1,0
2,1
3,1
4,0
...,...
14995,1
14996,1
14997,0
14998,1


In [None]:
# Split features and target into training and test sets.
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# makes the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=40,
)

In [None]:
# Display the training feature rows.
x_train

Unnamed: 0,Age,Gender,BMI,BloodPressure,Cholesterol,Glucose,Insulin,DiabetesPedigreeFunction,PhysicalActivity,SmokingStatus
4741,70,Female,25.52,158,194,177,3.25,0.43,Medium,Current smoker
6160,68,Female,35.51,88,195,97,7.23,0.56,Low,Current smoker
9466,26,Female,36.25,145,276,71,6.49,0.11,Medium,Current smoker
12802,66,Female,30.95,152,296,184,26.83,1.05,High,Current smoker
8391,34,Male,25.24,112,199,196,2.54,1.67,High,Current smoker
...,...,...,...,...,...,...,...,...,...,...
11256,64,Male,36.35,82,165,188,9.40,0.60,High,Non-smoker
14501,58,Male,32.34,96,224,158,24.02,2.49,Medium,Non-smoker
14343,39,Male,23.36,102,238,181,19.64,0.56,Medium,Non-smoker
14555,20,Female,24.34,82,224,145,5.17,1.00,Medium,Non-smoker


In [None]:
# Display the held-out test feature rows.
x_test

Unnamed: 0,Age,Gender,BMI,BloodPressure,Cholesterol,Glucose,Insulin,DiabetesPedigreeFunction,PhysicalActivity,SmokingStatus
6609,74,Male,33.14,81,290,161,12.88,1.38,Low,Former smoker
6460,20,Male,20.41,153,250,188,16.34,0.90,High,Current smoker
10508,41,Female,21.19,119,299,83,10.35,1.60,High,Non-smoker
12107,61,Female,37.18,91,259,145,19.16,0.24,Low,Current smoker
1039,29,Male,25.35,146,257,132,11.92,2.28,High,Current smoker
...,...,...,...,...,...,...,...,...,...,...
11279,61,Female,18.40,138,228,102,26.47,2.47,Medium,Non-smoker
8806,77,Male,24.24,125,200,179,8.08,0.75,Medium,Non-smoker
8337,30,Male,27.74,123,240,185,20.62,1.87,High,Former smoker
13116,50,Female,37.23,134,219,144,17.54,2.28,Low,Former smoker


In [None]:
# Display the training labels.
y_train

Unnamed: 0,DiabetesOutcome
4741,0
6160,1
9466,0
12802,1
8391,1
...,...
11256,1
14501,0
14343,0
14555,0


In [None]:
# Display the held-out test labels.
y_test

Unnamed: 0,DiabetesOutcome
6609,1
6460,0
10508,0
12107,0
1039,1
...,...
11279,0
8806,0
8337,0
13116,1


In [None]:
# The data has now been split into training and testing sets.

In [None]:
"""Before Giving the test data ml algorithm
1. we should convert the all values into numericals(Onehotencoder)
2. there should be no null values(SimpleImputer)
3. use standardscalar to avoid the wrong values"""

'Before Giving the test data ml algorithm\n1. we should convert the all values into numericals(Onehotencoder)\n2. there should be no null values(SimpleImputer)\n3. use standardscalar to avoid the wrong values'

In [None]:
# Separate the numerical features from the categorical ones.
# These are the numeric columns that go through the numerical transformer.
numerical_features = [
    "Age",
    "BMI",
    "BloodPressure",
    "Cholesterol",
    "Glucose",
    "Insulin",
    "DiabetesPedigreeFunction",
]
numerical_features

['Age',
 'BMI',
 'BloodPressure',
 'Cholesterol',
 'Glucose',
 'Insulin',
 'DiabetesPedigreeFunction']

In [None]:
# String-valued columns that go through the categorical transformer.
categorical_features = [
    "Gender",
    "PhysicalActivity",
    "SmokingStatus",
]
categorical_features

['Gender', 'PhysicalActivity', 'SmokingStatus']

In [None]:
# Count the missing values in each numerical feature column.
df[numerical_features].isna().sum()

Unnamed: 0,0
Age,0
BMI,0
BloodPressure,0
Cholesterol,0
Glucose,0
Insulin,0
DiabetesPedigreeFunction,0


In [None]:
# Count the missing values in each categorical feature column.
df[categorical_features].isna().sum()

Unnamed: 0,0
Gender,0
PhysicalActivity,0
SmokingStatus,0


In [None]:
# The two transformers defined below fill in any null values:
#   numerical_transformer
#   categorical_transformer

In [None]:
# Numeric preprocessing: replace missing values with the column mean,
# then standardise each column with StandardScaler.
numerical_transformer = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy="mean")),
    ("StandardScalar", StandardScaler()),
])

In [None]:
# Display the numerical preprocessing pipeline.
numerical_transformer

In [None]:
# Categorical preprocessing: replace missing values with the most frequent
# category, then one-hot encode. handle_unknown="ignore" lets categories that
# were not seen during fitting pass through transform without raising.
categorical_transformer = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("OneHotEncoder", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Display the categorical preprocessing pipeline.
categorical_transformer

In [None]:
# ColumnTransformer: combine the numerical and categorical transformers into a single preprocessor, applying each transformer to its own list of feature columns.

#Preprocessor

In [None]:
# Route each group of columns to its matching transformer.
preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features),
])

In [None]:
# Display the combined column preprocessor.
preprocessor

In [None]:
# With the preprocessor defined, the next step is to build a model pipeline for each algorithm.


# **Logistic Regression**

In [None]:
# Logistic regression pipeline: preprocessing followed by the classifier.
lr_model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", LogisticRegression(solver="liblinear")),
])

In [None]:
# Display the logistic regression pipeline.
lr_model

In [None]:
# Train the pipeline on the training data so it learns the patterns in it.
lr_model = lr_model.fit(x_train, y_train)

In [None]:
# Predict the target labels for the held-out test features.
y_predict = lr_model.predict(x_test)

In [None]:
# Display the logistic regression predictions.
y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Display the true test labels for comparison with the predictions above.
y_test

Unnamed: 0,DiabetesOutcome
6609,1
6460,0
10508,0
12107,0
1039,1
...,...
11279,0
8806,0
8337,0
13116,1


In [None]:
# Baseline test accuracy; the goal is to improve on it with the other algorithms.
lr_acc = accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
# Display the logistic regression test accuracy.
lr_acc

0.7086666666666667

# **KNN**

In [None]:
# K-nearest-neighbours pipeline: preprocessing followed by the classifier
# with its default settings.
knn_model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", KNeighborsClassifier()),
])

In [None]:
# Train the KNN pipeline on the training data.
knn_model.fit(x_train, y_train)

In [None]:
# Predict the target labels for the held-out test features.
knn_y_pred = knn_model.predict(x_test)

In [None]:
# Display the KNN predictions.
knn_y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# Display the true test labels for comparison with the predictions above.
y_test

Unnamed: 0,DiabetesOutcome
6609,1
6460,0
10508,0
12107,0
1039,1
...,...
11279,0
8806,0
8337,0
13116,1


In [None]:
# Fraction of test rows the KNN pipeline labelled correctly.
knn_acc = accuracy_score(y_true=y_test, y_pred=knn_y_pred)

In [None]:
# Display the KNN test accuracy.
knn_acc

0.6386666666666667

# **Decision Tree**

In [None]:
# Decision tree pipeline: preprocessing followed by the classifier.
# random_state is fixed (same seed as the train/test split) because tree
# construction involves random choices; without a seed the fitted tree and its
# accuracy can change between runs.
dt_model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", DecisionTreeClassifier(random_state=40))
])

In [None]:
# Train the decision tree pipeline on the training data.
dt_model.fit(X=x_train, y=y_train)

In [None]:
# Predict the target labels for the held-out test features and display them.
dt_y_predict = dt_model.predict(X=x_test)
dt_y_predict

array([1, 0, 0, ..., 1, 0, 0])

In [None]:
# Fraction of test rows the decision tree pipeline labelled correctly.
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_y_predict)

In [None]:
# Display the decision tree test accuracy.
dt_accuracy

0.5676666666666667

# **Random Forest**

In [None]:
# Random forest pipeline: preprocessing followed by the classifier.
# random_state is fixed (same seed as the train/test split) because the forest
# is built from random bootstrap samples and feature subsets; without a seed the
# reported accuracy changes on every run.
rf_model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", RandomForestClassifier(random_state=40))
])

In [None]:
# Train the random forest pipeline on the training data.
rf_model.fit(X=x_train, y=y_train)

In [None]:
# Predict the target labels for the held-out test features and display them.
rf_y_predict = rf_model.predict(X=x_test)
rf_y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Fraction of test rows the random forest pipeline labelled correctly.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_y_predict)
rf_accuracy

0.705

# **Support** **Vector** **Machine**



In [None]:
# Support vector machine pipeline: preprocessing followed by an SVC
# with its default settings.
svm_model = Pipeline(
    steps=[
        ("Preprocessor", preprocessor),
        ("Classifier", SVC()),
    ]
)

In [None]:
# Train the SVM pipeline on the training data.
svm_model.fit(X=x_train, y=y_train)

In [None]:
# Predict the target labels for the held-out test features and display them.
svm_y_predict = svm_model.predict(X=x_test)
svm_y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Fraction of test rows the SVM pipeline labelled correctly.
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_y_predict)
svm_accuracy

0.7086666666666667

# **Naive Bayes**

In [None]:
# Gaussian Naive Bayes pipeline: preprocessing followed by the classifier.
nb_model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", GaussianNB()),
])

In [None]:
# Train the Naive Bayes pipeline on the training data.
nb_model.fit(x_train, y_train)

In [None]:
# Predict the target labels for the held-out test features and display them.
nb_y_pred = nb_model.predict(x_test)
nb_y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Fraction of test rows the Naive Bayes pipeline labelled correctly.
nb_acc = accuracy_score(y_true=y_test, y_pred=nb_y_pred)
nb_acc

0.7086666666666667

# **Hyperparameter tuning for Logistic Regression**

In [None]:
# Hyperparameter grid for logistic regression. The "Classifier__" prefix
# addresses the pipeline step named "Classifier".
param_grid = dict(
    Classifier__C=[0.01, 0.1, 1, 10, 100],
    Classifier__penalty=["l1", "l2"],
)

In [None]:
# Set up a 5-fold cross-validated grid search over the logistic regression
# pipeline, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

In [None]:
# Run the grid search on the training data.
grid_search.fit(X=x_train, y=y_train)


In [None]:
# Report the winning hyperparameters and their mean cross-validation accuracy.
lr_best_params = grid_search.best_params_
lr_best_cv_accuracy = grid_search.best_score_
print("Best parameters found: ", lr_best_params)
print("Best cross-validation accuracy: {:.2f}".format(lr_best_cv_accuracy))

Best parameters found:  {'Classifier__C': 0.01, 'Classifier__penalty': 'l1'}
Best cross-validation accuracy: 0.71


In [None]:
# Test-set accuracy for Logistic Regression (the label previously said
# "Linear Regression", which is a different model).
# lr_acc is the score of the untuned baseline pipeline; the tuned score comes
# from grid_search, which scores with the best estimator it refit.
print(" Test accuracy for Logistic Regression", lr_acc)
print(" Test accuracy for tuned Logistic Regression", grid_search.score(x_test, y_test))

 Test accuracy for Linear Regression 0.7086666666666667


# **Hyperparameter tuning for Decision Tree**

In [None]:
# Hyperparameter grid for the decision tree. The "Classifier__" prefix
# addresses the pipeline step named "Classifier".
# 'auto' is no longer listed for max_features: it was removed in
# scikit-learn 1.3 and, for classifiers, meant the same as 'sqrt'.
param_grid = {
    'Classifier__criterion': ['gini', 'entropy'],
    'Classifier__splitter': ['best', 'random'],
    'Classifier__max_depth': [None, 10, 20, 30],
    'Classifier__min_samples_split': [2, 5, 10],
    'Classifier__min_samples_leaf': [1, 2, 4],
    'Classifier__max_features': [None, 'sqrt', 'log2']
}

In [None]:
# 5-fold cross-validated grid search over the decision tree pipeline,
# using all available CPU cores and printing progress.
dt_grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)


In [None]:
# Run the decision tree grid search on the training data.
dt_grid_search.fit(X=x_train, y=y_train)


In [None]:
# Report the winning hyperparameters and their mean cross-validation score.
dt_best_params = dt_grid_search.best_params_
dt_best_cv_score = dt_grid_search.best_score_
print("Best Parameters:", dt_best_params)
print("Best accuracy:", dt_best_cv_score)

In [None]:
# Test-set accuracy for the decision tree.
# dt_accuracy is the score of the untuned baseline pipeline; the tuned score
# comes from dt_grid_search, which scores with the best estimator it refit.
print(" Test accuracy for decision tree",dt_accuracy)
print(" Test accuracy for tuned decision tree", dt_grid_search.score(x_test, y_test))

# **Hyperparameter tuning for Random Forest**


In [None]:
# Hyperparameter grid for the random forest. The "Classifier__" prefix
# addresses the pipeline step named "Classifier".
# 'splitter' is a DecisionTreeClassifier option that RandomForestClassifier
# does not accept, so it is not listed here (the search would fail on it).
# 'auto' is no longer listed for max_features: it was removed in
# scikit-learn 1.3 and, for classifiers, meant the same as 'sqrt'.
param_grid = {
    'Classifier__criterion': ['gini', 'entropy'],
    'Classifier__max_depth': [None, 10, 20, 30],
    'Classifier__min_samples_split': [2, 5, 10],
    'Classifier__min_samples_leaf': [1, 2, 4],
    'Classifier__max_features': [None, 'sqrt', 'log2']
}

In [None]:
# 5-fold cross-validated grid search over the random forest pipeline,
# using all available CPU cores and printing progress.
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)


In [None]:
# Hyperparameter grid for the random forest. The "Classifier__" prefix
# addresses the pipeline step named "Classifier".
# 'auto' is no longer listed for max_features: it was removed in
# scikit-learn 1.3 and, for classifiers, meant the same as 'sqrt'.
param_grid = {
    'Classifier__criterion': ['gini', 'entropy'],
    'Classifier__max_depth': [None, 10, 20, 30],
    'Classifier__min_samples_split': [2, 5, 10],
    'Classifier__min_samples_leaf': [1, 2, 4],
    'Classifier__max_features': [None, 'sqrt', 'log2']
}

rf_grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=1)
# Fit exactly once: a second identical call would only repeat the entire search.
rf_grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [None]:
# Report the random forest search results. This cell previously printed the
# decision tree search (dt_grid_search) and dt_accuracy by mistake.
print("Best Parameters:", rf_grid_search.best_params_)
print("Best accuracy:", rf_grid_search.best_score_)

# rf_accuracy is the test accuracy of the untuned baseline random forest.
print(" Test accuracy for random forest",rf_accuracy)

# Project Report

## Diabetes Prediction using Machine Learning

This project aims to predict diabetes outcomes using machine learning algorithms. The dataset used is "synthetic_healthcare_diabetes.csv".

### Data Preprocessing

1. **Feature Selection:** Features like Age, Gender, BMI, BloodPressure, etc. are used for prediction, while 'DiabetesOutcome' is the target variable.
2. **Train-Test Split:** The data is split into training (80%) and testing (20%) sets using `train_test_split`.
3. **Handling Missing Values:** `SimpleImputer` is used to fill missing values in numerical features with the mean and in categorical features with the most frequent value.
4. **Encoding Categorical Features:** `OneHotEncoder` converts categorical features into numerical representations.
5. **Scaling Numerical Features:** `StandardScaler` standardizes numerical features to have zero mean and unit variance.

### Model Selection and Evaluation

Several machine learning models are trained and evaluated:

* **Logistic Regression:** Achieved a test accuracy of 0.7087.
* **K-Nearest Neighbors (KNN):** Achieved a test accuracy of 0.6387.
* **Decision Tree:** Achieved a test accuracy of 0.5677.
* **Random Forest:** Achieved a test accuracy of 0.7050.
* **Support Vector Machine (SVM):** Achieved a test accuracy of 0.7087.
* **Naive Bayes:** Achieved a test accuracy of 0.7087.

### Hyperparameter Tuning

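GridSearchCV with 5-fold cross-validation is used to tune hyperparameters for the Logistic Regression, Decision Tree, and Random Forest models. For Logistic Regression, the best cross-validation accuracy (0.71) is essentially the same as the untuned test accuracy (0.7087), so tuning did not produce a measurable improvement for that model.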

### Conclusion

The project demonstrates the application of various machine learning algorithms for diabetes prediction.
Further improvements can be explored by experimenting with different feature engineering techniques, model ensembles, and
more advanced hyperparameter optimization methods.
