Consider the diabetes dataset available on kaggle (https://www.kaggle.com/code/milanvaddoriya/grid-search-cv-diabetes-dataset/data) Columns details are as follow: Pregnancies: Number of times pregnant Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test BloodPressure: Diastolic blood pressure (mm Hg) SkinThickness: Triceps skin fold thickness (mm) Insulin: 2-Hour serum insulin (mu U/ml) BMI: Body mass index (weight in kg/(height in m)^2) DiabetesPedigreeFunction: Diabetes pedigree function Age: Age (years) Outcome: Class variable (0 or 1)

In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import pickle

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score , f1_score ,classification_report, confusion_matrix

In [115]:
df=pd.read_csv("diabetes.xls")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [116]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [118]:
#replacing 0 values of SkinThickness and Insulin with mean
m1=df["SkinThickness"].mean()
df.SkinThickness= df.SkinThickness.replace(0,m1)
m2=df["Insulin"].mean()
df.Insulin= df.Insulin.replace(0,m2)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 54.1 KB


In [119]:
X=df.drop(df.Outcome)
y=df.Outcome
X_columns=list(X.columns)
X_columns


['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [120]:
scaler=StandardScaler()
X=scaler.fit_transform(X)
X

array([[ 1.23290740e+00,  1.94334166e+00, -2.63619834e-01, ...,
         6.03883389e-01, -1.03976332e-01,  1.36708224e+00],
       [-8.44669430e-01, -9.98555001e-01, -1.60355813e-01, ...,
        -9.19639306e-01, -1.03998549e+00, -7.31484887e-01],
       [-1.14146612e+00,  5.03690100e-01, -1.50278808e+00, ...,
         5.47915601e+00, -1.88845902e-02,  1.36708224e+00],
       ...,
       [ 3.42517329e-01,  2.94173323e-03,  1.49436249e-01, ...,
        -6.84322929e-01, -2.74159816e-01, -7.31484887e-01],
       [-8.44669430e-01,  1.59425598e-01, -4.70147875e-01, ...,
        -3.70567761e-01,  1.17239980e+00,  1.36708224e+00],
       [-8.44669430e-01, -8.73367909e-01,  4.61722286e-02, ...,
        -4.73141566e-01, -8.69802008e-01, -7.31484887e-01]])

In [121]:
X=pd.DataFrame(X,columns=X_columns)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 766 entries, 0 to 765
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               766 non-null    float64
 1   Glucose                   766 non-null    float64
 2   BloodPressure             766 non-null    float64
 3   SkinThickness             766 non-null    float64
 4   Insulin                   766 non-null    float64
 5   BMI                       766 non-null    float64
 6   DiabetesPedigreeFunction  766 non-null    float64
 7   Age                       766 non-null    float64
 8   Outcome                   766 non-null    float64
dtypes: float64(9)
memory usage: 54.0 KB


In [122]:
y=y[:766]
y

0      1
1      0
2      1
3      0
4      1
      ..
761    1
762    0
763    0
764    0
765    0
Name: Outcome, Length: 766, dtype: int64

In [123]:
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.2,stratify=y)

In [124]:
model_logistic=LogisticRegression()
model_logistic=model_logistic.fit(X_train,y_train)
model_logistic

In [125]:
y_pred=model_logistic.predict(X_test)

In [126]:
accuracy=accuracy_score(y_test,y_pred)
print("accuracy : =",accuracy)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

accuracy : = 0.6428571428571429
              precision    recall  f1-score   support

           0       0.65      0.97      0.78       100
           1       0.40      0.04      0.07        54

    accuracy                           0.64       154
   macro avg       0.53      0.50      0.42       154
weighted avg       0.56      0.64      0.53       154

[[97  3]
 [52  2]]


In [None]:
model_randomForest=RandomForestClassifier()
model_randomForest.fit(X_train,y_train)


In [136]:
y_pred_RF=model_randomForest.predict(X_test)
accuracy_rf=accuracy_score(y_test,y_pred_RF)
print(classification_report(y_pred_RF,y_test))
print(confusion_matrix(y_test,y_pred_RF))


              precision    recall  f1-score   support

           0       0.89      0.64      0.74       139
           1       0.07      0.27      0.12        15

    accuracy                           0.60       154
   macro avg       0.48      0.45      0.43       154
weighted avg       0.81      0.60      0.68       154

[[89 11]
 [50  4]]
