Consider the diabetes dataset available on kaggle 
(https://www.kaggle.com/code/milanvaddoriya/grid-search-cv-diabetes-dataset/data)
Columns details are as follow:
Pregnancies: Number of times pregnant
Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
BloodPressure: Diastolic blood pressure (mm Hg)
SkinThickness: Triceps skin fold thickness (mm)
Insulin: 2-Hour serum insulin (mu U/ml)
BMI: Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction: Diabetes pedigree function
Age: Age (years)
Outcome: Class variable (0 or 1)

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [16]:
df=pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [18]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

In [19]:
x.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [20]:
x.replace(0, np.nan, inplace=True)

In [21]:
x.isnull().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

In [22]:
x["Outcome"] = df["Outcome"]
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,0
2,8.0,183.0,64.0,,,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2.0,122.0,70.0,27.0,,36.8,0.340,27,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1.0,126.0,60.0,,,30.1,0.349,47,1


In [23]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean')
imputer = imputer.fit(x)
X = imputer.transform(x)
X

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [24]:
df = pd.DataFrame(X, columns=x.columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [25]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [26]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaler = scaler.fit_transform(x)
df_final = pd.DataFrame(df_scaler, columns=x.columns)
df_final.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.5062549,0.865108,-0.033518,0.6655021,-3.345079e-16,0.166292,0.468492,1.425995
1,-1.175289,-1.206162,-0.529859,-0.01746338,-3.345079e-16,-0.852531,-0.365061,-0.190672
2,1.178873,2.015813,-0.695306,8.087936e-16,-3.345079e-16,-1.332833,0.604397,-0.105584
3,-1.175289,-1.074652,-0.529859,-0.7004289,-0.7243887,-0.634212,-0.920763,-1.041549
4,2.987023e-16,0.503458,-2.680669,0.6655021,0.1465506,1.54898,5.484909,-0.020496


In [37]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [38]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)

In [39]:
y_pred = model.predict(x_test)

In [40]:
y_pred_train = model.predict(x_train)

In [46]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy: ", accuracy)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

Accuracy:  0.7690875232774674
              precision    recall  f1-score   support

         0.0       0.79      0.89      0.84       358
         1.0       0.70      0.53      0.61       179

    accuracy                           0.77       537
   macro avg       0.75      0.71      0.72       537
weighted avg       0.76      0.77      0.76       537

[[318  40]
 [ 84  95]]


In [47]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

Accuracy:  0.7142857142857143
              precision    recall  f1-score   support

         0.0       0.73      0.86      0.79       142
         1.0       0.68      0.48      0.57        89

    accuracy                           0.71       231
   macro avg       0.70      0.67      0.68       231
weighted avg       0.71      0.71      0.70       231

[[122  20]
 [ 46  43]]


In [48]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid = GridSearchCV(model,param_grid,cv=5)
grid.fit(x_train, y_train)
best_model = grid.best_estimator_
print(best_model)

LogisticRegression(C=10, solver='liblinear')


In [49]:
model1 = LogisticRegression()
model1.fit(x_train, y_train)

In [51]:
y_pred_train = model1.predict(x_train)

In [52]:
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy: ", accuracy)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

Accuracy:  0.7690875232774674
              precision    recall  f1-score   support

         0.0       0.79      0.89      0.84       358
         1.0       0.70      0.53      0.61       179

    accuracy                           0.77       537
   macro avg       0.75      0.71      0.72       537
weighted avg       0.76      0.77      0.76       537

[[318  40]
 [ 84  95]]


In [53]:
y_pred = model1.predict(x_test)

In [54]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy:  0.7142857142857143
              precision    recall  f1-score   support

         0.0       0.73      0.86      0.79       142
         1.0       0.68      0.48      0.57        89

    accuracy                           0.71       231
   macro avg       0.70      0.67      0.68       231
weighted avg       0.71      0.71      0.70       231

[[122  20]
 [ 46  43]]


In [55]:
import pickle
pickle.dump(model1, open('diabetes.csv', 'wb'))

In [56]:
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier()
model2.fit(x_train, y_train)

In [57]:
pred_y = model2.predict(x_test)
accuracy = accuracy_score(y_test, pred_y)
print("Accuracy: ", accuracy)
print(classification_report(y_test, pred_y))
print(confusion_matrix(y_test, pred_y))

Accuracy:  0.7532467532467533
              precision    recall  f1-score   support

         0.0       0.76      0.87      0.81       142
         1.0       0.74      0.56      0.64        89

    accuracy                           0.75       231
   macro avg       0.75      0.72      0.73       231
weighted avg       0.75      0.75      0.75       231

[[124  18]
 [ 39  50]]


In [58]:
pred_y_train = model2.predict(x_train)

In [59]:
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy: ", accuracy)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

Accuracy:  0.7690875232774674
              precision    recall  f1-score   support

         0.0       0.79      0.89      0.84       358
         1.0       0.70      0.53      0.61       179

    accuracy                           0.77       537
   macro avg       0.75      0.71      0.72       537
weighted avg       0.76      0.77      0.76       537

[[318  40]
 [ 84  95]]


In [60]:
pickle.dump(model2, open('diabetes.csv', 'wb'))