In [7]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [9]:
diabetes = pd.read_csv("PimaDiabetes.csv")

In [10]:
diabetes.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
diabetes.shape

(750, 9)

In [12]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,Outcome
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,3.844,120.737333,68.982667,20.489333,80.378667,31.959067,0.473544,33.166667,0.346667
std,3.370085,32.019671,19.508814,15.918828,115.019198,7.927399,0.332119,11.708872,0.476226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,36.5,32.0,0.377,29.0,0.0
75%,6.0,140.75,80.0,32.0,129.75,36.575,0.6285,40.75,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [13]:
diabetes['Outcome'].value_counts()

Outcome
0    490
1    260
Name: count, dtype: int64

In [14]:

diabetes.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.281633,110.008163,68.118367,19.536735,69.416327,30.286531,0.432278,31.181633
1,4.903846,140.957692,70.611538,22.284615,101.038462,35.111154,0.551315,36.907692


In [15]:
diabetes.isnull().any()

Pregnancies         False
Glucose             False
BloodPressure       False
SkinThickness       False
Insulin             False
BMI                 False
DiabetesPedigree    False
Age                 False
Outcome             False
dtype: bool

In [16]:
diabetes.isna().sum()

Pregnancies         0
Glucose             0
BloodPressure       0
SkinThickness       0
Insulin             0
BMI                 0
DiabetesPedigree    0
Age                 0
Outcome             0
dtype: int64

In [17]:
X = diabetes.drop(columns = 'Outcome', axis=1)
Y = diabetes['Outcome']

In [18]:
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
745           12      100             84             33      105  30.0   
746            1      147             94             41        0  49.3   
747            1       81             74             41       57  46.3   
748            3      187             70             22      200  36.4   
749            6      162             62              0        0  24.3   

     DiabetesPedigree  Age  
0               0.627   50  
1               0.351   31  
2               0.672   

In [19]:
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
745    0
746    1
747    0
748    1
749    1
Name: Outcome, Length: 750, dtype: int64


Train Test Split

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [21]:
print(X.shape, X_train.shape, X_test.shape)

(750, 8) (600, 8) (150, 8)


In [22]:
models = { 
    "LogisticRegression":LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradiest Boost": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(kernel='linear',probability=True)
}

In [23]:
mlflow.set_experiment("Diabetes Prediction Models")

<Experiment: artifact_location='file:///c:/abhishek/MANCHESTER/Diabetes%20Prediction%20Deployment/mlruns/975090899472897953', creation_time=1738014274998, experiment_id='975090899472897953', last_update_time=1738014274998, lifecycle_stage='active', name='Diabetes Prediction Models', tags={}>

In [24]:
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train,Y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(Y_test, y_pred)
        precision = precision_score(Y_test, y_pred, zero_division=0)
        recall = recall_score(Y_test, y_pred, zero_division=0)
        f1 = f1_score(Y_test, y_pred, zero_division=0)

        mlflow.log_param("model_name_V2", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.sklearn.log_model(model, artifact_path="model")

        print(f"{model_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
print("Models have been evaluated and logged in MLflow.")



LogisticRegression - Accuracy: 0.7733, Precision: 0.7500, Recall: 0.5192, F1 Score: 0.6136




Random Forest - Accuracy: 0.7533, Precision: 0.6923, Recall: 0.5192, F1 Score: 0.5934




Gradiest Boost - Accuracy: 0.7867, Precision: 0.7500, Recall: 0.5769, F1 Score: 0.6522




Support Vector Machine - Accuracy: 0.7933, Precision: 0.7838, Recall: 0.5577, F1 Score: 0.6517
Models have been evaluated and logged in MLflow.


In [25]:
# Replace 'best_run_id' with the actual Run ID of the best model
best_run_id = "d376b4b3f8b9441d98ba4b7c27040555"
best_model_uri = f"runs:/{best_run_id}/model"

# Load the model
best_model = mlflow.sklearn.load_model(best_model_uri)

# Make predictions
predictions = best_model.predict(X_test)
print(predictions)


[0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 1 0 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0
 1 0]
