In [None]:
import numpy as np
import pandas as pd
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.preprocessing as pp
from sklearn.metrics import (confusion_matrix, matthews_corrcoef, 
                             brier_score_loss, accuracy_score, roc_auc_score)
import warnings

warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("/Users/Siddhesh/My Files/VIT/PFE/PFECP/Data/diabetes.csv")

In [5]:
cols_with_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in cols_with_zero:
    df[col] = df[col].replace(0, df[col].median())

In [None]:
X = df.drop(columns="Outcome", axis=1)
Y = df["Outcome"]
#data split into feature and lables/target

In [None]:
X_train, X_test, Y_train, Y_test = ms.train_test_split(X, Y, test_size=0.2, random_state=45, stratify=Y)
#split into training and testing dataset

In [None]:
scaler = pp.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
#normalization of data

In [None]:
model = lm.LogisticRegressionCV(cv=5, scoring="roc_auc", class_weight="balanced", 
                                 max_iter=1000, random_state=42)
model.fit(X_train_scaled, Y_train)

In [None]:
X_test_pred = model.predict(X_test) #trained data prediction
trainingscore = model.score(X_train, Y_train)
y_pred = model.predict(X_test_scaled)#tested data prediction
y_prob = model.predict_proba(X_test_scaled)[:, 1]

In [None]:

inputstr = [6,148,72,35,0,33.6,0.627,50]
inputarr = np.array(inputstr,dtype=float).reshape(1, -1)
prediction = model.predict(inputarr)
print(prediction)

if prediction[0] == 0:
    print("The person does not have a diabetes")
else:
    print("The person has a diabetes")

[1]
The person has a diabetes


In [14]:
print("Training Score: ", trainingscore)

Training Score:  0.7845210280373831


In [15]:
import pickle
filename = 'diabetes_lr.pkl'
pickle.dump(model, open(filename, 'wb'))
loaded_model = pickle.load(open('diabetes_lr.pkl', 'rb'))

In [16]:
# Generate classification report
from sklearn.metrics import classification_report
report = classification_report(Y_test, X_test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.35      1.00      0.52        54

    accuracy                           0.35       154
   macro avg       0.18      0.50      0.26       154
weighted avg       0.12      0.35      0.18       154



In [18]:

# Compute confusion matrix values
tn, fp, fn, tp = confusion_matrix(Y_test, y_pred).ravel()

# Compute required metrics
tpr = tp / (tp + fn)  # Sensitivity / Recall
tnr = tn / (tn + fp)  # Specificity
mcc = matthews_corrcoef(Y_test, y_pred)  # MCC
brier = brier_score_loss(Y_test, y_prob)  # Brier Score
accuracy = accuracy_score(Y_test, y_pred)  # Accuracy
auc = roc_auc_score(Y_test, y_prob)  # AUC-ROC

In [19]:
# Print results
print(f"TPR (Sensitivity): {tpr:.4f}")
print(f"TNR (Specificity): {tnr:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"Brier Score: {brier:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC-ROC: {auc:.4f}")

TPR (Sensitivity): 0.6852
TNR (Specificity): 0.8100
MCC: 0.4912
Brier Score: 0.1539
Accuracy: 0.7662
AUC-ROC: 0.8478
