In [None]:
1. Data load
2. EDA
3. PreProcessing
4. Feature selection or feature engineering
5. Data transformation
6. Model build
7. Model evaluate
8. Hyperparameter Tuning
9. Model evaluate
10. Save the model
11. New dataset predict 

In [1]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib  # For saving/loading model
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Step 2: Load the Data
df = pd.read_csv("F:/0. The Data Psychology\\4. New_Machine Learning\\2. Logistic Regression\\diabetes.xls")
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [5]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
# Step 3: Check for missing values or zeros in critical columns
print("\nColumns with zeros which may represent missing values:")
cols_with_zero_invalid = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
print((df[cols_with_zero_invalid] == 0).sum())


Columns with zeros which may represent missing values:
Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64


In [30]:
cols_with_zero_invalid

['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [37]:
df[cols_with_zero_invalid]==1

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
763,False,False,False,False,False
764,False,False,False,False,False
765,False,False,False,False,False
766,False,False,False,False,False


In [8]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
# Step 4: Replace 0 with NaN, then impute with median
df[cols_with_zero_invalid] = df[cols_with_zero_invalid].replace(0, np.nan)
df.fillna(df.median(numeric_only=True), inplace=True)

In [10]:
print((df[cols_with_zero_invalid] == 0).sum())

Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64


In [11]:
# Step 5: Split into features and target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [12]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47


In [13]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [14]:
# Step 6: Feature Scaling (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_scaled

array([[ 0.63994726,  0.86604475, -0.03198993, ...,  0.16661938,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.20506583, -0.5283186 , ..., -0.85219976,
        -0.36506078, -0.19067191],
       [ 1.23388019,  2.01666174, -0.69376149, ..., -1.33250021,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 , -0.02157407, -0.03198993, ..., -0.910418  ,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.14279979, -1.02464727, ..., -0.34279019,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.94206766, -0.19743282, ..., -0.29912651,
        -0.47378505, -0.87137393]])

In [16]:
# Step 7: Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [17]:
X_train

array([[-0.54791859, -1.2379406 , -0.03198993, ..., -0.02258989,
        -0.50700636, -1.04154944],
       [ 1.53084665, -0.31744701,  0.79522451, ..., -0.61932681,
         2.44666971,  1.4259954 ],
       [-0.84488505,  0.57017181, -2.18274749, ..., -0.54655402,
         0.55003518, -0.95646168],
       ...,
       [ 1.82781311, -0.67906949,  1.12611029, ...,  1.91316648,
         2.00573238,  0.40494237],
       [-1.14185152,  0.63592135, -0.03198993, ...,  1.44742059,
        -0.8059981 , -0.36084741],
       [-1.14185152,  0.10992502,  1.95332473, ..., -1.44893669,
        -0.63385134, -1.04154944]])

In [18]:
X_test

array([[ 0.63994726, -0.7776938 , -1.19009016, ...,  0.22483762,
        -0.1264714 ,  0.83038113],
       [-0.54791859, -0.31744701,  0.2161744 , ...,  0.47226512,
        -0.97814487, -1.04154944],
       [-0.54791859, -0.44894609, -0.69376149, ..., -0.24090828,
        -0.94794368, -1.04154944],
       ...,
       [ 1.23388019, -0.87631812, -0.03198993, ...,  0.63236527,
         0.03963513,  2.02160968],
       [-0.54791859,  0.80029521, -0.19743282, ..., -0.64843593,
        -0.40734244, -0.36084741],
       [ 1.23388019, -1.56668831, -0.19743282, ...,  0.41404689,
         0.70406123,  0.49003012]])

In [19]:
y_train

60     0
618    1
346    0
294    0
231    1
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 614, dtype: int64

In [20]:
y_test

668    0
324    0
624    0
690    0
473    0
      ..
355    1
534    0
344    0
296    1
462    0
Name: Outcome, Length: 154, dtype: int64

In [21]:
# Step 8: Build Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

In [23]:
# Step 9: Evaluate the Model
y_pred = model.predict(X_test)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
[[82 17]
 [21 34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154

Accuracy Score: 0.7532467532467533


In [24]:
# Step 10: Hyperparameter Tuning using GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}
grid_model = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_model.fit(X_train, y_train)
print("\nBest Parameters from Grid Search:", grid_model.best_params_)


Best Parameters from Grid Search: {'C': 1, 'solver': 'lbfgs'}


In [25]:
# Re-evaluate best model
best_model = grid_model.best_estimator_
y_pred_best = best_model.predict(X_test)
print("\nEvaluation after tuning:")
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("Tuned Accuracy Score:", accuracy_score(y_test, y_pred_best))


Evaluation after tuning:
[[82 17]
 [21 34]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154

Tuned Accuracy Score: 0.7532467532467533


In [26]:
# Step 11: Save the Model and Scaler
joblib.dump(best_model, "logistic_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("\nModel and Scaler saved successfully!")


Model and Scaler saved successfully!


In [27]:
# Step 12: Load and Predict on New Data
# Simulating a new data point (replace values as needed)
new_data = pd.DataFrame({
    "Pregnancies": [2],
    "Glucose": [120],
    "BloodPressure": [70],
    "SkinThickness": [20],
    "Insulin": [79],
    "BMI": [25.0],
    "DiabetesPedigreeFunction": [0.3],
    "Age": [32]
})

In [28]:
new_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2,120,70,20,79,25.0,0.3,32


In [29]:
# Load saved model and scaler
loaded_model = joblib.load("logistic_model.pkl")
loaded_scaler = joblib.load("scaler.pkl")

# Preprocess new data
new_data_scaled = loaded_scaler.transform(new_data)
prediction = loaded_model.predict(new_data_scaled)

print("\nPrediction for new data (1 = Diabetic, 0 = Not Diabetic):", prediction[0])


Prediction for new data (1 = Diabetic, 0 = Not Diabetic): 0
