In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('diabetes.csv')
df.head()



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
df.shape

(768, 9)

In [9]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# building the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [13]:
y_pred = model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7532467532467533
Confusion Matrix:
 [[79 20]
 [18 37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [18]:

y_pred_train = model.predict(X_train_scaled)
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))

Training Accuracy: 0.7703583061889251


In [22]:
diff = np.abs(accuracy_score(y_test, y_pred) - accuracy_score(y_train, y_pred_train))
diff

0.017111552942171815

In [27]:
X_new_data = np.array(
    [11,800,70,30,0,31.3,0.200, 24]
)

X_new_data_scaled = scaler.transform([X_new_data])
y_new_pred = model.predict(X_new_data_scaled)
y_new_pred[0]



1

In [28]:

# Example new data points (each as a numpy array)
X_new_data_1 = np.array([2, 120, 70, 25, 80, 28.5, 0.400, 29])
X_new_data_2 = np.array([4, 150, 85, 30, 130, 33.0, 0.700, 42])
X_new_data_3 = np.array([0, 100, 60, 20, 0, 25.0, 0.200, 22])

# Assuming you have 'scaler' and 'model' already loaded and fitted:

# Scale the new data points
X_new_data_1_scaled = scaler.transform([X_new_data_1])
X_new_data_2_scaled = scaler.transform([X_new_data_2])
X_new_data_3_scaled = scaler.transform([X_new_data_3])

# Predict
y_pred_1 = model.predict(X_new_data_1_scaled)
y_pred_2 = model.predict(X_new_data_2_scaled)
y_pred_3 = model.predict(X_new_data_3_scaled)

print("Prediction for new data point 1:", y_pred_1[0])
print("Prediction for new data point 2:", y_pred_2[0])
print("Prediction for new data point 3:", y_pred_3[0])


Prediction for new data point 1: 0
Prediction for new data point 2: 1
Prediction for new data point 3: 0




In [None]:
# Handling multicollinearity in the dataset
# drop column one by one if vif value is greater than 5

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Now handling multicollinearity 
def calculate_vif(X):
    x_columns = X.columns
    for i in range (0, x_columns.shape[0]):
        y = x_columns[i]
        x_others = x_columns.drop(x_columns[i])
        linear = LinearRegression()
        linear.fit(X[x_others], X[y])
        rsq = r2_score(X[y], linear.predict(X[x_others]))
        try:
            vif = round(1 / (1 - rsq), 2)
        except ZeroDivisionError:
            vif = float("inf")
        print(y, "VIF :", vif)

calculate_vif(X_train)

Note: you may need to restart the kernel to use updated packages.
Pregnancies VIF : 1.45
Glucose VIF : 1.33
BloodPressure VIF : 1.16
SkinThickness VIF : 1.53
Insulin VIF : 1.45
BMI VIF : 1.28
DiabetesPedigreeFunction VIF : 1.06
Age VIF : 1.62


In [45]:
#impact of features using p-value z-test
import statsmodels.api as sm

# Adding a constant to the model (intercept)
X_train_sm = sm.add_constant(X_train_scaled)

# Fit the model
model_sm = sm.Logit(y_train, X_train)
results = model_sm.fit()

# Print the summary
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.615161
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      606
Method:                           MLE   Df Model:                            7
Date:                Sun, 08 Jun 2025   Pseudo R-squ.:                 0.04702
Time:                        22:16:03   Log-Likelihood:                -377.71
converged:                       True   LL-Null:                       -396.34
Covariance Type:            nonrobust   LLR p-value:                 4.172e-06
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Pregnancies                  0.0947      0.032      2.972      0.003       0.032

In [40]:
X_train_new = X_train.drop(columns=['SkinThickness', 'Insulin', 'DiabetesPedigreeFunction'], axis=1)
X_train_new.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,Age
60,2,84,0,0.0,21
618,9,112,82,28.2,50
346,1,139,46,28.7,22
294,0,161,50,21.9,65
231,6,134,80,46.2,46


In [43]:
from sklearn.preprocessing import StandardScaler

# Fit a new scaler on the reduced feature set
scaler_new = StandardScaler()
X_train_scaled_new = scaler_new.fit_transform(X_train_new)

# Refit the model with the new training data
model = LogisticRegression()
model.fit(X_train_scaled_new, y_train)

In [44]:
y_pred_hehe = model.predict(scaler_new.transform(X_test.drop(columns=['SkinThickness', 'Insulin', 'DiabetesPedigreeFunction'])))
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("New Accuracy:", accuracy_score(y_test, y_pred_hehe))
print("New Confusion Matrix:\n", confusion_matrix(y_test, y_pred_hehe))

New Accuracy: 0.7142857142857143
New Confusion Matrix:
 [[75 24]
 [20 35]]
