In [1]:
# import the necessary modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Silence every warning for the rest of the session: this installs a blanket
# "ignore" filter, so any warning raised by the cells below is not displayed.
# NOTE(review): a blanket filter can hide useful diagnostics; consider
# restricting it to specific warning categories — confirm intent.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the diabetes dataset from a CSV file (path is relative to the current
# working directory) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# Several measurement columns report a minimum of 0, which motivates the
# zero-handling step further down.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Column dtypes and non-null counts, to check for explicitly missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Treat zeros in the measurement columns as missing values and replace them
# with the column mean.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split further down, so test rows contribute to the imputed
# values. Consider imputing after the split using training-set means.
features_to_replace_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']

# Work on a copy so the raw `data` frame stays untouched.
df = data.copy()

# Zeros become NaN first so they are excluded from the column means
# (DataFrame.mean skips NaN by default).
df[features_to_replace_zeros] = df[features_to_replace_zeros].replace(0, np.nan)

# fillna with a Series fills each column with the value stored under its name.
df[features_to_replace_zeros] = df[features_to_replace_zeros].fillna(
    df[features_to_replace_zeros].mean()
)

In [7]:
# Separate the target column from the four predictor columns used by the model.
labels = df.loc[:, "Outcome"]
features = df.loc[:, ["Pregnancies", "Glucose", "BMI", "Age"]]

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
# NOTE(review): X_train / X_test are overwritten with their scaled versions,
# so re-running this cell without re-running the split would fit the scaler
# on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Train a logistic-regression classifier with default settings on the
# training split (fit returns the estimator itself), then predict labels
# for the test split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [11]:
# Evaluate on the test split: mean accuracy first, then the per-class
# precision / recall / f1 report.
print(lr.score(X_test, y_test))
print(metrics.classification_report(y_test, y_pred))

# Map each feature name to its fitted coefficient, rounded to 3 decimals.
# lr.coef_[0] holds the coefficients in the same order as `columns`.
columns = ["Pregnancies", "Glucose", "BMI", "Age"]
coefficients = {
    name: float(weight.round(3))
    for name, weight in zip(columns, lr.coef_[0])
}


0.8051948051948052
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       107
           1       0.73      0.57      0.64        47

    accuracy                           0.81       154
   macro avg       0.78      0.74      0.75       154
weighted avg       0.80      0.81      0.80       154



In [12]:
# test for subject1
# Values are given in the order of the model's features
# (Pregnancies, Glucose, BMI, Age). The one-row frame reuses the column
# names of `features`, which the scaler was fitted with, so that
# scaler.transform() receives matching feature names instead of the
# integer labels 0..3.
subject1 = pd.DataFrame([[0, 115, 26.5, 48]], columns=features.columns)
subject1 = scaler.transform(subject1)

# Class probabilities (column order follows lr.classes_) and the predicted label.
sub1_pred = lr.predict_proba(subject1)
print(sub1_pred)
print(lr.predict(subject1))

[[0.84256344 0.15743656]]
[0]


In [13]:
# test for subject2
# Values are given in the order of the model's features
# (Pregnancies, Glucose, BMI, Age). The one-row frame reuses the column
# names of `features`, which the scaler was fitted with, so that
# scaler.transform() receives matching feature names instead of the
# integer labels 0..3.
subject2 = pd.DataFrame([[3, 98, 21.5, 49]], columns=features.columns)
subject2 = scaler.transform(subject2)

# Class probabilities (column order follows lr.classes_) and the predicted label.
sub2_pred = lr.predict_proba(subject2)
print(sub2_pred)
print(lr.predict(subject2))

[[0.91938598 0.08061402]]
[0]
