In [1]:
import pandas as pd

In [2]:
import pandas as pd
diabetes = pd.read_csv('diabetes2.csv')

## Data Cleaning

In [3]:
diabetes.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
diabetes.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
unreal_data_percentage = {}
total_rows = len(diabetes.index)

for column in diabetes.columns:
    number_of_zeros = diabetes[column].isin([0]).sum()
    unreal_data_percentage[column] = (number_of_zeros / total_rows) * 100

for column, percentage in unreal_data_percentage.items():
    print(f"Percentage of missing data for {column}: {round(percentage, 2)}%")

Percentage of missing data for Pregnancies: 14.45%
Percentage of missing data for Glucose: 0.65%
Percentage of missing data for BloodPressure: 4.56%
Percentage of missing data for SkinThickness: 29.56%
Percentage of missing data for Insulin: 48.7%
Percentage of missing data for BMI: 1.43%
Percentage of missing data for DiabetesPedigreeFunction: 0.0%
Percentage of missing data for Age: 0.0%
Percentage of missing data for Outcome: 65.1%


In [6]:
diabetes_clean = diabetes.copy()

columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for column in columns:
    diabetes_clean[column] = diabetes_clean[column].replace(0, diabetes[column].mean())

In [7]:
diabetes_clean['Pregnancies'].values[diabetes_clean['Pregnancies'] > 0] = 1

In [8]:
diabetes_clean.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.855469,121.681605,72.254807,26.606479,118.660163,32.450805,0.471876,33.240885,0.348958
std,0.351857,30.436016,12.115932,9.631241,93.080358,6.875374,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,20.536458,79.799479,27.5,0.24375,24.0,0.0
50%,1.0,117.0,72.0,23.0,79.799479,32.0,0.3725,29.0,0.0
75%,1.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,1.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
diabetes_clean.to_csv('cleaned_data.csv')

In [10]:
diabetes_clean['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [11]:
diabetes_clean['Pregnancies'].value_counts()

Pregnancies
1    657
0    111
Name: count, dtype: int64

### Logistic Regression

In [12]:
import numpy as np

columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']

for column in columns:
    print(column + "\n")
    res = diabetes_clean.groupby("Outcome").agg({
        column: [np.mean, np.std, np.size]
    })

    for i in [0, 1]:

        std = res.loc[i, (column, 'std')]
        size = res.loc[i, (column, 'size')]
        mean = res.loc[i, (column, 'mean')]

        standard_error = std / np.sqrt(size)

        lcb = mean - 1.96 * standard_error
        ucb = mean + 1.96 * standard_error
        label = "Diabetes" if i == 1 else "Non-Diabetes"
        
        print(label)
        print(round(lcb, 3), round(ucb, 3))
        
    print("------")

Glucose

Non-Diabetes
108.539 112.872
Diabetes
138.622 145.697
------
BloodPressure

Non-Diabetes
69.764 71.856
Diabetes
73.512 76.389
------
SkinThickness

Non-Diabetes
24.582 26.165
Diabetes
27.675 30.14
------
Insulin

Non-Diabetes
99.571 113.344
Diabetes
128.012 154.842
------
BMI

Non-Diabetes
30.31 31.45
Diabetes
34.591 36.171
------
DiabetesPedigreeFunction

Non-Diabetes
0.404 0.456
Diabetes
0.506 0.595
------


## Initial Logistic regression model

In [13]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for i in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = sigmoid(linear_model)
            dw = (1 / num_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / num_samples) * np.sum(y_pred - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = sigmoid(linear_model)
        y_pred_class = [1 if i > 0.5 else 0 for i in y_pred]
        return y_pred_class

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

y = diabetes['Outcome']
X = diabetes.drop('Outcome', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

model = LogisticRegression(learning_rate=0.1, num_iterations=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = np.sum(y_pred == y_test) / len(y_test)
print('Accuracy:', accuracy)

Accuracy: 0.8246753246753247
