# Data Cleaning

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# load the dataset
data = pd.read_csv('diabetes_prediction_dataset.csv')

In [4]:
print(data)

       gender   age  hypertension  heart_disease smoking_history    bmi  \
0      Female  80.0             0              1           never  25.19   
1      Female  54.0             0              0         No Info  27.32   
2        Male  28.0             0              0           never  27.32   
3      Female  36.0             0              0         current  23.45   
4        Male  76.0             1              1         current  20.14   
...       ...   ...           ...            ...             ...    ...   
99995  Female  80.0             0              0         No Info  27.32   
99996  Female   2.0             0              0         No Info  17.37   
99997    Male  66.0             0              0          former  27.83   
99998  Female  24.0             0              0           never  35.42   
99999  Female  57.0             0              0         current  22.43   

       HbA1c_level  blood_glucose_level  diabetes  
0              6.6                  140        

In [5]:
df= data.drop(['gender','hypertension','heart_disease','smoking_history','HbA1c_level', 'blood_glucose_level'], axis=1)

In [6]:
print(df)

        age    bmi  diabetes
0      80.0  25.19         0
1      54.0  27.32         0
2      28.0  27.32         0
3      36.0  23.45         0
4      76.0  20.14         0
...     ...    ...       ...
99995  80.0  27.32         0
99996   2.0  17.37         0
99997  66.0  27.83         0
99998  24.0  35.42         0
99999  57.0  22.43         0

[100000 rows x 3 columns]


In [7]:
df.drop_duplicates (inplace = True)
print(df.shape)

(58170, 3)


In [8]:
data_set= df.dropna()

In [9]:
print(data_set)

        age    bmi  diabetes
0      80.0  25.19         0
1      54.0  27.32         0
2      28.0  27.32         0
3      36.0  23.45         0
4      76.0  20.14         0
...     ...    ...       ...
99993  40.0  40.69         0
99994  36.0  24.60         0
99997  66.0  27.83         0
99998  24.0  35.42         0
99999  57.0  22.43         0

[58170 rows x 3 columns]


In [10]:
Data=data_set.to_numpy()

In [11]:
Data

array([[80.  , 25.19,  0.  ],
       [54.  , 27.32,  0.  ],
       [28.  , 27.32,  0.  ],
       ...,
       [66.  , 27.83,  0.  ],
       [24.  , 35.42,  0.  ],
       [57.  , 22.43,  0.  ]])

# Logistic Regression

In [12]:
# seperate them into X and y
X = np.c_[Data[:,:-1]]
y = np.c_[Data[:,-1]]

In [13]:
print (X)

[[80.   25.19]
 [54.   27.32]
 [28.   27.32]
 ...
 [66.   27.83]
 [24.   35.42]
 [57.   22.43]]


In [14]:
print(y)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [15]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [16]:
def costFunction(X, y, theta=None):
    m = y.size
    if theta is None:
        theta = np.zeros((X.shape[1] + 1, 1))

    X_extend = np.c_[np.ones((m, 1)), X]   # add bias
    h = sigmoid(X_extend.dot(theta))

    cost = -1/m * np.sum(
        y * np.log(h + 1e-9) + (1 - y) * np.log(1 - h + 1e-9)
    )
    return cost

In [17]:
costFunction(X,y)

0.6931471785599453

In [42]:
def gradientDescent(X, y, theta=None, epochs=1000, learning_rate=0.01):
    m = y.size
    y = y.reshape(m, 1)

    if theta is None:
        theta = np.zeros((X.shape[1] + 1, 1))

    X_extend = np.c_[np.ones((m, 1)), X]

    cost_history = []
    
    for i in range(epochs):
        h = sigmoid(X_extend.dot(theta))
        gradient = (1/m) * X_extend.T.dot(h - y)

        theta = theta - learning_rate * gradient

        cost = -(1/m) * np.sum(
            y*np.log(h + 1e-9) + (1-y)*np.log(1-h + 1e-9)
        )
        cost_history.append(cost)

    return theta, cost_history

In [26]:
print(theta)

[[-0.1858152 ]
 [ 0.02007143]
 [-0.09063777]]


In [23]:
def predict_class(user_input, theta):
    # user_input must be (1,2) for age,bmi
    if user_input.ndim == 1:
        user_input = user_input.reshape(1, -1)

    # add bias
    user_input = np.c_[np.ones((user_input.shape[0], 1)), user_input]

    prediction = sigmoid(user_input.dot(theta))

    return int(prediction >= 0.5)

In [29]:
X = (X - X.mean(axis=0)) / X.std(axis=0)


In [30]:
# Suppose your X contains age and bmi
# And y contains diabetes column (0 or 1)

theta, costs = gradientDescent(X, y, epochs=5000, learning_rate=0.001)

mon_input = np.array([30, 30])  # age=30, bmi=30
résultat = predict_class(mon_input, theta)

print("Prédiction :", résultat)

Prédiction : 1


  return int(prediction >= 0.5)
