In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Read the raw survey table from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer=r'/kaggle/input/personal-key-indicators-of-heart-disease/2020/heart_2020_cleaned.csv'
)

In [None]:
# Schema first, then the preview. `df.head()` has to be the LAST expression of
# the cell: a bare expression in the middle of a cell is evaluated and thrown
# away, so the preview was never rendered before.
df.info()
print("df columns : ", len(df.columns))
df.head()

In [None]:
# Columns that are not used as model features.
columns_remove = [
    'SleepTime',
    'Asthma',
    'KidneyDisease',
    'MentalHealth',
    'SkinCancer',
    'Race',
]

# `drop` returns a new frame, so the raw `df` stays untouched.
df_cleaned = df.drop(columns_remove, axis=1)
df_cleaned.head()

In [None]:
# Column dtypes and non-null counts of the frame after the unused columns were dropped.
df_cleaned.info() 

In [None]:
# Distinct raw values of the categorical columns, one line per column.
# Each entry is (label printed verbatim, column name).
for label, column in [
    ("Smoking : ", 'Smoking'),
    ("AlcoholDrinking   : ", 'AlcoholDrinking'),
    ("Stroke : ", 'Stroke'),
    ("DiffWalking : ", 'DiffWalking'),
    ("PhysicalActivity : ", 'PhysicalActivity'),
    ("AgeCategory : ", 'AgeCategory'),
    ("Diabetic : ", 'Diabetic'),
]:
    print(label, df_cleaned[column].unique())

In [None]:
# Same report driven by dtype: list the distinct values of every column that
# is still stored with the 'object' dtype.
object_columns = (name for name in df_cleaned.columns
                  if df_cleaned[name].dtype == 'object')
for col in object_columns:
    print("Column Name : " ,col, " value : ",df_cleaned[col].unique())

In [None]:
# Now convert the object (string) columns to numeric codes, starting with the target
# Encode the target column: 'Yes' -> 1, 'No' -> 0.
# NOTE(review): Series.map turns any value missing from the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe the column — re-create df_cleaned first if you need to re-run it.
heart_disease_codes = {'Yes': 1, 'No': 0}
df_cleaned['HeartDisease'] = df_cleaned['HeartDisease'].map(heart_disease_codes)
print("HeartDisease : ", df_cleaned['HeartDisease'].unique())

In [None]:
# Apply the same Yes/No -> 1/0 encoding to every remaining binary column.
yes_no_codes = {'Yes': 1, 'No': 0}
for column in ('Smoking', 'AlcoholDrinking', 'Stroke',
               'DiffWalking', 'PhysicalActivity'):
    df_cleaned[column] = df_cleaned[column].map(yes_no_codes)

# Sex uses its own pair of labels.
df_cleaned['Sex'] = df_cleaned['Sex'].map({'Male': 1, 'Female' : 0})

In [None]:
# Integer code for each of the four answers found in the Diabetic column.
diabetic_mapping = {
    'No': 0,
    'Yes': 1,
    'No, borderline diabetes': 2,
    'Yes (during pregnancy)': 3,
}
diabetic_codes = df_cleaned['Diabetic'].map(diabetic_mapping)
df_cleaned['Diabetic'] = diabetic_codes

# Sanity check: only the codes above should remain.
print("Diabetic after mapping: ", df_cleaned['Diabetic'].unique())

In [None]:
# Integer code for each self-reported health level.
# NOTE(review): the codes are not monotone in health quality
# (Poor=0, Excellent=1, Very good=2, Good=3, Fair=4), which a linear model
# cannot exploit as an ordered scale. They are kept unchanged here because the
# hand-typed sample rows later in this notebook use these same codes.
genHealth_mapping = {
    'Poor': 0,
    'Excellent': 1,
    'Very good': 2,
    'Good': 3,
    'Fair': 4,
}
gen_health_codes = df_cleaned['GenHealth'].map(genHealth_mapping)
df_cleaned['GenHealth'] = gen_health_codes

# Sanity check: only the codes above should remain.
print("GenHealth after mapping: ", df_cleaned['GenHealth'].unique())

In [None]:
# Age brackets in ascending order; each bracket's code is its 1-based position
# ('18-24' -> 1, ..., '80 or older' -> 13).
age_brackets = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older',
]
age_mapping = {bracket: code for code, bracket in enumerate(age_brackets, start=1)}
df_cleaned['AgeCategory'] = df_cleaned['AgeCategory'].map(age_mapping)

# Sanity check: only the codes 1..13 should remain.
print("AgeCategory after mapping: ", df_cleaned['AgeCategory'].unique())

In [None]:
# Every categorical column should now hold numeric codes only.
# Each entry is (label printed verbatim, column name).
for label, column in [
    ("Smoking : ", 'Smoking'),
    ("AlcoholDrinking   : ", 'AlcoholDrinking'),
    ("Stroke : ", 'Stroke'),
    ("DiffWalking : ", 'DiffWalking'),
    ("PhysicalActivity : ", 'PhysicalActivity'),
    ("AgeCategory : ", 'AgeCategory'),
    ("Diabetic : ", 'Diabetic'),
]:
    print(label, df_cleaned[column].unique())

In [None]:
# Final dtype check of the encoded frame, plus the target values and the size.
df_cleaned.info()
print("HeartDisease : ", pd.unique(df_cleaned['HeartDisease']))
print(df_cleaned.shape)

In [None]:
# Feature matrix: every column except the target, as a plain NumPy array.
x = np.array(df_cleaned.drop('HeartDisease', axis=1))
# Target vector: the 0/1 HeartDisease codes.
y = np.array(df_cleaned.HeartDisease)

print(x.shape)
print("y shape : ", y.shape)

print("HeartDisease : ", pd.unique(df_cleaned['HeartDisease']))

In [None]:
# Remove outliers with the IQR rule:
#   IQR = q3 - q1, lower = q1 - 1.5 * IQR, upper = q3 + 1.5 * IQR
#
# The fences are computed for the continuous (float) columns only. On a 0/1
# column where fewer than 25% of the rows are 1, q1 == q3 == 0, so IQR == 0 and
# every 1 is flagged as an outlier. Applied to all columns, the rule can
# therefore discard every row of a minority class, including the positive
# HeartDisease rows.
# Assumes the continuous measurements were read as float while the mapped
# categorical columns are integer-coded — TODO confirm with df_cleaned.info().
continuous = df_cleaned.select_dtypes(include='float')

q1 = continuous.quantile(0.25)
q3 = continuous.quantile(0.75)

IQR = q3 - q1

lower = q1 - 1.5 * IQR
upper = q3 + 1.5 * IQR

# A row is dropped when any of its continuous values lies outside its fences.
is_outlier = ((continuous < lower) | (continuous > upper)).any(axis=1)
df_perfect = df_cleaned[~is_outlier]

# NOTE(review): x and y were built from df_cleaned in an earlier cell, so the
# models below are still trained on the unfiltered rows.
print("Size of data before removing outlier : ", df_cleaned.shape)
print("Size of data after removing outlier : ", df_perfect.shape)

In [None]:
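# NOTE: not every feature is 0/1 (AgeCategory is coded 1-13 and the sample rows
# further down contain values such as 28.87), so scaling the features would help
# gradient descent. The hand-written model below is trained on the unscaled values.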
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)).

    Args:
        z: a scalar or a NumPy array; arrays are processed element-wise.

    Returns:
        A value (or array of the same shape as ``z``) between 0 and 1.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
def compute_cost(w, x, y, b):
    """Mean logistic (cross-entropy) loss of the model sigmoid(x @ w + b).

    Per-sample loss: -y * log(f) - (1 - y) * log(1 - f), f = sigmoid(w . x + b).

    Args:
        w: weights, one per feature (list or 1-D array of length n).
        x: feature matrix of shape (m, n), one row per sample.
        y: 0/1 targets of length m.
        b: scalar bias.

    Returns:
        The loss averaged over all m samples.
    """
    w = np.asarray(w, dtype=float)
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    z = x @ w + b

    # log(f) = -log(1 + e^-z) and log(1 - f) = -log(1 + e^z). logaddexp
    # evaluates both without overflow and never takes log(0), so a saturated
    # prediction gives a large finite loss instead of inf/nan.
    per_sample = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)

    # Average over the samples (rows of x), not over the number of weights.
    return per_sample.mean()

In [None]:
def regularization(lamda, w, x , y, b):
    """L2-regularised logistic cost.

    total = compute_cost(w, x, y, b) + lamda / (2 * m) * sum(w_j ** 2)

    Args:
        lamda: regularisation strength.
        w: weights, one per feature (list or 1-D array).
        x: feature matrix with one row per sample.
        y: 0/1 targets, one per row of ``x``.
        b: scalar bias (not penalised).

    Returns:
        The unregularised cost plus the L2 penalty.
    """
    # m is the number of samples; the number of weights only determines how
    # many squared terms are summed, not the scaling of the penalty.
    m = np.shape(x)[0]
    squared_norm = np.sum(np.asarray(w, dtype=float) ** 2)
    reg = lamda / (2 * m) * squared_norm

    return compute_cost(w, x, y, b) + reg

In [None]:
# Gradient-descent update rule: w = w - alpha * dw
#                               b = b - alpha * db

def compute_gradient(w, x ,y, b):
    """Gradient of the (unregularised) logistic cost with respect to w and b.

    Args:
        w: weights, one per feature (list or 1-D array of length n).
        x: feature matrix of shape (m, n), one row per sample.
        y: 0/1 targets of length m.
        b: scalar bias.

    Returns:
        (dw_b, dj_b): the gradient with respect to ``w`` (array of length n)
        and with respect to ``b`` (scalar), both averaged over the m samples.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    m = x.shape[0]

    # Prediction error of every sample at once; this replaces the
    # sample-by-sample / feature-by-feature Python loops.
    residual = sigmoid(x @ np.asarray(w, dtype=float) + b) - y

    dw_b = x.T @ residual / m
    dj_b = residual.sum() / m

    return dw_b, dj_b

In [None]:
import math
def gradient_descent(alpha , w, x, y, b, iterations, lamda=0.4, log_every=10):
    """Batch gradient descent for the logistic model.

    Args:
        alpha: learning rate.
        w: initial weights (list or 1-D array); the caller's object is not modified.
        x: feature matrix, one row per sample.
        y: 0/1 targets, one per row of ``x``.
        b: initial scalar bias.
        iterations: number of update steps.
        lamda: regularisation strength used for the reported cost
            (default 0.4, the value that used to be hard-coded).
        log_every: the cost is evaluated, printed and recorded on every
            ``log_every``-th iteration (positive int, default 10).

    Returns:
        (w, b, J): final weights, final bias and the list of recorded costs
        (one entry per logged iteration, not one per iteration).
    """
    # Work on a private float copy so an ndarray passed by the caller is not
    # changed in place by the updates below.
    w = np.array(w, dtype=float)

    J = []
    prev_cost = regularization(lamda, w, x, y, b)
    print("prev_cost", prev_cost)
    for i in range(iterations):
        # NOTE(review): compute_gradient returns the unregularised gradient,
        # while the cost reported below includes the L2 penalty.
        dw, db = compute_gradient(w, x, y, b)
        w = w - alpha * dw
        b = b - alpha * db
        # The cost needs a full pass over the data, so it is only evaluated on
        # the iterations that are actually logged.
        if i % log_every == 0:
            curr_cost = regularization(lamda, w, x, y, b)
            print("curr : ", curr_cost)
            J.append(curr_cost)
            print(f"Iteration {i:2}: Cost {(J[-1]):8.4f}")
    return w, b, J

# One starting weight per feature column of x. Sizing the vector from x
# (instead of a hand-counted list of eleven 0.5s) keeps this cell working when
# the set of dropped or encoded columns changes.
initial_w = np.full(x.shape[1], 0.5)
initial_b = 0.32
alpha = 0.3

# A copy is passed so that initial_w still holds the starting point afterwards,
# even if gradient_descent updates its weight argument in place.
w, b, cost = gradient_descent(alpha, initial_w.copy(), x, y, initial_b, 200)

print("w found by gradient descent:", w)
print("b found : ", b)

# the code block above takes a couple of minutes to run, 
# especially with a non-vectorized version. You can reduce the iterations to test your implementation and iterate faster. 

In [None]:
# Learning curve. `cost` holds one value per logged step of gradient_descent
# (every 10th iteration), so the x positions are log steps.
plt.plot(cost)
plt.xlabel('Iterations')
plt.ylabel('cost')
# `plt.show` without parentheses only names the function; it has to be called.
plt.show()

In [None]:
# Names of the model inputs: every column label except the target.
feature_names = df_perfect.columns.drop('HeartDisease')
print(feature_names)

In [None]:
# Columns of the filtered frame and the distinct target values.
print(df_perfect.columns)
print(pd.unique(df_cleaned['HeartDisease']))

In [None]:
def predict(w, x, b):
    """Predict the 0/1 class of every row of ``x``.

    Args:
        w: weights, one per feature (list or 1-D array of length n).
        x: feature matrix of shape (m, n), one row per sample.
        b: scalar bias.

    Returns:
        Float array of length m holding 1.0 where sigmoid(w . x + b) >= 0.5
        and 0.0 otherwise.
    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)

    # One matrix-vector product replaces the per-sample / per-feature loops.
    probabilities = sigmoid(x @ w + b)

    return (probabilities >= 0.5).astype(float)

In [None]:
def compute_accuracy(w, x, y, b):
    """Percentage of rows of ``x`` whose predicted class equals ``y``.

    Args:
        w: model weights, passed through to ``predict``.
        x: feature matrix, one row per sample.
        y: true 0/1 labels, one per row of ``x``.
        b: scalar bias.

    Returns:
        Accuracy as a percentage.
    """
    predictions = predict(w, x, b)
    print(f'Output of predict: shape {predictions.shape}, value {predictions}')

    # Fraction of matching labels, scaled to a percentage.
    hit_rate = np.mean(predictions == y)
    return hit_rate * 100

# Accuracy of the hand-written model, measured on the same x / y arrays that
# were passed to gradient_descent (i.e. training accuracy, not a held-out score).
print("w : ", w)
accuracy = compute_accuracy(w, x, y, b)
print(f"Training Accuracy: {accuracy:.2f}%")

In [None]:
# Rows whose target is 1; the first three are shown as sample positive cases.
has_heart_disease = df_cleaned['HeartDisease'].eq(1)
df_heart_disease = df_cleaned.loc[has_heart_disease]
df_heart_disease.head(3)

In [None]:
# Score one hand-typed sample (11 feature values, same order as the columns of x)
# with the weights learned above.
new_data = np.array([28.87, 1, 0, 0, 6.0, 1, 0, 12, 0, 0, 4])

# Linear score first, then squashed into a probability.
prediction = np.dot(w, new_data) + b
result = sigmoid(prediction)
print(result)

# Class 1 when the probability reaches 0.5, otherwise class 0.
disease = int(result >= 0.5)
print(f"Prediction Probability: {result:.9f}")
print(disease)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reference model: the same features and target, fitted with scikit-learn.
x = np.array(df_cleaned.drop('HeartDisease', axis=1))
y = np.array(df_cleaned.HeartDisease)

# Standardise the feature matrix; the fitted scaler is reused for new samples.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Build and fit the logistic-regression model on the scaled data
# (raise max_iter if the solver reports that it did not converge).
model = LogisticRegression(max_iter=100)
model.fit(x_scaled, y)

# Score the model on the rows it was fitted on (training accuracy).
y_pred = model.predict(x_scaled)
accuracy = accuracy_score(y, y_pred)

# Classify one hand-typed sample; it goes through the same scaler as the training data.
new_data_scaled = scaler.transform([[20.34, 0, 0, 1, 0.0, 0, 0, 13, 0, 1, 2]])
prediction = model.predict(new_data_scaled)

print("Predicted class:", prediction)
print(f"Training Accuracy with scikit-learn: {accuracy * 100:.2f}%")
