<H1>Heart Disease Prediction for Early Detection</H1>

<H2>Data Loading and Environment Setup</H2>

Dataset link: https://www.kaggle.com/datasets/neurocipher/heartdisease

In [None]:
import pandas as pd
import os
import numpy as np
import kagglehub
import seaborn as sns
import matplotlib.pyplot as plt 



In [None]:
import kagglehub

# Download the latest version of the "neurocipher/heartdisease" dataset via
# kagglehub and keep the returned value in `path`.
# NOTE(review): the returned value is presumably the local directory holding
# the downloaded files - confirm against the kagglehub documentation.
path = kagglehub.dataset_download("neurocipher/heartdisease")


In [None]:
import os

# Load the CSV from the directory returned by kagglehub.dataset_download() in
# the previous cell. The earlier version overwrote `path` with a hard-coded,
# user-specific cache directory, which only worked on the author's machine.
print(os.listdir(path))  # list the downloaded files to confirm the CSV name
csv_file = os.path.join(path, 'Heart_Disease_Prediction.csv')  # adjust filename if different
df = pd.read_csv(csv_file)




<H2>EDA</H2>

In [None]:
df.shape  # (number of rows, number of columns) of the loaded frame

In [None]:
df.sample(5, random_state=42)  # seeded random preview so a re-run shows the same 5 rows

In [None]:
df.info()  # Summary of the frame: columns, dtypes and non-null counts

In [None]:
df.isnull().sum()  # Number of missing values in each column

In [None]:
df.describe()  # Descriptive statistics for the numeric columns

In [None]:
df.duplicated().sum()  # Count of rows that are exact duplicates of an earlier row

In [None]:
# Encode the categorical target as integers for analysis: Presence -> 1, Absence -> 0.
# - Values that are already 0/1 pass through unchanged, so re-running this cell is safe.
# - astype(int) guarantees an integer target instead of relying on the silent
#   downcasting of Series.replace (deprecated in recent pandas), and raises
#   immediately if an unexpected label or a missing value is present.
heart_disease_codes = {"Presence": 1, "Absence": 0}
df["Heart Disease"] = (
    df["Heart Disease"]
    .map(lambda label: heart_disease_codes.get(label, label))
    .astype(int)
)

In [None]:
df  # Display the frame to check the encoded "Heart Disease" column

<H2>Univariate analysis</H2>

In [None]:
# Share of people with / without heart disease.
# autopct uses "%.2f%%" (two decimals plus a percent sign); the previous "%2f"
# was a width specifier and printed six decimal places on every wedge.
df['Heart Disease'].value_counts().plot(kind='pie', autopct="%.2f%%")

In [None]:
# The pie chart above shows that the target classes are not imbalanced.

In [None]:
# Box plots for outlier detection: one panel per continuous feature.
# A single loop over explicit axes replaces four copy-pasted subplot/boxplot
# pairs; each panel gets a title so the figure can be read on its own.
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, column in zip(axes, ['BP', 'Age', 'Max HR', 'Cholesterol']):
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'{column} box plot')
plt.tight_layout()
plt.show()  # also suppresses the stray Axes repr under the figure


In [None]:
# The box plots show outliers that need to be dealt with (handled later with RobustScaler in the model pipeline).




<H2>Multivariate Analysis</H2>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Features to explore; each one gets its own three-panel figure.
features = [
    'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
    'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
    'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Age',
]

title_style = dict(fontsize=14, fontweight='bold')
class_palette = ['#2ecc71', '#e74c3c']  # one colour per "Heart Disease" class

for feature in features:
    fig, (ax_overall, ax_by_class, ax_by_age) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: overall distribution of the feature
    ax_overall.hist(df[feature], bins=20, color='black', edgecolor='black', alpha=0.5)
    ax_overall.set_title(f'{feature} Distribution', **title_style)
    ax_overall.set_xlabel(feature)
    ax_overall.set_ylabel('Frequency')

    # Panel 2: distribution split by target class, with a KDE overlay
    sns.histplot(data=df, x=feature, hue='Heart Disease',
                 palette=class_palette, alpha=0.6, kde=True, ax=ax_by_class)
    ax_by_class.set_title(f'{feature} vs Heart Disease', **title_style)

    # Panel 3: feature against age, coloured by target class
    sns.scatterplot(data=df, x='Age', y=feature, hue='Heart Disease',
                    palette=class_palette, alpha=0.8, ax=ax_by_age)
    ax_by_age.set_title(f'Age vs {feature}', **title_style)

    fig.tight_layout()
    plt.show()

In [None]:
# Important features identified: chest pain type, BP, cholesterol, max HR, exercise angina, ischemia (ST depression and slope of ST), number of vessels fluro, thallium, age


<H2> Feature Engineering </H2>

In [None]:
def ischemia_risk(row):
    """Classify one record into an ischemia risk level.

    Engineered feature that combines "ST depression" and "Slope of ST".

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "ST depression"; "Slope of ST" is only read when
        "ST depression" is at least 2.0.

    Returns
    -------
    str
        "high"     when ST depression >= 2.0 and Slope of ST == 3,
        "moderate" when ST depression >= 1.0 otherwise,
        "low"      in every other case.
    """
    depression = row["ST depression"]
    if depression >= 2.0 and row["Slope of ST"] == 3:
        return "high"
    return "moderate" if depression >= 1.0 else "low"

# Apply ischemia_risk to every row (axis=1) and store the label in a new column.
df["ischemia_risk"] = df.apply(ischemia_risk, axis=1)


In [None]:
# One-hot encode ischemia_risk into ischemia_risk_<level> columns.
# dtype=int makes get_dummies emit 0/1 integer columns directly, which replaces
# the separate per-column astype(int) casts.
df = pd.get_dummies(df, columns=["ischemia_risk"], drop_first=False, dtype=int)


In [None]:
# Drop "EKG results" and "FBS over 120" (judged less important) together with
# the "ischemia_risk_high" dummy column.
# NOTE(review): ischemia_risk_high is presumably dropped to act as the reference
# level for the remaining low/moderate dummies - confirm.
df = df.drop(["ischemia_risk_high", "EKG results", "FBS over 120"], axis=1)

In [None]:
# Drop the two raw columns that were combined into the ischemia_risk feature.
df = df.drop(["ST depression", "Slope of ST"], axis=1)

In [None]:
#train-test split before scaling

from sklearn.model_selection import train_test_split 

# Hold out 20% of the rows for testing (shuffled, fixed seed for reproducibility).
# NOTE(review): the split is not stratified on the target; consider
# stratify=df["Heart Disease"] if class proportions should match across splits.
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Separate the predictors (X) from the target (y) in each partition.
X_train, y_train = df_train.drop(columns=["Heart Disease"]), df_train["Heart Disease"]
X_test, y_test = df_test.drop(columns=["Heart Disease"]), df_test["Heart Disease"]


In [None]:
X_test.sample(5, random_state=42)  # seeded random preview so a re-run shows the same 5 rows


<H2>Model Implementation</H2>


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression         #Binary classifier problem so choosing Logistic Regression
from sklearn.compose import ColumnTransformer

# Columns flagged as containing outliers get robust scaling; every other
# predictor gets standard scaling.
outlier_cols = ["Cholesterol", "BP", "Max HR"]
normal_cols = [col for col in X_train.columns if col not in outlier_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("robust", RobustScaler(), outlier_cols),
        ("standard", StandardScaler(), normal_cols),
    ]
)

# Elastic-net logistic regression (author's rationale: the dataset is less than
# 50k samples, but not extremely small). The "saga" solver is used with the
# elastic-net penalty; l1_ratio sets the L1/L2 mix.
reg = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "model",
            LogisticRegression(
                penalty="elasticnet",
                solver="saga",
                l1_ratio=0.7,
                C=10,
                max_iter=5000,
                random_state=42,
            ),
        ),
    ]
)

# Train: scalers and classifier are fitted on the training split only.
prd = reg.fit(X_train, y_train)

# Predict: hard labels plus the probability of the positive class (column 1).
y_pred_lr = reg.predict(X_test)
y_prob_lr = reg.predict_proba(X_test)[:, 1]



In [None]:
# Inspect what the fitted scalers did to the training features.
# NOTE(review): RobustScaler rescales values rather than removing rows, so
# outliers are presumably still present after scaling - confirm with the plot.
# Get fitted preprocessor
preprocessor_fitted = reg.named_steps["preprocess"]

# Transform training data
X_train_transformed = preprocessor_fitted.transform(X_train)

# Column order after ColumnTransformer: the "robust" columns first, then the
# "standard" columns, i.e. the order in which the transformers were listed
# when the preprocessor was built.
robust_features = outlier_cols
standard_features = normal_cols
all_features = robust_features + standard_features

# Rebuild a labelled DataFrame, reusing X_train's index so rows can be matched
# back to the unscaled data.
X_train_scaled = pd.DataFrame(
    X_train_transformed,
    columns=all_features,
    index=X_train.index
)


In [None]:
# Side-by-side horizontal box plots of one feature before and after scaling.
feature = "Cholesterol"

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

panels = [
    ("Before Scaling", X_train),
    ("After Robust Scaling", X_train_scaled),
]
for ax, (label, frame) in zip(axes, panels):
    ax.boxplot(frame[feature], vert=False)
    ax.set_title(f"{label}: {feature}")

plt.show()


<H2>Performance Metrics</H2>

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score

def evaluate_model(name, y_true, y_pred, y_prob):
    """Print an evaluation summary for a binary classifier.

    Parameters
    ----------
    name : str
        Heading printed above the report (underlined with dashes).
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like
        Scores passed to roc_auc_score (the caller in this notebook passes the
        predicted probability of the positive class).

    Prints the confusion matrix, the classification report and the ROC-AUC.
    Returns None.
    """
    underline = "-" * len(name)
    print(f"\n{name}\n{underline}")
    print("Confusion Matrix:", confusion_matrix(y_true, y_pred), sep="\n")
    print("\nClassification Report:", classification_report(y_true, y_pred), sep="\n")
    print("ROC-AUC:", roc_auc_score(y_true, y_prob))


In [None]:
# Full report for the logistic-regression pipeline, then the headline metrics.
evaluate_model("Logistic Regression", y_test, y_pred_lr, y_prob_lr)

accuracy = accuracy_score(y_test, y_pred_lr)
recall = recall_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall:{recall:.4f}")


In [None]:
# Confusion matrix heatmap.
# The counts are computed from the actual test-set predictions instead of being
# typed in by hand, so the figure always matches the model that was just
# trained. labels=[0, 1] pins the row/column order to the tick labels below.
cmlr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])

labels = ['No Disease (0)', 'Disease (1)']

# Plotting: rows are the actual classes, columns are the predicted classes
plt.figure(figsize=(6,5))
sns.heatmap(cmlr, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

In [None]:
# Export (serialize) the fitted pipeline `reg` to reg.pkl for deployment.
# NOTE(review): pickle files should only be loaded from trusted sources.
import pickle
with open("reg.pkl","wb") as f:
    pickle.dump(reg, f)

