In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the heart-disease dataset; the relative path is resolved against the
# notebook's current working directory.
raw_df=pd.read_csv('heart_disease.csv')

In [None]:
# Quick structural overview: column names, non-null counts and dtypes.
raw_df.info()

In [None]:
# Summary statistics of the raw frame (displayed as the cell output).
raw_df.describe()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
# Global plotting defaults: seaborn dark grid first, then matplotlib overrides
# for font size, default figure size and a fully transparent figure background
# (8-digit hex colour with zero alpha).
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

In [None]:
# Encode the target variable as 0/1: 'Yes' -> 1, every other value -> 0.
# The guard makes the cell safe to re-run: without it, a second execution would
# compare the already-encoded integers against 'Yes' and silently set every
# label to 0.
if not pd.api.types.is_integer_dtype(raw_df['Heart Disease Status']):
    raw_df['Heart Disease Status'] = (raw_df['Heart Disease Status'] == 'Yes').astype(int)

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: hold out 20% of the rows for testing, then
# take 25% of the remaining 80% for validation (60/20/20 overall).
train_val_df, test_dataset = train_test_split(
    raw_df,
    test_size=0.2,
    random_state=42,
)
training_dataset, validation_dataset = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Name of the label column, and the feature columns chosen by position:
# everything except the first and the last column of the frame.
# NOTE(review): this assumes the target is the last column and that the first
# column is meant to be excluded from the features - confirm against the CSV.
target_column = 'Heart Disease Status'
input_column = training_dataset.columns[1:-1].tolist()

In [None]:
# Separate features and label for each split. Explicit copies are taken so the
# column assignments made by the preprocessing cells operate on independent
# frames.
train_input, train_target = (
    training_dataset[input_column].copy(),
    training_dataset[target_column].copy(),
)

val_input, val_target = (
    validation_dataset[input_column].copy(),
    validation_dataset[target_column].copy(),
)

test_input, test_target = (
    test_dataset[input_column].copy(),
    test_dataset[target_column].copy(),
)

In [None]:
# Partition the feature columns by dtype: numeric columns are imputed/scaled,
# object (string) columns are imputed/one-hot encoded in the cells below.
numercal_cols = list(train_input.select_dtypes(include=np.number).columns)
categorical_cols = list(train_input.select_dtypes(include='object').columns)

In [None]:
# Report how the target classes are distributed in the training split.
class_counts = train_target.value_counts()
class_shares = train_target.value_counts(normalize=True)

print("="*70)
print("TARGET VARIABLE DISTRIBUTION")
print("="*70)
print(class_counts)
print("\nClass Distribution (%):")
print(class_shares * 100)
print("\n")

# value_counts() orders classes by frequency, not by value, so display names
# and colours are looked up per class value instead of being assumed by
# position (a positional ['No', 'Yes'] would mislabel the slices whenever
# class 1 is the majority).
class_names = {0: 'No', 1: 'Yes'}
class_colors = {0: '#3498db', 1: '#e74c3c'}
fallback_color = '#95a5a6'  # only used if an unexpected class value shows up

bar_colors = [class_colors.get(cls, fallback_color) for cls in class_counts.index]
pie_colors = [class_colors.get(cls, fallback_color) for cls in class_shares.index]
pie_labels = [class_names.get(cls, str(cls)) for cls in class_shares.index]

# Visualize target distribution: counts as bars (left), shares as a pie (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
class_counts.plot(kind='bar', ax=axes[0], color=bar_colors)
axes[0].set_title('Heart Disease Status Distribution (Count)')
axes[0].set_xlabel('Heart Disease Status')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

class_shares.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=pie_colors, labels=pie_labels)
axes[1].set_title('Heart Disease Status Distribution (%)')
axes[1].set_ylabel('')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every numeric feature with the target, ranked by magnitude so
# that strong negative relationships are not buried at the bottom.
correlations = {
    feature: train_input[feature].corr(train_target)
    for feature in numercal_cols
}

corr_df = pd.DataFrame(list(correlations.items()), columns=['Feature', 'Correlation'])
corr_df = (
    corr_df
    .assign(Abs_Correlation=corr_df['Correlation'].abs())
    .sort_values('Abs_Correlation', ascending=False)
)

print(corr_df[['Feature', 'Correlation']])
print("\n")

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing numeric values with the column mean.
# The imputer is fitted on the training split only: fitting on the full raw
# frame would let validation/test rows influence the fill values (data leakage)
# and make the held-out scores optimistic.
numeric_imputer = SimpleImputer(strategy='mean')
numeric_imputer.fit(train_input[numercal_cols])

# `imputer` is the name the rest of the notebook uses; a later cell rebinds it
# to the categorical imputer, so the numeric one is also kept under its own name.
imputer = numeric_imputer

train_input[numercal_cols] = numeric_imputer.transform(train_input[numercal_cols])
val_input[numercal_cols] = numeric_imputer.transform(val_input[numercal_cols])
test_input[numercal_cols] = numeric_imputer.transform(test_input[numercal_cols])

In [None]:
# Fill missing categorical values with the most frequent category (the mode).
# Note: this rebinds `imputer`, so from here on the name refers to the
# categorical imputer.
# Fitted on the training split only so that validation/test rows cannot
# influence which category is used as the fill value (data leakage).
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(train_input[categorical_cols])

train_input[categorical_cols] = imputer.transform(train_input[categorical_cols])
val_input[categorical_cols] = imputer.transform(val_input[categorical_cols])
test_input[categorical_cols] = imputer.transform(test_input[categorical_cols])

In [None]:
# Feature scaling: min-max scale the numeric columns.
from sklearn.preprocessing import MinMaxScaler

# The min/max are learned from the training split only; fitting on the full raw
# frame would leak the range of the validation/test rows into preprocessing.
# As a consequence, held-out values may fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(train_input[numercal_cols])

train_input[numercal_cols] = scaler.transform(train_input[numercal_cols])
val_input[numercal_cols] = scaler.transform(val_input[numercal_cols])
test_input[numercal_cols] = scaler.transform(test_input[numercal_cols])

In [None]:
# Sanity check after imputation and scaling: summary statistics of the
# training features (displayed as the cell output).
train_input.describe()

In [None]:
# Encoding categorical data: one-hot encode every categorical column.
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the imputed training split, not on the raw frame: the raw frame
# includes validation/test rows (data leakage) and has not been imputed, so any
# missing value in it would be learned as an extra category that never occurs
# in the imputed inputs.
encoder.fit(train_input[categorical_cols])

# Generated one-hot column names, in the encoder's output order.
encoded_cols = encoder.get_feature_names_out(categorical_cols)

train_input[encoded_cols] = encoder.transform(train_input[categorical_cols])
val_input[encoded_cols] = encoder.transform(val_input[categorical_cols])
test_input[encoded_cols] = encoder.transform(test_input[categorical_cols])

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the scaled numeric columns followed by the
# one-hot columns; the raw categorical columns are left out of the design matrix.
model = LogisticRegression(solver='liblinear')

model.fit(
    train_input[[*numercal_cols, *encoded_cols]],
    train_target,
)

In [None]:
# Inspect the fitted parameters: the learned feature weights, then the intercept.
print(model.coef_)
print(model.intercept_)

In [None]:
# Predict on all three splits using exactly the feature columns (and column
# order) the model was trained on.
encoded_feature_names = numercal_cols + list(encoded_cols)
training_predictions, validation_predictions, test_predictions = (
    model.predict(split_input[encoded_feature_names])
    for split_input in (train_input, val_input, test_input)
)

In [None]:
# Accuracy of the predictions, printed in the order: train, validation, test.
from sklearn.metrics import accuracy_score

for split_target, split_predictions in (
    (train_target, training_predictions),
    (val_target, validation_predictions),
    (test_target, test_predictions),
):
    print(accuracy_score(split_target, split_predictions))

In [None]:
import joblib

# Bundle the fitted model together with the preprocessing objects and the
# column metadata, and persist everything as a single joblib file.
# Note: `imputer` is whatever the name is bound to when this cell runs. In this
# notebook it was last assigned to the 'most_frequent' imputer for the
# categorical columns, so the mean imputer for numeric columns is not part of
# the bundle.
Heart_Disease_Predictor = dict(
    model=model,
    imputer=imputer,
    scaler=scaler,
    encoder=encoder,
    input_cols=input_column,
    target_col=target_column,
    numeric_cols=numercal_cols,
    categorical_cols=categorical_cols,
    encoded_cols=encoded_cols,
)
joblib.dump(Heart_Disease_Predictor, 'Heart_Disease_Predictor.joblib')

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Score a single hand-written record with the fitted preprocessing objects
# and the trained model.
# ---------------------------------------------------------------------------
data = { 
    'Gender': 'Male',
    'Blood Pressure': 0.466667,
    'Cholesterol Level': 0.933333,
    'Exercise Habits': 'High',
    'Smoking': 'Yes',
    'Family Heart Disease': 'No',
    'Diabetes': 'Yes',
    'BMI': 0.877067,
    'High Blood Pressure': 'Yes',
    'Low HDL Cholesterol': 'No',
    'High LDL Cholesterol': 'No',
    'Alcohol Consumption': 'Low',
    'Stress Level': 'Medium',
    'Sleep Hours': 0.153273,
    'Sugar Consumption': 'Low',
    'Triglyceride Level': 0.22,
    'Fasting Blood Sugar': 0.575,
    'CRP Level': 0.85884,
    'Homocysteine Level': 0.835992
}
new_input_df = pd.DataFrame([data])

# Take the column lists from the fitted transformers instead of re-typing them:
# a hand-maintained list can drift from (or be ordered differently than) the
# columns the transformer was fitted on, which makes transform() fail.
numerical_cols = list(scaler.feature_names_in_)
imputer_cols = list(imputer.feature_names_in_)
encoder_input_cols = list(encoder.feature_names_in_)

# Columns the imputer expects but the record does not provide are added as
# missing so that the imputer fills them in.
for col in imputer_cols:
    if col not in new_input_df.columns:
        new_input_df[col] = np.nan

new_input_df[imputer_cols] = imputer.transform(new_input_df[imputer_cols])

# NOTE(review): the numeric values in `data` all lie in [0, 1] (e.g. a
# 'Blood Pressure' of 0.47), i.e. they look like they have already been
# min-max scaled. Passing them through the scaler a second time would push
# them far outside the range the model was trained on, so scaling is skipped
# for them. Set the flag to False when feeding values in original units -
# confirm which one is intended.
NUMERIC_VALUES_ALREADY_SCALED = True
if not NUMERIC_VALUES_ALREADY_SCALED:
    new_input_df[numerical_cols] = scaler.transform(new_input_df[numerical_cols])

# One-hot encode with the fitted encoder (rather than pd.get_dummies) so the
# record goes through exactly the same encoding as the training data: every
# trained category column is produced, and unseen categories become all-zero.
encoded_part = pd.DataFrame(
    encoder.transform(new_input_df[encoder_input_cols]),
    columns=encoder.get_feature_names_out(),
    index=new_input_df.index,
)
new_input_encoded = pd.concat([new_input_df[numerical_cols], encoded_part], axis=1)

# Same columns, in the same order, as used when fitting the model; anything
# still absent is filled with 0.
new_input_encoded = new_input_encoded.reindex(columns=encoded_feature_names, fill_value=0)

prediction = model.predict(new_input_encoded)
print("Predicted output:", prediction)

# Training accuracy, recomputed here for reference next to the prediction.
training_predictions = model.predict(train_input[encoded_feature_names])
accuracy = accuracy_score(train_target, training_predictions)
print("Training Accuracy:", accuracy)