In [None]:
# Heart Disease Risk Prediction

: 

## Objective
### The goal of this project is to build a machine learning model that predicts the likelihood of heart disease based on patient health metrics.

### Importing required libraries for the Machine Learning Workflow

In [None]:
import  pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Loading the Dataset

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first rows.
df = pd.read_csv('heart_data.csv')
df.head()

In [None]:
# Preview the last rows of the dataset (tail() defaults to five rows).
df.tail()

### Checking the number of rows and columns in the dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for the columns of df.
df.describe()

### Checking for Missing or Zero Values

In [None]:
# Count both kinds of suspicious entries per column: NaN (missing) and exact
# zeros. A comparison with 0 alone never flags NaN, so missing values would
# otherwise go unreported.
pd.DataFrame({
    'missing': df.isna().sum(),
    'zeros': df.eq(0).sum(),
})

### Exploring Data Types and Target Distribution

In [None]:
# Show the dtype of every column in df.
print(df.dtypes)

In [None]:
# Number of rows per value of the label column.
target_counts = df['target'].value_counts()
print(target_counts)

### Visualizing how categorical features relate to heart disease. For example, the sex count plot shows the number of males and females with and without heart disease, which helps identify patterns and features that are important for prediction.

In [None]:
# Bars are grouped by sex and split by the target label.
ax = sns.countplot(data=df, x='sex', hue='target', palette='coolwarm')
ax.set_title("Sex vs Heart Disease")
# NOTE(review): the tick labels assume sex is coded 0 = female, 1 = male — confirm against the dataset description.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Female', 'Male'])
plt.show()

### Correlation Heatmap
#### Shows how features are correlated with each other and with the target.

#### Helps in understanding which features may be more important for prediction.

In [None]:
# Pairwise correlations of all columns in df, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Features")
plt.show()

In [None]:
# One histogram per column; the trailing semicolon keeps the returned array of
# Axes objects from being echoed as text above the figure.
df.hist(bins=50, figsize=(20, 15));

In [None]:
# Per-column count of entries equal to 0.0.
df.eq(0.0).sum()

### Handling Missing/Zero Values
#### Replaces 0 values in cholesterol with the median of non-zero cholesterol values.

#### This avoids biasing the model with invalid zeros.

In [None]:
# Median over the strictly positive cholesterol readings only, so the zeros
# do not pull the replacement value down.
positive_cholesterol = df.loc[df['cholesterol'] > 0, 'cholesterol']
median_cholesterol = positive_cholesterol.median()
print(median_cholesterol)


In [None]:
# Replace every 0 in the cholesterol column with median_cholesterol.
# The column is reassigned on df itself, so the original zeros are not kept.
df['cholesterol'] = df['cholesterol'].replace(0, median_cholesterol)

Counting how many values in each column are equal to 0.0

In [None]:
# Per-column count of entries equal to 0.0.
df.eq(0.0).sum()

In [None]:
# Pairwise correlations of all columns in df, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Features")
plt.show()

### Dropping Less Useful Features, i.e., negatively correlated columns
#### Keeps the dataset clean and reduces noise.

In [None]:
# Columns removed before modelling; df itself is left unchanged.
dropped_features = [
    'resting bp s',
    'cholesterol',
    'fasting blood sugar',
    'resting ecg',
    'max heart rate',
]
df1 = df.drop(columns=dropped_features)


In [None]:
# Preview the reduced frame instead of echoing every row into the output.
df1.head()

In [None]:
# Pairwise correlations of the columns kept in df1, annotated to two decimals.
corr_matrix_reduced = df1.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_reduced, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Features")
plt.show()

### Checking Data Distribution
#### Prints the distribution of values for each column.
#### Helps understand categorical feature balance.

In [None]:
# Walk over (column name, column values) pairs and print each column's value frequencies.
for column_name, column_values in df1.items():
    print(f"\nColumn: {column_name}")
    print(column_values.value_counts())

In [None]:
# Show the dtype of every column in df.
# NOTE(review): the surrounding cells work on df1; confirm that df (not df1) is the frame meant here.
print(df.dtypes)

### Train-Test Split
#### Splits the data into training set (80%) and testing set (20%)

In [None]:
from sklearn.model_selection import train_test_split

# Label column on one side, every other remaining column on the other.
y = df1['target']
X = df1.drop(columns='target')

# 20% of the rows are held out for testing. Stratifying on y keeps the class
# ratio the same in both parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=45,
)

train_label_counts = y_train.value_counts()
test_label_counts = y_test.value_counts()
print("Train target distribution:\n", train_label_counts)
print("Test target distribution:\n", test_label_counts)

In [None]:
# Relative frequency of each value, printed for the train split and then the test split.
columns_to_compare = ['sex', 'chest pain type', 'exercise angina', 'ST slope', 'oldpeak']
for col in columns_to_compare:
    train_share = X_train[col].value_counts(normalize=True)
    test_share = X_test[col].value_counts(normalize=True)
    print(f"\n{col} - Train:")
    print(train_share)
    print(f"{col} - Test:")
    print(test_share)

### Handling Negative Values
#### Replaces negative values in oldpeak with 0.

#### Negative values are invalid in this context.

In [None]:
# Raise negative oldpeak readings to 0 in all three feature frames.
# assign() returns a new frame rather than writing a column into the objects
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning;
# clip(lower=0) is the vectorised form of "0 if x < 0 else x".
X = X.assign(oldpeak=X['oldpeak'].clip(lower=0))
X_train = X_train.assign(oldpeak=X_train['oldpeak'].clip(lower=0))
X_test = X_test.assign(oldpeak=X_test['oldpeak'].clip(lower=0))

In [None]:
# Inspect the cleaned column. The negative-value replacement is applied to
# X / X_train / X_test, not to df1, so df1['oldpeak'] would still show the raw values.
print(X['oldpeak'])

In [None]:
# Relative frequency of each oldpeak value, train split first and then test split.
columns_to_compare = ['oldpeak']
for col in columns_to_compare:
    train_share = X_train[col].value_counts(normalize=True)
    test_share = X_test[col].value_counts(normalize=True)
    print(f"\n{col} - Train:")
    print(train_share)
    print(f"{col} - Test:")
    print(test_share)

In [None]:
# Re-attach the labels to copies of the feature frames; assign() copies the
# frame and appends 'target' as the last column, aligned on the row index.
df1_train = X_train.assign(target=y_train)
df1_test = X_test.assign(target=y_test)

### Comparing Feature Distribution in Train vs Test
#### Plots bar charts for selected categorical features (sex, chest pain type, exercise angina, ST slope) to verify that train and test distributions are similar.

In [None]:
cols = ['sex', 'chest pain type', 'exercise angina', 'ST slope']

for col in cols:
    # Share of each category in the train and test splits, aligned on the category value.
    train_counts = df1_train[col].value_counts(normalize=True).sort_index()
    test_counts = df1_test[col].value_counts(normalize=True).sort_index()
    df_plot = pd.DataFrame({'Train': train_counts, 'Test': test_counts})

    # Draw on an explicit Axes. Calling plt.figure() and then DataFrame.plot()
    # without ax= opens a second figure and leaves the first one empty.
    fig, ax = plt.subplots(figsize=(8, 4))
    df_plot.plot(kind='bar', ax=ax)
    ax.set_title(f'Train vs Test Distribution: {col}')
    ax.set_ylabel('Proportion')
    ax.set_xlabel(col)
    plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Standardise the features, then fit a logistic regression with default settings.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])
pipeline.fit(X_train, y_train)

# Class predictions on the held-out split and their accuracy.
y_pred = pipeline.predict(X_test)
logistic_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", logistic_accuracy)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Scaler followed by a decision tree with a fixed seed.
pipe_dt = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeClassifier(random_state=45))
])

# Fit on the full feature frame, like every other model in the comparison.
# Selecting X_train[cols] tied this cell to the column list left behind by the
# train-vs-test plotting loop, so the tree was trained on a subset of the features.
pipe_dt.fit(X_train, y_train)

# Predict on the held-out split
y_pred_dt = pipe_dt.predict(X_test)

# Evaluate
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Scaler plus a 900-tree random forest with a fixed seed.
forest = RandomForestClassifier(n_estimators=900, random_state=45)
pipe_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', forest),
])
pipe_rf.fit(X_train, y_train)

# Held-out predictions, accuracy, and the per-class report.
y_pred_rf = pipe_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)

print("\nClassification Report:\n", rf_report)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

# Scaler plus ordinary linear regression, fitted directly on the 0/1 target.
pipe_lr_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])
pipe_lr_model.fit(X_train, y_train)

# Continuous outputs on the held-out split and the regression metrics.
y_pred_lr = pipe_lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)
print("MSE:", lr_mse)
print("R2 Score:", lr_r2)

# Threshold the continuous output at 0.5 to get class labels for comparison
# with the classifiers.
y_pred_lr_bin = (y_pred_lr >= 0.5).astype(int)
lr_binary_accuracy = accuracy_score(y_test, y_pred_lr_bin)
print("Linear Regression Accuracy (binary):", lr_binary_accuracy)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Scaler plus a 5-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=5)
pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', knn),
])
pipe_knn.fit(X_train, y_train)

# Held-out predictions and their accuracy.
y_pred_knn = pipe_knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters collected in one place: number of trees, learning rate,
# depth of each tree, and the seed.
gb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.10,
    'max_depth': 4,
    'random_state': 45,
}
gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, y_train)

# Held-out predictions, accuracy, and the per-class report.
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("\nClassification Report:\n", gb_report)


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Model label -> predictions on the held-out split.
models_preds = {
    'Logistic Regression': y_pred,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Linear Regression (binary)': y_pred_lr_bin,
    'K-Nearest Neighbors': y_pred_knn,
    'Gradient Boosting': y_pred_gb
}

# Column name -> scoring function; every score is rounded to three decimals.
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-score': f1_score,
}

# One record per model: its name followed by the four rounded scores.
metrics_list = [
    {
        'Model': model_name,
        **{metric: round(scorer(y_test, preds), 3) for metric, scorer in scorers.items()},
    }
    for model_name, preds in models_preds.items()
]

# Best accuracy first, with a fresh 0..n-1 index.
metrics_df = (
    pd.DataFrame(metrics_list)
    .sort_values(by='Accuracy', ascending=False)
    .reset_index(drop=True)
)

print(metrics_df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score

# --- 1. Accuracy Table and Bar Plot ---
accuracy_dict = {
    'Logistic Regression': accuracy_score(y_test, y_pred),
    'Decision Tree': accuracy_score(y_test, y_pred_dt),
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'Linear Regression (binary)': accuracy_score(y_test, y_pred_lr_bin),
    'K-Nearest Neighbors': accuracy_score(y_test, y_pred_knn),
    'Gradient Boosting': accuracy_score(y_test, y_pred_gb)
}

accuracy_table = pd.DataFrame(list(accuracy_dict.items()), columns=['Model', 'Accuracy']).sort_values(by='Accuracy', ascending=False)
print(accuracy_table)

# Horizontal bars, best model on top (the table is already sorted).
plt.figure(figsize=(8,5))
sns.barplot(x='Accuracy', y='Model', data=accuracy_table, palette='viridis')
plt.title('Model Accuracy Comparison')
plt.xlim(0,1)
plt.show()

# --- 2. ROC Curves ---
# Fitted estimators that are expected to expose predict_proba.
models_proba = {
    'Logistic Regression': pipeline,
    'Random Forest': pipe_rf,
    'Gradient Boosting': gb_model,
    'K-Nearest Neighbors': pipe_knn
}

plt.figure(figsize=(8,6))
for name, model in models_proba.items():
    # Skip only the case the message describes. Any other failure (wrong
    # columns, unfitted model, ...) is a real error and is allowed to surface
    # instead of being hidden by a catch-all except.
    if not hasattr(model, 'predict_proba'):
        print(f'{name} skipped (no predict_proba)')
        continue
    y_proba = model.predict_proba(X_test)[:, 1]  # second probability column
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC={roc_auc:.2f})')


plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()
plt.grid(True)
plt.show()



In [None]:
from sklearn.metrics import confusion_matrix

# Function to plot confusion matrix
def plot_conf_matrix(y_true, y_pred, model_name):
    """Draw an annotated confusion-matrix heatmap for one model.

    Parameters
    ----------
    y_true : array-like
        Actual labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    model_name : str
        Name appended to the figure title.
    """
    # Pin the label order to [0, 1] so the matrix is always 2x2 and its
    # rows/columns line up with the fixed tick labels below, even when one
    # class is absent from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['No Disease', 'Disease'],
                yticklabels=['No Disease', 'Disease'])
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# (predictions, title) pairs, plotted in this order against the same y_test.
confusion_inputs = [
    (y_pred, "Logistic Regression"),
    (y_pred_dt, "Decision Tree"),
    (y_pred_rf, "Random Forest"),
    (y_pred_lr_bin, "Linear Regression (Binary)"),
    (y_pred_knn, "KNN"),
    (y_pred_gb, "Gradient Boosting"),
]
for predictions, title in confusion_inputs:
    plot_conf_matrix(y_test, predictions, title)


In [None]:
import matplotlib.pyplot as plt


x_feature = 'age'
y_feature = 'chest pain type'

# (target value, marker colour, legend label) for each class, drawn in this order.
class_styles = [
    (0, 'blue', 'No (0)'),
    (1, 'red', 'Yes (1)'),
]

plt.figure(figsize=(8,6))
for target_value, colour, legend_label in class_styles:
    class_rows = df[df['target'] == target_value]
    plt.scatter(class_rows[x_feature], class_rows[y_feature],
                color=colour, label=legend_label, alpha=0.6, edgecolor='k')

plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.title(f"Scatter Plot of {x_feature} vs {y_feature}")
plt.legend()
plt.show()



In [None]:
# Print the column names, non-null counts, and dtypes of the training features.
X_train.info()

In [None]:
# ============================================
# Export the trained model and reference values
# ============================================

import joblib
import pandas as pd
import numpy as np
import json

# This cell relies on objects created by earlier cells:
# - df: the heart disease dataframe
# - gb_model: the fitted Gradient Boosting model
# - X_train: the feature frame gb_model was fitted on

# ============================================
# 1. SAVE THE TRAINED MODEL
# ============================================

joblib.dump(gb_model, "heart_model.pkl")
print("✅ Model saved as 'heart_model.pkl'")

# ============================================
# 2. SAVE HEALTHY AVERAGE VALUES
# ============================================

# Mean of each prediction feature over the rows with target == 0.
features_needed = ['age', 'sex', 'chest pain type', 'exercise angina', 'oldpeak', 'ST slope']

healthy_avg = df[df['target'] == 0][features_needed].mean()

# Convert to dictionary format
healthy_avg_dict = healthy_avg.to_dict()

# Saved as a one-element JSON list holding the feature -> mean mapping.
with open('healthy_avg.json', 'w') as f:
    json.dump([healthy_avg_dict], f, indent=4)

print("✅ Healthy averages saved as 'healthy_avg.json'")
print("\nHealthy Average Values:")
print(healthy_avg_dict)

# ============================================
# 3. VERIFY THE SAVED FILES
# ============================================

# Round-trip the model through disk.
loaded_model = joblib.load("heart_model.pkl")
print("\n✅ Model loaded successfully")

# The model was fitted on a DataFrame, so the sample is built as a one-row
# DataFrame keyed by feature name and laid out in the training column order.
# A bare positional array would silently depend on that order and makes
# scikit-learn warn about missing feature names.
training_columns = list(X_train.columns)
if set(training_columns) != set(features_needed):
    raise ValueError(
        f"features_needed {features_needed} does not match the training columns {training_columns}"
    )
sample_values = dict(zip(features_needed, [55, 1, 3, 1, 2.5, 2]))  # Example values
sample_data = pd.DataFrame([sample_values], columns=training_columns)
prediction = loaded_model.predict(sample_data)
prediction_proba = loaded_model.predict_proba(sample_data)

print(f"\n🧪 Test Prediction:")
print(f"   Prediction: {prediction[0]}")
print(f"   Probability: {prediction_proba[0]}")

# Pair each importance with the training column it belongs to, rather than
# with a hand-written list that could drift out of order.
if hasattr(loaded_model, 'feature_importances_'):
    print("\n✅ Feature importances available")
    for name, importance in zip(training_columns, loaded_model.feature_importances_):
        print(f"   {name}: {importance:.4f}")
else:
    print("\n⚠️ Model doesn't have feature_importances_")

# ============================================
# 4. VERIFY JSON FILE
# ============================================

with open('healthy_avg.json', 'r') as f:
    loaded_avg = json.load(f)
print("\n✅ Healthy averages loaded successfully:")
print(loaded_avg)

print("\n" + "="*50)
print("📦 FILES READY FOR DEPLOYMENT:")
print("="*50)
print("1. heart_model.pkl - Place in: backend/model/")
print("2. healthy_avg.json - Place in: backend/model/")
print("\nYou can now move these files to your Django backend!")


In [None]:
import joblib
import pandas as pd
import json

# Persist the fitted gradient-boosting model created by an earlier cell.
joblib.dump(gb_model, "heart_model.pkl")

# Mean of each listed feature over the rows of df with target == 0,
# written out as a one-element JSON list.
features_needed = ['age', 'sex', 'chest pain type', 'exercise angina', 'oldpeak', 'ST slope']
healthy_rows = df.loc[df['target'] == 0, features_needed]
healthy_avg = healthy_rows.mean()
healthy_payload = [healthy_avg.to_dict()]
with open('healthy_avg.json', 'w') as f:
    json.dump(healthy_payload, f, indent=4)

print("✅ Files created: heart_model.pkl and healthy_avg.json")

In [None]:
import joblib
import sklearn

# Report the scikit-learn version installed in this kernel.
print("scikit-learn version:", sklearn.__version__)

# Write gb_model (fitted in an earlier cell) to heart_model.pkl again,
# overwriting any existing file of that name.
joblib.dump(gb_model, "heart_model.pkl")
print("✅ Model re-saved with current scikit-learn version")

In [None]:
import sklearn
# Report the scikit-learn version visible to this notebook's kernel.
print('Jupyter scikit-learn:', sklearn.__version__)


In [None]:
# Libraries used by this cell
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import joblib
import json

# Report the scikit-learn version in use
import sklearn
print('scikit-learn version:', sklearn.__version__)

# Optional retraining template (left disabled). As written, this cell reuses
# the gb_model and df objects created by earlier cells.
# df = pd.read_csv('your_heart_disease_data.csv')
# X = df[['age', 'sex', 'chest pain type', 'exercise angina', 'oldpeak', 'ST slope']]
# y = df['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# gb_model = GradientBoostingClassifier(random_state=42)
# gb_model.fit(X_train, y_train)

# Persist the model under the file name model.pkl
joblib.dump(gb_model, "model.pkl")
print("✅ Model saved!")

# Mean of each listed feature over the rows of df with target == 0,
# written out as a one-element JSON list.
features_needed = ['age', 'sex', 'chest pain type', 'exercise angina', 'oldpeak', 'ST slope']
healthy_rows = df.loc[df['target'] == 0, features_needed]
healthy_avg = healthy_rows.mean()

healthy_payload = [healthy_avg.to_dict()]
with open('healthy_avg.json', 'w') as f:
    json.dump(healthy_payload, f, indent=4)
print("✅ Healthy averages saved!")

In [None]:
# NumPy is imported under the alias np throughout this notebook; the bare name
# `numpy` is never bound, so referencing it raises NameError.
print('numpy version:', np.__version__)

In [None]:
!pip list