## ⚙️ **Heart Disease Prediction using Logistic Regression and KNN**
A complete end-to-end ML pipeline built for Google Colab.

This notebook demonstrates data exploration, visualization, model building, and evaluation for predicting heart disease.

In [None]:

# ‚úÖ Install Required Libraries
!pip install pandas matplotlib seaborn scikit-learn

# ‚úÖ Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as s

# ‚úÖ Visualization style setup
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

print("‚úÖ Libraries successfully imported!")


In [None]:

# Read the heart-disease CSV directly from GitHub into a DataFrame.
# NOTE(review): "Herat" in the repository path looks like a typo but is kept verbatim;
# confirm against the actual repository before changing it.
DATA_URL = "https://raw.githubusercontent.com/krishnaik06/Herat-Disease-UCI-Dataset/master/heart.csv"
df = pd.read_csv(DATA_URL)

# The first rows are shown as the cell's output
df.head()


In [None]:

# Column names, dtypes and non-null counts (printed by pandas itself)
df.info()

# Dimensions of the frame and its total number of cells
print(f"Shape: {df.shape}")
print(f"Size: {df.size}")

# Descriptive statistics are shown as the cell's output
df.describe()


In [None]:

# Data type and missing-value check

# How many columns share each dtype.
# The "\n" escape keeps the label on a single source line; a raw line break inside
# the quotes is an unterminated string literal and stops the cell from running.
print("Data Types:\n", df.dtypes.value_counts())

# Null count per column, computed once and reused for the percentage below
missing_counts = df.isnull().sum()
print("\nMissing Values per Column:\n", missing_counts)
print("\nPercentage of Missing Values:\n", (missing_counts / len(df)) * 100)


In [None]:

# Pairwise correlation between the columns of the dataset
corr_matrix = df.corr()

# Rendering options for the heatmap: annotate each cell with its value at two decimals
heatmap_options = {"annot": True, "cmap": "YlGnBu", "fmt": ".2f"}

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:

# Mean of the 'age' column, computed with the stdlib statistics module (imported as `s`)
age_column = df['age']
average_age = s.mean(age_column)
print(f"Average Age of Patients: {average_age:.2f} years")

# Preview a handful of columns as the cell's output
preview_columns = ['age', 'sex', 'trestbps', 'chol']
df[preview_columns].head()


In [None]:

# ‚úÖ Split Dataset
from sklearn.model_selection import train_test_split

X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)


In [None]:

# ‚úÖ Evaluation Function
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        print("\nüìò Train Performance:")
    else:
        pred = clf.predict(X_test)
        print("\nüìï Test Performance:")
    
    acc = accuracy_score(y_test if not train else y_train, pred)*100
    print(f"Accuracy: {acc:.2f}%")
    print("Classification Report:\n", classification_report(y_test if not train else y_train, pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test if not train else y_train, pred))


In [None]:

# ‚úÖ Logistic Regression Model
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)


In [None]:

# ‚úÖ K-Nearest Neighbors Model
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)

print_score(knn_clf, X_train, y_train, X_test, y_test, train=True)
print_score(knn_clf, X_train, y_train, X_test, y_test, train=False)


In [None]:

# ‚úÖ Model Comparison
lr_train_acc = accuracy_score(y_train, lr_clf.predict(X_train)) * 100
lr_test_acc = accuracy_score(y_test, lr_clf.predict(X_test)) * 100

knn_train_acc = accuracy_score(y_train, knn_clf.predict(X_train)) * 100
knn_test_acc = accuracy_score(y_test, knn_clf.predict(X_test)) * 100

results_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'K-Nearest Neighbors'],
    'Training Accuracy %': [lr_train_acc, knn_train_acc],
    'Testing Accuracy %': [lr_test_acc, knn_test_acc]
})
results_df


In [None]:

# ‚úÖ Visualization of Model Comparison
plt.figure(figsize=(8, 5))
sns.barplot(data=results_df, x='Model', y='Testing Accuracy %')
plt.title("Model Performance Comparison (Testing Accuracy)")
plt.ylabel("Accuracy %")
plt.show()
