# Task 3 — Heart Disease Prediction (Beginner)
This notebook loads the Heart Disease UCI (Cleveland) dataset from a CSV file (upload the file, or point `csv_path` at its location), performs basic preprocessing, trains a Logistic Regression classifier, and evaluates it with accuracy, a confusion matrix, and ROC AUC.
If you don't have the CSV, download it from Kaggle, upload it to the runtime, and then update `csv_path`.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Best-effort: switch matplotlib to inline rendering when running under
# IPython/Jupyter. Outside IPython the call fails (e.g. `get_ipython` is not
# defined), which is ignored on purpose. `except Exception` is used instead of
# a bare `except:` so KeyboardInterrupt / SystemExit are not swallowed.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except Exception:
    pass

# Replace this path with your uploaded CSV path in Colab/Jupyter
csv_path = 'heart.csv'  # update if needed
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    # If the file is not found, build a tiny example frame that only shows the
    # expected structure: 13 feature columns plus the binary `target` column.
    # Four rows are far too few for a meaningful model, so say so explicitly
    # instead of letting later cells report numbers that look like real results.
    print('CSV not found at', csv_path)
    print('Using a 4-row placeholder frame; metrics computed on it are not meaningful.')
    df = pd.DataFrame({
        'age':[63,37,41,56],
        'sex':[1,1,0,1],
        'cp':[1,2,1,1],
        'trestbps':[145,130,130,120],
        'chol':[233,250,204,236],
        'fbs':[1,0,0,0],
        'restecg':[0,1,0,1],
        'thalach':[150,187,172,178],
        'exang':[0,0,0,0],
        'oldpeak':[2.3,3.5,1.4,0.8],
        'slope':[3,2,2,2],
        'ca':[0,0,0,0],
        'thal':[1,2,3,2],
        'target':[1,0,0,1]
    })
# Last expression of the cell: shows the first rows as the cell output.
df.head()


In [None]:
# Basic EDA: frame dimensions, per-column dtype / non-null summary,
# descriptive statistics, and the class balance of the `target` column.
print('Shape:', df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()
print(df.describe())
# countplot returns the Axes it drew on; set the title on that Axes directly.
ax = sns.countplot(x='target', data=df)
ax.set_title('Target distribution (1 = disease, 0 = no disease)')
plt.show()


In [None]:
# Simple preprocessing: drop rows with missing values (beginner)
df_clean = df.dropna().copy()

# Features and target: every column except `target` is used as a feature.
# No dtype filtering or encoding is done here, so all feature columns must
# already be numeric for the classifier to accept them.
X = df_clean.drop(columns=['target'])
y = df_clean['target']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Allow up to 1000 solver iterations when fitting.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (the positive class for a 0/1 target).
y_proba = model.predict_proba(X_test)[:,1]

print('Accuracy:', accuracy_score(y_test, y_pred))
# ROC AUC is only defined when the test labels contain both classes. With a
# very small frame (e.g. the 4-row placeholder, whose test split is one row)
# that is not the case, so report it as undefined instead of letting
# roc_auc_score abort the notebook.
if y_test.nunique() < 2:
    print('ROC AUC: undefined (the test split contains only one class)')
else:
    print('ROC AUC:', roc_auc_score(y_test, y_proba))


In [None]:
# Confusion matrix
# Pin the label order to the classes the model was trained on, so the matrix
# keeps the same shape and axis order even when a class is missing from the
# test split, and label the ticks with the actual class values.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# ROC Curve
# The curve and its AUC are only defined when the test labels contain both
# classes; skip the plot with a message instead of failing on tiny test splits.
if y_test.nunique() < 2:
    print('ROC curve skipped: the test split contains only one class.')
else:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    # Compute the AUC once and reuse it in the legend label.
    auc_value = roc_auc_score(y_test, y_proba)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label='ROC Curve (AUC = %0.2f)' % auc_value)
    # Diagonal reference line: performance of a random classifier.
    ax.plot([0,1],[0,1],'--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend()
    plt.show()


### Notes (Beginner)
- Replace `csv_path` with the path to the Heart Disease dataset CSV you upload.
- This is a basic pipeline; more advanced preprocessing (imputation, encoding) and feature selection can improve results.
