1. Setup and Install Libraries

In [None]:
# Install libraries if not already available (in Colab, most are pre-installed)
!pip install seaborn scikit-learn

2. Import Required Libraries

In [None]:
# Standard library
from pathlib import Path

# Basic libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML models and evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

3. Load Dataset

In [None]:
# Load the UCI Heart Disease Dataset from a CSV named 'heart.csv'.
data_path = Path("heart.csv")

# Only ask for an upload when the file is not already on disk. This keeps
# "Restart & Run All" from blocking on the upload widget on every run, and
# lets the notebook work outside Colab when heart.csv sits next to it.
uploaded = {}
if not data_path.exists():
    try:
        # google.colab exists only inside Google Colab, hence the guarded,
        # cell-local import.
        from google.colab import files
    except ImportError as exc:
        raise FileNotFoundError(
            f"{data_path} not found and google.colab is unavailable; "
            "place the CSV next to this notebook."
        ) from exc

    uploaded = files.upload()
    # The upload is saved under its original file name, so a differently
    # named file would not be picked up by the read below.
    if not data_path.exists():
        raise FileNotFoundError(
            f"Expected an upload named {data_path.name!r}, got {sorted(uploaded)}"
        )

# Read the dataset
df = pd.read_csv(data_path)
df.head()

4. Basic Data Understanding

In [None]:
# Shape of dataset
print("Dataset shape:", df.shape)

# Data types and non-null counts.
# df.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() -- that would append a stray "None" line.
df.info()

# Summary statistics. Left as the cell's last expression so the notebook
# renders it as a rich table instead of plain text.
df.describe()

5. Data Cleaning & Preparation

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

# This dataset usually has no gaps. If some do show up, fill them -- for
# example with the column mean:
# df['chol'] = df['chol'].fillna(df['chol'].mean())

# Categorical columns would need encoding before modelling.
# A column such as "sex" that is already coded 0/1 can be used as is.


6. Exploratory Data Analysis (EDA)

In [None]:
# Correlation heatmap.
# Only numeric columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so a stray string column would
# otherwise break this cell. With an all-numeric frame this is a no-op.
numeric_df = df.select_dtypes(include="number")
plt.figure(figsize=(10, 6))
# fmt=".2f" keeps the per-cell annotations short enough to stay readable.
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Histogram of every numeric feature
df.hist(bins=20, figsize=(15, 10))
plt.tight_layout()  # stop subplot titles and tick labels from overlapping
plt.show()

# Boxplot for cholesterol vs target
sns.boxplot(x="target", y="chol", data=df)
plt.title("Cholesterol Levels vs Heart Disease")
plt.show()


7. Feature Selection & Train-Test Split

In [None]:
# Define features (X) and target (y)
X = df.drop("target", axis=1)
y = df["target"]

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

8. Model Training

In [None]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Support Vector Machine
svm = SVC(probability=True)
svm.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

9. Model Evaluation

In [None]:
# (display name, fitted estimator) pairs, evaluated in this order
models = [
    ("Logistic Regression", log_reg),
    ("SVM", svm),
    ("Random Forest", rf),
]

for name, model in models:
    # Score each model on the held-out test split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"--- {name} ---")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)
    print("\n")

10. ROC Curve and AUC

In [None]:
# One shared axes so all ROC curves can be compared directly
fig, ax = plt.subplots(figsize=(8, 6))

for name, model in models:
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (class 1 for a 0/1 target).
    y_probs = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# Dashed diagonal: the reference line where TPR equals FPR
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend()
plt.show()


11. Prediction on New Patient Data

In [None]:
# Example new patient data [age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal]
# NOTE(review): the values must be in the same order as the columns of X --
# confirm against X.columns if the CSV layout differs.
new_patient = np.array([[52, 1, 0, 125, 212, 0, 1, 168, 0, 1.0, 2, 2, 3]])

# The scaler was fitted on a DataFrame, so it remembers the feature names.
# Wrapping the row in a DataFrame with those same columns avoids sklearn's
# "X does not have valid feature names" warning, and a row with the wrong
# number of values fails here with a clear shape error.
new_patient_df = pd.DataFrame(new_patient, columns=X.columns)

# Scale new data with the statistics learned from the training split
new_patient_scaled = scaler.transform(new_patient_df)

# Predict using the Random Forest (treated as the best model here; confirm
# against the evaluation results before relying on it).
prediction = rf.predict(new_patient_scaled)
print("Prediction:", "Heart Disease" if prediction[0] == 1 else "No Heart Disease")