# Exploratory Data Analysis - Heart Disease Dataset

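This notebook performs exploratory data analysis (EDA) on the Cleveland subset of the UCI Heart Disease dataset: a data overview, a missing-value check, class balance, feature distributions, and feature correlations.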


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Append the parent directory to the module search path so the project-local
# `src` package imported below can be resolved.
# NOTE(review): '..' is a relative entry, so it is resolved against the current
# working directory — presumably the notebook is run from its own folder; confirm.
sys.path.append('..')
from src.data.download_data import download_heart_disease_dataset
# NOTE(review): load_and_preprocess_data is not referenced in the cells visible
# here — confirm whether a later cell uses it.
from src.utils.preprocessing import load_and_preprocess_data

# Global plotting defaults: seaborn "whitegrid" theme, then a 12x8 default
# figure size for every figure that does not pass its own `figsize`.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})


## 1. Data Loading


In [None]:
# Expected location of the raw Cleveland CSV (relative to the working directory).
data_path = Path("../data/raw/heart_disease_cleveland.csv")

# Call the project's download helper only when the file is not on disk yet
# (presumably the helper writes to this same path — verify in src.data).
if not data_path.exists():
    download_heart_disease_dataset()

# Read the CSV into a DataFrame, report its dimensions, and preview the first rows.
df = pd.read_csv(filepath_or_buffer=data_path)
print(f"Dataset shape: {df.shape}")
df.head(n=5)


## 2. Data Overview


In [None]:
# Print a structural summary of the frame (column names, non-null counts,
# dtypes); DataFrame.info() writes directly to stdout.
df.info()
# Summary statistics of the columns; as the last expression of the cell, its
# result is what the notebook renders as the cell output.
df.describe()


In [None]:
# Count NaN/None entries per column.
missing_values = df.isna().sum()

# List only the columns that have at least one missing entry.
print("Missing values:")
print(missing_values.loc[missing_values.gt(0)])

# Say so explicitly when no column has any missing entry.
if not missing_values.any():
    print("\nNo missing values found!")


## 3. Target Variable Analysis (Class Balance)


In [None]:
# Convert target to binary: 0 stays 0 ("No Disease"), any value greater than 0
# becomes 1 ("Disease").
df['target_binary'] = (df['target'] > 0).astype(int)

# Class distribution.
# value_counts() orders its result by frequency, not by label. Pin the order to
# [0, 1] so the hard-coded labels and colours below always line up with the
# right class, even when class 1 is the majority. fill_value=0 keeps the
# lookups below valid if one class is absent from the data.
class_labels = ['No Disease', 'Disease']
class_counts = df['target_binary'].value_counts().reindex([0, 1], fill_value=0)
print("Class distribution:")
print(class_counts)
print(f"\nClass balance: {class_counts[0]/len(df)*100:.2f}% No Disease, {class_counts[1]/len(df)*100:.2f}% Disease")

# Visualization: absolute counts (bar) next to relative shares (pie).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

class_counts.plot(kind='bar', ax=axes[0], color=['skyblue', 'coral'])
axes[0].set_title('Class Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Target (0=No Disease, 1=Disease)', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_xticklabels(class_labels, rotation=0)

# Label the wedges with the class names instead of the raw 0/1 index values.
class_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=['skyblue', 'coral'],
                  labels=class_labels)
axes[1].set_title('Class Balance', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path('../screenshots').mkdir(parents=True, exist_ok=True)
plt.savefig('../screenshots/class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Feature Distributions (Histograms)


In [None]:
# Select numeric features to plot, one histogram per feature.
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# 2x3 grid flattened to a 1-D array so panels can be addressed by position.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, feature in enumerate(numeric_features):
    ax = axes[idx]
    df[feature].hist(bins=30, ax=ax, color='steelblue', edgecolor='black')
    ax.set_title(f'{feature.capitalize()} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

# Remove every subplot that did not receive a feature. Deriving this from the
# feature list (instead of a fixed index) keeps the cell correct if the list
# is shortened or extended up to the grid size.
for unused_ax in axes[len(numeric_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path('../screenshots').mkdir(parents=True, exist_ok=True)
plt.savefig('../screenshots/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Correlation Heatmap


In [None]:
# Calculate the pairwise correlation matrix over the listed columns.
# 'target_binary' is the column added by the class-balance cell, so that cell
# must have been run first.
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                   'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target_binary']

correlation_matrix = df[feature_columns].corr()

# Create heatmap: annotated cells, diverging palette centred on zero.
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path('../screenshots').mkdir(parents=True, exist_ok=True)
plt.savefig('../screenshots/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Show correlations with target, strongest positive first (the first entry is
# the target's correlation with itself).
target_corr = correlation_matrix['target_binary'].sort_values(ascending=False)
print("\nCorrelation with Target:")
print(target_corr)
