# Heart Disease UCI Dataset - Exploratory Data Analysis

This notebook performs comprehensive EDA on the Heart Disease dataset.

## Contents
1. Data Loading & Overview
2. Missing Values Analysis
3. Statistical Summary
4. Distribution Analysis
5. Correlation Analysis
6. Class Balance Analysis
7. Feature Relationships with Target


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Keep library warnings out of the rendered notebook output
warnings.filterwarnings('ignore')

# Plot styling: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Pandas display: lift the limits on column count and display width
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("Libraries loaded successfully!")


## 1. Data Loading & Overview


In [None]:
# Load the raw heart-disease CSV into `df`, the frame every later cell works on.
data_path = Path('../data/raw/heart.csv')

# Fail fast when the file is missing. Merely printing a hint would leave `df`
# undefined, and the cell would then die on `df.head()` with a NameError that
# hides the real cause.
if not data_path.exists():
    raise FileNotFoundError(
        f"Data not found at {data_path}. Please run: python scripts/download_data.py"
    )

df = pd.read_csv(data_path)
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# Display first few rows (kept as the last expression so the notebook renders it)
print("\nFirst 5 rows:")
df.head()


In [None]:
# Print the frame's structural summary via DataFrame.info()
print("Dataset Information:")
print("=" * 50)
df.info()

# Human-readable meaning of each column, keyed by column name
column_descriptions = dict(
    age='Age in years',
    sex='Sex (1 = male, 0 = female)',
    cp='Chest pain type (0-3)',
    trestbps='Resting blood pressure (mm Hg)',
    chol='Serum cholesterol (mg/dl)',
    fbs='Fasting blood sugar > 120 mg/dl (1 = true, 0 = false)',
    restecg='Resting ECG results (0-2)',
    thalach='Maximum heart rate achieved',
    exang='Exercise induced angina (1 = yes, 0 = no)',
    oldpeak='ST depression induced by exercise relative to rest',
    slope='Slope of peak exercise ST segment (0-2)',
    ca='Number of major vessels colored by fluoroscopy (0-3)',
    thal='Thalassemia (0-3)',
    target='Heart disease (0 = no, 1+ = yes)',
)

print("\nColumn Descriptions:")
print("=" * 50)
# Column names are left-aligned in a 12-character field so the colons line up
for name, meaning in column_descriptions.items():
    print(f"{name:12} : {meaning}")


## 2. Missing Values Analysis


In [None]:
# Per-column count of missing values and their share of all rows
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
})

print("Missing Values Summary:")
print("=" * 50)
if missing.sum() == 0:
    # Nothing to tabulate: printing the filtered (empty) frame would only add noise
    print("\n✓ No missing values found!")
else:
    # Show only the columns that actually have gaps
    print(missing_df[missing_df['Missing Count'] > 0])

# This is the first cell that writes a figure, so make sure the output folder
# exists; savefig does not create missing directories.
screenshots_dir = Path('../reports/screenshots')
screenshots_dir.mkdir(parents=True, exist_ok=True)

# Visualize missing values: one heatmap cell per (row, column) of df.isnull()
fig, ax = plt.subplots(figsize=(12, 4))
sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(screenshots_dir / 'missing_values.png', dpi=150, bbox_inches='tight')
plt.show()


## 3. Statistical Summary


In [None]:
# Descriptive statistics of the frame, rounded to two decimals for readability
print("Statistical Summary:")
print("=" * 50)
summary_stats = df.describe()
summary_stats.round(2)


## 4. Distribution Analysis


In [None]:
# Features treated as numerical in the distribution plots
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

# One histogram per feature, with dashed lines marking mean (red) and median (green)
for ax, col in zip(axes, numerical_cols):
    values = df[col]
    col_mean = values.mean()
    col_median = values.median()

    values.hist(bins=30, ax=ax, color='steelblue', edgecolor='white', alpha=0.7)
    ax.axvline(col_mean, color='red', linestyle='--', label=f'Mean: {col_mean:.1f}')
    ax.axvline(col_median, color='green', linestyle='--', label=f'Median: {col_median:.1f}')
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend(fontsize=8)

# The 2x3 grid has six panels but only five features are plotted: hide the last one
axes[-1].set_visible(False)

fig.suptitle('Distribution of Numerical Features', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
fig.savefig('../reports/screenshots/numerical_distributions.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# One box plot per numerical feature, laid out side by side in a single row
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(18, 5))

for ax, col in zip(axes, numerical_cols):
    sns.boxplot(y=df[col], ax=ax, color='lightblue')
    ax.set_title(col, fontsize=12, fontweight='bold')
    ax.set_ylabel(col)

fig.suptitle('Box Plots of Numerical Features', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
fig.savefig('../reports/screenshots/boxplots.png', dpi=150, bbox_inches='tight')
plt.show()


## 5. Correlation Analysis


In [None]:
# Pairwise correlations between all columns; reused by the next cell
correlation_matrix = df.corr()

# Boolean mask covering the upper triangle (diagonal included), so that each
# feature pair is drawn only once in the lower triangle
upper_triangle = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
)
sns.heatmap(correlation_matrix, mask=upper_triangle, ax=ax, **heatmap_style)

ax.set_title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
fig.savefig('../reports/screenshots/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Correlation of every other column with `target`, strongest (by magnitude) first
target_corr = (
    correlation_matrix['target']
    .drop('target')
    .sort_values(key=abs, ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))

# Green bars for positive correlations, red for the rest
bar_colors = ['green' if value > 0 else 'red' for value in target_corr.values]
bars = ax.barh(target_corr.index, target_corr.values, color=bar_colors, alpha=0.7)
ax.axvline(0, color='black', linewidth=0.5)
ax.set_xlabel('Correlation with Target')
ax.set_title('Feature Correlation with Heart Disease Target', fontsize=14, fontweight='bold')

# Write each value just past the end of its bar: to the right of positive bars,
# to the left of the others
for bar, val in zip(bars, target_corr.values):
    if val > 0:
        label_x = val + 0.02
    else:
        label_x = val - 0.08
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(label_x, label_y, f'{val:.2f}', va='center', fontsize=9)

fig.tight_layout()
fig.savefig('../reports/screenshots/target_correlation.png', dpi=150, bbox_inches='tight')
plt.show()


## 6. Class Balance Analysis


In [None]:
# Convert target to binary (0 = no disease, 1 = disease).
# This adds a helper column to `df`; the Section 7 cells read it and the final
# cell of the notebook removes it again.
df['target_binary'] = (df['target'] > 0).astype(int)

# Class distribution.
# value_counts() orders its result by frequency, not by label. The charts below
# pair `.values` positionally with fixed labels and colours, so the order is
# pinned to [0, 1]; otherwise the bars and slices are mislabeled whenever class 1
# is the majority. fill_value=0 also keeps a class that is absent from the data
# from raising a KeyError in the prints.
class_labels = [0, 1]
class_counts = df['target_binary'].value_counts().reindex(class_labels, fill_value=0)
class_pct = (
    df['target_binary'].value_counts(normalize=True).reindex(class_labels, fill_value=0) * 100
)

print("Class Distribution:")
print("=" * 50)
print(f"No Disease (0): {class_counts[0]} samples ({class_pct[0]:.1f}%)")
print(f"Disease (1):    {class_counts[1]} samples ({class_pct[1]:.1f}%)")

# Visualize: bar chart (absolute counts) next to a pie chart (proportions)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart
ax1 = axes[0]
bars = ax1.bar(['No Disease (0)', 'Disease (1)'], class_counts.values,
               color=['#2ecc71', '#e74c3c'], edgecolor='white', linewidth=2)
ax1.set_ylabel('Count')
ax1.set_title('Target Class Distribution', fontsize=14, fontweight='bold')
# Annotate each bar with its count and percentage, slightly above the bar top
for bar, count, pct in zip(bars, class_counts.values, class_pct.values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
             f'{count}\n({pct:.1f}%)', ha='center', fontsize=11)

# Pie chart
ax2 = axes[1]
ax2.pie(class_counts.values, labels=['No Disease', 'Disease'],
        autopct='%1.1f%%', colors=['#2ecc71', '#e74c3c'],
        explode=[0.02, 0.02], shadow=True, startangle=90)
ax2.set_title('Target Class Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/screenshots/class_balance.png', dpi=150, bbox_inches='tight')
plt.show()


## 7. Feature Relationships with Target


In [None]:
# Overlaid histograms of each numerical feature, split by binary target class
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

# Colour used for each class label
class_colors = {0: '#2ecc71', 1: '#e74c3c'}

for ax, col in zip(axes, numerical_cols):
    for label, color in class_colors.items():
        subset = df.loc[df['target_binary'] == label, col]
        ax.hist(subset, bins=20, alpha=0.6, label=f'Class {label}', color=color, edgecolor='white')
    ax.set_title(f'{col} by Target Class', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend()

# Six panels, five features: hide the unused last panel
axes[-1].set_visible(False)

fig.suptitle('Numerical Features Distribution by Target Class', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
fig.savefig('../reports/screenshots/features_by_target.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Violin plots of selected features, one panel per feature, split by target class
key_features = ['age', 'trestbps', 'chol', 'thalach']
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16, 5))

for ax, col in zip(axes, key_features):
    sns.violinplot(
        data=df,
        x='target_binary',
        y=col,
        palette=['#2ecc71', '#e74c3c'],
        ax=ax,
    )
    ax.set_title(col, fontsize=12, fontweight='bold')
    ax.set_xlabel('Heart Disease')
    # Replace the 0/1 tick labels with readable names
    ax.set_xticklabels(['No', 'Yes'])

fig.suptitle('Violin Plots of Key Features by Target', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
fig.savefig('../reports/screenshots/violin_plots.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Final recap of the notebook. Reads `class_counts` / `class_pct` (Section 6) and
# `target_corr` (Section 5), then removes the helper column added in Section 6.
print("="*60)
print("KEY INSIGHTS FROM EDA")
print("="*60)

# Count features by name instead of "all columns minus 2": the helper column is
# dropped at the end of this cell, so a positional count would be off by one
# whenever the cell is executed again.
feature_cols = [c for c in df.columns if c not in ('target', 'target_binary')]

print("\n1. DATASET OVERVIEW:")
print(f"   - Total samples: {len(df)}")
print(f"   - Features: {len(feature_cols)} (excluding target variants)")
print(f"   - Missing values: {df.isnull().sum().sum()}")

print("\n2. CLASS BALANCE:")
print(f"   - No Disease: {class_counts[0]} ({class_pct[0]:.1f}%)")
print(f"   - Disease: {class_counts[1]} ({class_pct[1]:.1f}%)")
# "Balanced" here means the class-0 share lies within 10 points of 50%
print(f"   - Dataset is {'balanced' if abs(class_pct[0] - 50) < 10 else 'imbalanced'}")

print("\n3. TOP CORRELATED FEATURES WITH TARGET:")
for feat, corr in target_corr.head(5).items():
    print(f"   - {feat}: {corr:.3f}")

# Observations are derived from the computed correlations rather than hard-coded,
# so the text can never contradict the numbers printed in section 3.
observed_features = {
    'thalach': 'Maximum heart rate',
    'oldpeak': 'ST depression',
    'cp': 'Chest pain type',
    'age': 'Age',
    'sex': 'Sex (1 = male)',
}

print("\n4. OBSERVATIONS (direction of each feature's correlation with target):")
for feat, label in observed_features.items():
    # Skip features that are absent from this dataset or have no defined correlation
    if feat not in target_corr.index or pd.isna(target_corr[feat]):
        continue
    r = target_corr[feat]
    direction = 'positive' if r > 0 else 'negative'
    print(f"   - {label} ({feat}): {direction} correlation with target (r = {r:+.2f})")

print("\n" + "="*60)

# Clean up: errors='ignore' keeps the cell re-runnable after the column is gone
df = df.drop(columns='target_binary', errors='ignore')
print("\nEDA Complete! Figures saved to reports/screenshots/")
