# Heart Disease UCI Dataset - Exploratory Data Analysis

**Assignment:** MLOps (S1-25_AIMLCZG523)

This notebook performs comprehensive EDA on the UCI Heart Disease dataset including:
- Data loading and inspection
- Missing value analysis (non-null counts per column via `df.info()`)
- Statistical summaries
- Visualizations (histograms, correlation heatmaps, class balance)


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Settings
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to every figure below.
# NOTE(review): this style name presumably depends on the installed matplotlib
# version — confirm against the project's pinned requirements.
plt.style.use('seaborn-v0_8-whitegrid')
# None lifts the column limit so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline


## 1. Load Dataset


In [None]:
# Load dataset
data_path = Path('../data/raw/heart.csv')

# Fail fast when the CSV is missing. Every later cell reads `df`, so merely
# printing a hint would let "Run All" continue into an unrelated NameError
# (or silently reuse a stale `df` left in the kernel from an earlier run).
if not data_path.exists():
    raise FileNotFoundError(
        f"Dataset not found at {data_path}. Run: python scripts/download_data.py"
    )

df = pd.read_csv(data_path)
print(f"Dataset loaded: {df.shape[0]} samples, {df.shape[1]} features")
display(df.head())


In [None]:
# Dataset info: structural overview first, descriptive statistics second
print("Dataset Info:", "=" * 50, sep="\n")
df.info()

# The describe() frame is the cell's last expression, so it is rendered as rich output.
print("\nStatistical Summary:")
df.describe()


## 2. Target Distribution (Class Balance)


In [None]:
# Target distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# value_counts() orders rows by descending frequency, not by class label.
# Sorting by index puts class 0 first and class 1 second, so each count lines up
# with the hard-coded label and colour used in the two charts below.
target_counts = df['target'].value_counts().sort_index()
colors = ['#2ecc71', '#e74c3c']  # green = no disease (0), red = disease (1)

# Bar plot
axes[0].bar(['No Disease (0)', 'Disease (1)'], target_counts.values, color=colors)
axes[0].set_title('Target Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count')
# Annotate each bar with its count, placed slightly above the bar top.
for i, v in enumerate(target_counts.values):
    axes[0].text(i, v + 3, str(v), ha='center', fontweight='bold')

# Pie chart (same class order as the bar plot)
axes[1].pie(target_counts.values, labels=['No Disease', 'Disease'],
            autopct='%1.1f%%', colors=colors, explode=(0.05, 0.05))
axes[1].set_title('Target Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Label-based lookup, so this line always reported the right class for each count.
print(f"Class Balance: No Disease={target_counts[0]}, Disease={target_counts[1]}")


## 3. Correlation Heatmap


In [None]:
# Correlation heatmap
correlation_matrix = df.corr()

# Boolean mask that is True on and above the diagonal; masked cells are not drawn,
# so each feature pair appears once, in the lower triangle.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu_r',
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    square=True,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)

fig.tight_layout()
plt.show()


## 4. Feature Distributions


In [None]:
# Feature distributions by target
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# (target value, legend label, colour) — drawn in this order on every panel.
class_styles = [
    (0, 'No Disease', '#2ecc71'),
    (1, 'Disease', '#e74c3c'),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One panel per feature, with the two classes overlaid as semi-transparent histograms.
for ax, col in zip(axes, numerical_features):
    for target_value, label, color in class_styles:
        class_values = df.loc[df['target'] == target_value, col]
        class_values.hist(ax=ax, alpha=0.7, label=label, color=color, bins=20)
    ax.set_title(f'{col.upper()} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.legend()

# The 2x3 grid has six panels but only five features are plotted; blank the spare one.
axes[5].axis('off')
plt.tight_layout()
plt.show()
