# Exploratory Data Analysis (EDA) - Heart Disease Prediction

This notebook covers the initial exploratory data analysis for Phase 1 of the MLOps pipeline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import sys
import os

# Make ../src importable so the fallback `data.load_data` import below can
# resolve. The path is relative to the current working directory, so this
# assumes the notebook is run from its own folder.
# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path every time.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Load data

# Option 1: Load from raw CSV if it exists
data_path = '../data/raw/heart_disease_raw.csv'
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
else:
    # Option 2: Use the load_data module
    # Imported lazily so the project module is only required when the raw
    # CSV is not already on disk.
    from data.load_data import load_data_pipeline
    df = load_data_pipeline()

# Quick sanity check on the size of what was loaded
# (presumably a DataFrame in both branches, so this is (rows, columns)).
print(df.shape)

## 1. Data Quality Assessment

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes)
# as a first look at data quality.
df.info()

In [None]:
# Build the boolean null mask once; it feeds both the per-column counts
# and the heatmap below.
null_mask = df.isnull()

# Per-column null counts, printing only the columns that have any
missing = null_mask.sum()
has_missing = missing > 0
print(missing[has_missing])

# Row-by-column view of where the nulls sit (colour bar suppressed)
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False)
plt.title("Missing Values Heatmap")
plt.show()

## 2. Distribution Analysis

In [None]:
# Bar chart of how many rows fall under each value of the 'target' column
sns.countplot(data=df, x='target')
plt.title("Target Distribution")
plt.show()

In [None]:
# Columns to plot as histograms
num_cols = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# One histogram per listed column, 20 bins each, drawn in a single figure
numeric_view = df[num_cols]
numeric_view.hist(bins=20, figsize=(12, 10))
plt.show()

## 3. Correlation Analysis

In [None]:
# Pairwise correlation of the numeric columns, annotated to two decimals.
# Restrict to numeric/bool columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises if any are present.
# select_dtypes works on both old and new pandas versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

## 4. Automated Report

In [None]:
# Profile Report
# Generate the automated profiling report and save it as HTML.
report_path = "../reports/eda_report.html"

# Ensure the output directory exists before writing, so a fresh checkout
# without ../reports does not fail on the final write.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

profile = ProfileReport(df, title="Heart Disease EDA Report")
profile.to_file(report_path)