# 01. Exploratory Data Analysis (EDA)

This notebook explores the Diabetes 130-US Hospitals dataset, focusing on understanding the features and the target variable (readmission).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the sibling src/ directory importable so the local `preprocessing`
# module can be found. The path is relative to the kernel's working directory
# (presumably the notebook's own folder -- adjust if the kernel starts elsewhere).
src_dir = '../src'
if src_dir not in sys.path:  # guard keeps re-runs of this cell from stacking duplicates
    sys.path.append(src_dir)
from preprocessing import load_data

# Apply one seaborn style for every figure in the notebook.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Location of the raw CSV, relative to the kernel's working directory.
data_path = '../data/diabetic_data.csv'

# Fail fast: every later cell reads `df`, so a missing file must stop the run
# here with a clear message instead of surfacing downstream as a NameError.
if not Path(data_path).exists():
    raise FileNotFoundError(
        f"Data file not found at {data_path}. Please ensure the dataset is in the data/ directory."
    )

# `load_data` is the project's loader from src/preprocessing; the code below
# relies on it returning an object with a `.shape` (rows, columns).
df = load_data(data_path)
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

## Target Variable Analysis

In [None]:
# Bar chart of how many rows fall into each value of the `readmitted` column.
fig, ax = plt.subplots(figsize=(10, 6))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated for
# removal, so `hue` is bound to the same column as `x`. `dodge=False` keeps one
# full-width bar per category on older seaborn releases as well.
sns.countplot(
    data=df,
    x='readmitted',
    hue='readmitted',
    palette='viridis',
    dodge=False,
    ax=ax,
)

# The legend would only repeat the x tick labels; drop it when seaborn adds one.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Distribution of Readmission Status')
plt.show()

## Feature Analysis

In [None]:
# Correlation heatmap restricted to the numeric-dtype columns of `df`.
numerical_cols = df.select_dtypes(include=['number']).columns

# Pairwise correlation matrix (pandas' default method) over those columns.
corr_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,       # print each coefficient inside its cell
    fmt='.2f',        # two decimals keeps the annotations readable
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()