# 01. Exploratory Data Analysis (EDA)

This notebook explores the Diabetes 130-US Hospitals dataset, focusing on understanding the features and the target variable (readmission).

In [None]:
# CRITICAL FOR COLAB: Path fix to import from src/
import os
import sys

# The parent of the current working directory is added to the import search
# path so that `src` can be imported when running in Colab or locally.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.preprocessing import load_raw_data

# Notebook-wide plotting defaults: seaborn white-grid style, 100-dpi figures.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100})

## Load Data

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
data_path = '../data/raw/diabetic_data.csv'

# Fail fast: every later cell reads `df`, so a missing file should stop the run
# here with a clear message instead of surfacing later as a NameError.
if not Path(data_path).exists():
    raise FileNotFoundError(
        f"Data file not found at {data_path}. Please ensure the dataset is in the data/raw/ directory."
    )

df = load_raw_data(data_path)
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print("\nFirst 5 rows:")
display(df.head())

## Target Variable Analysis

In [None]:
# Frequency of each label in the `readmitted` column
readmit_counts = df['readmitted'].value_counts()
print("Readmission distribution:")
print(readmit_counts)

# Rows labelled '<30' make up the positive class of the binary target
early_readmit_total = df['readmitted'].eq('<30').sum()
print(f"\nBinary target (readmitted <30): {early_readmit_total} samples")

In [None]:
# Bar chart of how many rows fall under each readmission label
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='readmitted', data=df, palette='viridis', ax=ax)
ax.set_title('Distribution of Readmission Status', fontsize=14, fontweight='bold')
ax.set_xlabel('Readmission Status')
ax.set_ylabel('Count')
plt.show()

## Feature Analysis

In [None]:
# Select the numeric columns once; `numerical_cols` is reused by the
# correlation cell below.
numeric_frame = df.select_dtypes(include=['number'])
numerical_cols = numeric_frame.columns

print(f"Numerical features ({len(numerical_cols)}):")
display(numeric_frame.describe())

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Null count per column; keep only columns that have nulls, largest first
null_counts = df.isnull().sum()
missing_data = null_counts[null_counts > 0].sort_values(ascending=False)

if missing_data.empty:
    print("No missing values detected!")
else:
    print("Missing values:")
    print(missing_data)

# Check for '?' values (common placeholder in this dataset).
# A single vectorized comparison covers every column regardless of dtype, so
# string-dtype or categorical columns are not skipped and no column is scanned
# twice. Columns that cannot hold '?' simply compare as all-False.
placeholder_counts = df.eq('?').sum()
placeholder_counts = placeholder_counts[placeholder_counts > 0]

# List of (column, count) pairs in the frame's column order.
question_mark_cols = list(placeholder_counts.items())

if question_mark_cols:
    print("\nColumns with '?' placeholder:")
    for col, count in question_mark_cols:
        print(f"  {col}: {count}")

## Key Insights

Use this space to document your findings:
- Class imbalance in readmission target
- Feature distributions
- Missing data patterns
- Correlations worth investigating