# 01 - Data Exploration

Initial exploration and analysis of the Diabetes 130-US Hospitals dataset.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_raw_data

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## Load Data

In [None]:
# Load the raw dataset through the project's loader.
# NOTE(review): presumably returns a pandas DataFrame (later cells call
# DataFrame methods on `df`) — confirm against src/data_loader.py.
df = load_raw_data()
# As the last expression in the cell, this renders a preview of the first rows.
df.head()

## Basic Statistics

In [None]:
# Structural overview of the frame: dimensions, column names, and the dtype
# of each column. Emitted as one print call; the newline separator plus the
# leading "\n" in each later section reproduces a blank line between sections.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    f"\nData types:\n{df.dtypes}",
    sep="\n",
)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
# With default arguments, pandas restricts the summary to the numeric
# columns whenever the frame contains any.
df.describe()

## Missing Values Analysis

In [None]:
# Per-column tally of null entries, plus the same figure expressed as a
# percentage of the total row count.
# NOTE(review): only values pandas treats as null are counted; placeholder
# strings would be missed unless the loader converts them — confirm.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct,
})

# Keep only the columns that actually have gaps, worst offenders first.
missing_df = missing_df.loc[missing.gt(0)].sort_values(by='Percentage', ascending=False)
print(missing_df)

In [None]:
# Horizontal bar chart of the (up to) 20 first rows of missing_df, drawn
# only when at least one feature has missing values.
if not missing_df.empty:
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_df['Percentage'].head(20).plot(kind='barh', ax=ax)
    ax.set_xlabel('Percentage Missing (%)')
    ax.set_title('Top 20 Features with Missing Values')
    fig.tight_layout()
    plt.show()

## Feature Distribution Analysis

In [None]:
# Analyze numerical features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical features: {numeric_cols}")

# Plot histograms for at most the first 9 numeric features. The grid is
# sized from the columns actually plotted, so a frame with many numeric
# columns does not produce rows of empty panels.
plot_cols = numeric_cols[:9]
if plot_cols:
    n_cols = min(3, len(plot_cols))
    n_rows = (len(plot_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False makes plt.subplots always return a 2-D array of Axes,
    # so flattening works uniformly for 1x1, 1xN and MxN grids.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, plot_cols):
        df[col].hist(bins=30, ax=ax, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the trailing panels when the grid has more cells than features.
    for ax in axes[len(plot_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Analyze categorical features (columns selected by the object dtype)
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical features: {categorical_cols}")

# Print the heading only when there is a column to report on, so an empty
# selection does not leave a dangling header with nothing beneath it.
if categorical_cols:
    print("\nSample value counts for first categorical column:")
    # value_counts() orders by frequency, so head(10) shows the 10 most
    # common values of the first categorical column.
    print(df[categorical_cols[0]].value_counts().head(10))

## Correlation Analysis

In [None]:
# Pairwise correlation between the numeric features, rendered as an
# annotated heatmap. Needs at least two numeric columns to be meaningful.
if len(numeric_cols) > 1:
    corr_matrix = df[numeric_cols].corr()

    # Diverging palette centred on zero; each cell is annotated with its
    # coefficient to two decimals.
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
        square=True,
        linewidths=1,
    )

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
    ax.set_title('Correlation Matrix of Numerical Features')
    fig.tight_layout()
    plt.show()

## Key Insights

**TODO: Document key findings from exploration:**
- Dataset size and structure
- Missing value patterns
- Feature distributions
- Notable correlations
- Data quality issues

## Next Steps

Proceed to `02_preprocessing.ipynb` for data cleaning and feature engineering.