# Data Exploration and Validation

**Purpose**: Establish data quality and baseline bias metrics

**Dataset**: CMS Medicare 2008 Beneficiary Summary File, Sample 1 (116,352 patients)

**Outputs**: Race distribution, clinical characteristics, and baseline false-negative rate (FNR) disparity

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn's 'whitegrid' style first, then a
# 12x6 default figure size applied through matplotlib's rcParams.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Read the 2008 beneficiary summary sample into `df`. The path is relative,
# so it resolves against the kernel's current working directory.
df = pd.read_csv(
    filepath_or_buffer='data/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv',
)
# Each row of the file is reported as one patient.
print(f'Loaded {len(df):,} patients')

## Feature Engineering

In [None]:
# Derived analysis columns, added to `df` in place.
# See scripts/generate_full_clinical_report.py for full implementation

# Age relative to 2008: BENE_BIRTH_DT floor-divided by 10000 is subtracted
# from 2008 (assumes a YYYYMMDD integer encoding — TODO confirm).
birth_year = df['BENE_BIRTH_DT'] // 10000
df['age'] = 2008 - birth_year

# 0/1 indicator columns: a source code of exactly 1 becomes 1, anything else 0.
for flag_col, source_col in (
    ('race_white', 'BENE_RACE_CD'),
    ('has_diabetes', 'SP_DIABETES'),
    ('has_chf', 'SP_CHF'),
):
    df[flag_col] = df[source_col].eq(1).astype(int)

# Number of flagged conditions among SP_ALZHDMTA, SP_CHF and SP_DIABETES (0-3).
# The CHF and diabetes terms reuse the indicator columns built just above.
alzhdmta_flag = df['SP_ALZHDMTA'].eq(1).astype(int)
df['chronic_count'] = alzhdmta_flag + df['has_chf'] + df['has_diabetes']

# Total cost = MEDREIMB_IP + MEDREIMB_OP, with missing amounts counted as 0.
reimb_ip = df['MEDREIMB_IP'].fillna(0)
reimb_op = df['MEDREIMB_OP'].fillna(0)
df['total_cost'] = reimb_ip + reimb_op

# High-cost label: 1 when total_cost is strictly above the 75th percentile.
cost_cutoff = df['total_cost'].quantile(0.75)
df['high_cost'] = df['total_cost'].gt(cost_cutoff).astype(int)

## Visualize Race Distribution and Outcomes

In [None]:
# Fraction of rows in each race_white group (label 1 = flag set, 0 = not set).
race_dist = df['race_white'].value_counts(normalize=True)
print(f'White: {race_dist[1]:.1%}, Non-White: {race_dist[0]:.1%}')

# Mean of the 0/1 high_cost label within each race_white group, i.e. the
# share of each group labelled high-cost.
outcome_by_race = df['high_cost'].groupby(df['race_white']).mean()
print(f'High-cost rate - White: {outcome_by_race[1]:.1%}, Non-White: {outcome_by_race[0]:.1%}')