In [9]:
import pandas as pd
from pathlib import Path

# Load your data
parquet_path = "../assets/extracted.csv"
cohort_df = pd.read_csv(parquet_path)
# rename columns to match final dataset
cohort_df.rename(columns={'admid': 'hadm_id'}, inplace=True)

final_parquet_path = "../dataset/raw/lab_event_data_with_demographics.parquet"
final_df = pd.read_parquet(final_parquet_path)

print(f"Original cohort size: {len(cohort_df)}")
print(f"Final lab events size: {len(final_df)}")
print(f"Missing rows: {len(cohort_df) - len(final_df.drop_duplicates(subset=['subject_id', 'hadm_id']))}")

# Check for missing/invalid IDs in cohort
invalid_ids = cohort_df[
    cohort_df['subject_id'].isna() | 
    cohort_df['hadm_id'].isna()
]
print(f"Rows with missing subject_id or hadm_id: {len(invalid_ids)}")

# Check which patients are missing from final dataset
cohort_patients = set(zip(cohort_df['subject_id'], cohort_df['hadm_id']))
final_patients = set(zip(final_df['subject_id'], final_df['hadm_id']))
missing_patients = cohort_patients - final_patients

print(f"Patients in cohort: {len(cohort_patients)}")
print(f"Patients in final dataset: {len(final_patients)}")
print(f"Missing patients: {len(missing_patients)}")

# Show some missing patients
if missing_patients:
    print("\nFirst 10 missing patients:")
    for i, (sid, hid) in enumerate(list(missing_patients)[:10]):
        print(f"  Subject ID: {sid}, Hospital Admission ID: {hid}")

# Check if missing patients have valid IDs
missing_df = cohort_df[
    cohort_df.apply(lambda row: (row['subject_id'], row['hadm_id']) in missing_patients, axis=1)
]
print(f"\nMissing patients with valid IDs: {len(missing_df[~missing_df['subject_id'].isna() & ~missing_df['hadm_id'].isna()])}")

# Check for potential data type issues
print(f"\nData types in cohort:")
print(f"subject_id: {cohort_df['subject_id'].dtype}")
print(f"hadm_id: {cohort_df['hadm_id'].dtype}")

print(f"\nData types in final dataset:")
print(f"subject_id: {final_df['subject_id'].dtype}")
print(f"hadm_id: {final_df['hadm_id'].dtype}")

Original cohort size: 1092498
Final lab events size: 1170853
Missing rows: 1087590
Rows with missing subject_id or hadm_id: 0
Patients in cohort: 5280
Patients in final dataset: 4908
Missing patients: 372

First 10 missing patients:
  Subject ID: 14634633, Hospital Admission ID: 28387252
  Subject ID: 17957482, Hospital Admission ID: 20759114
  Subject ID: 12956096, Hospital Admission ID: 27116694
  Subject ID: 11101913, Hospital Admission ID: 27589462
  Subject ID: 12593003, Hospital Admission ID: 25929337
  Subject ID: 13474206, Hospital Admission ID: 21575924
  Subject ID: 12956096, Hospital Admission ID: 21897330
  Subject ID: 11338207, Hospital Admission ID: 23582798
  Subject ID: 13474206, Hospital Admission ID: 24246939
  Subject ID: 18215560, Hospital Admission ID: 22709370

Missing patients with valid IDs: 14222

Data types in cohort:
subject_id: int64
hadm_id: int64

Data types in final dataset:
subject_id: int64
hadm_id: int64
