In [1]:
import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict
from tableone import TableOne
from great_tables import GT, html, style, loc

# Read tables

In [None]:
# folder
input_path = "your path"

# file
adt = pd.read_parquet(os.path.join(input_path, "clif_adt.parquet"))
hosp = pd.read_parquet(os.path.join(input_path, "clif_hospitalization.parquet"))
patient = pd.read_parquet(os.path.join(input_path, "clif_patient.parquet"))
labs = pd.read_parquet(os.path.join(input_path, "clif_labs.parquet"))
meds = pd.read_parquet(os.path.join(input_path, "clif_medication_admin_continuous.parquet"))
vitals = pd.read_parquet(os.path.join(input_path, "clif_vitals.parquet"))
resp = pd.read_parquet(os.path.join(input_path, "clif_respiratory_support.parquet"))
prone = pd.read_parquet(os.path.join(input_path, "clif_position.parquet"))
gcs = pd.read_parquet(os.path.join(input_path, "clif_patient_assessments.parquet"))
covid = pd.read_parquet(os.path.join(input_path, "clif_microbiology_nonculture.parquet"))

In [None]:
# output path
parent_folder = os.path.dirname(os.getcwd())
folder_path = os.path.join(parent_folder, "data")
os.makedirs(folder_path, exist_ok=True)

# 1. Table schema summary

In [54]:
tables_dict = {
    'hospitalization': hosp,
    'patient': patient,
    'adt': adt,
    'labs': labs,
    'medication_admin_continuous': meds,
    'vitals': vitals,
    'respiratory_support': resp,
    'position': prone,
    'patient_assessments': gcs,
    'microbiology_nonculture': covid
}

In [None]:
def table_summary(table_name, table, hosp, year_start, year_end):
    summary = defaultdict(dict)
    print(table_name)

    # overall
    summary[table_name][f'{year_start}-{year_end}'] = {
            "row_count": len(table),
            "column_count": len(table.columns),
            "columns": list(table.columns),
            "patient_count": int(table['patient_id'].nunique()) if 'patient_id' in table.columns else None,
            "hospitalization_count": int(table['hospitalization_id'].nunique()) if 'hospitalization_id' in table.columns else None,
        }

    # each year
    for y in range(year_start, year_end+1, 1):
        if table_name == 'microbiology_nonculture':
            table['admission_date'] = pd.to_datetime(table['admission_date'])
            table_sub = table[table['admission_date'].dt.year.eq(y)]
        else:
            hosp_id = hosp.loc[hosp['admission_dttm'].dt.year.eq(y), 'hospitalization_id'].unique()
            pat_id = hosp.loc[hosp['admission_dttm'].dt.year.eq(y), 'patient_id'].unique()

            if 'hospitalization_id' not in table.columns:
                table_sub = table[table['patient_id'].isin(pat_id)]
            else:
                table_sub = table[table['hospitalization_id'].isin(hosp_id)]

        summary[table_name][y] = {
            "row_count": len(table_sub),
            "column_count": len(table_sub.columns),
            "patient_count": int(table_sub['patient_id'].nunique()) if 'patient_id' in table_sub.columns else None,
            "hospitalization_count": int(table_sub['hospitalization_id'].nunique()) if 'hospitalization_id' in table_sub.columns else None,
        }
    return summary



## create yearly summary json file

In [75]:
summary_all = defaultdict(dict)
year_start = 2018
year_end = 2024

for key, values in tables_dict.items():
    summary = table_summary(key, values, hosp, year_start, year_end)
    summary_all.update(summary)


hospitalization
patient
adt
labs
medication_admin_continuous
vitals
respiratory_support
position
patient_assessments
microbiology_nonculture


In [76]:
# save json file for records
with open("../data/table_summary_by_year.json", "w") as f:
    json.dump(summary_all, f, indent=4)

## create summary df

In [None]:
records = []
for table_name, year_dict in summary_all.items():
    for year, summary in year_dict.items():
        row = {'table': table_name, 'year': year}
        row.update(summary)
        records.append(row)

# Create summary df
df_summary = pd.DataFrame(records)

# Get overall year data
df_summary = df_summary[df_summary['year'].eq('2018-2024')]
cols = [c for c in df_summary if c not in ['columns', 'year']]
df_summary = df_summary[cols]
df_summary.rename(columns={
    'table': 'Table Name ',
    'row_count': 'Total Rows',
    'column_count': 'Total Columns',
    'patient_count': 'Unique Patients',
    'hospitalization_count': 'Unique Hospitalizations'
}, inplace=True)

In [87]:
# create great-table
gt = (
    GT(df_summary)
    .tab_header(title="Year", subtitle="2018-01-01 to 2024-12-31")
    .opt_horizontal_padding(scale=2)
    .fmt_number(columns=['Total Rows','Unique Patients', 'Unique Hospitalizations'], decimals=0)
)

gt.show()

# # save table
# gt.write_raw_html("../data/table_cummary_overall.html")

Year,Year,Year,Year,Year
2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31
Table Name,Total Rows,Total Columns,Unique Patients,Unique Hospitalizations
hospitalization,114902,9,90269.0,114902.0
patient,90559,10,90559.0,
adt,613063,8,,121324.0
labs,31493211,9,,120722.0
medication_admin_continuous,8081409,9,,85556.0
vitals,127929417,5,,122140.0
respiratory_support,14477923,25,,122094.0
position,10730591,4,,115865.0
patient_assessments,15228790,8,,115139.0
microbiology_nonculture,2457156,14,1017260.0,


# 2. Demographic summary

### create each year summary table

simple demographic summary
- hospitalization that has icu stay, what's their demographic statistics
- how many patient required ventilation
- how many patient died during hospitalization
- how many paitnet required vassopressor

In [94]:
# Create sub dfs
hosp = hosp[['patient_id', 'hospitalization_id', 'admission_dttm', 'discharge_dttm', 'age_at_admission', 'discharge_category']].drop_duplicates()
patient = patient[['patient_id','race_category', 'ethnicity_category', 'sex_category']].drop_duplicates()
adt = adt[['hospitalization_id', 'in_dttm', 'out_dttm', 'location_category']].drop_duplicates()

# Merge hospital and patient demographics
merged_df = hosp.merge(patient, on='patient_id', how='left')
print("Number of duplicates", merged_df.duplicated(subset=['hospitalization_id', 'patient_id']).sum())

# Get only ventilator
vent = resp[resp['device_category'].eq('IMV')].drop_duplicates()

# Get only vasopressors
vasopressors = ["norepinephrine", "epinephrine", "phenylephrine", "vasopressin", "dopamine", "angiotensin"]
med_vas = meds[meds['med_category'].isin(vasopressors)].drop_duplicates()

# Initialize summary df
df_summary_all = pd.DataFrame()

# Year range
year_start = 2018
year_end = 2024

Number of duplicates 0


In [97]:
for y in range(year_start, year_end+1):
    print("year:", y)
    # Filter hospitalizations for the year
    df_sub = merged_df[merged_df['admission_dttm'].dt.year.eq(y)].copy()

    # Calculate hospitalization days
    df_sub['hospitalization_days'] = (df_sub['discharge_dttm'] - df_sub['admission_dttm']).dt.total_seconds() / (3600*24)

    # Get hospitalization_ids for the year 
    hosp_id = df_sub.hospitalization_id.unique()
    adt_sub = adt[adt['hospitalization_id'].isin(hosp_id)].copy()
    vent_sub = vent[vent['hospitalization_id'].isin(hosp_id)].copy()
    med_vas_sub = med_vas[med_vas['hospitalization_id'].isin(hosp_id)].copy()

    # Filter patient had icu stay
    icu_adt = adt_sub[adt_sub['location_category'] == 'icu'].copy()
    icu_adt = icu_adt.drop_duplicates()
    icu_hosp_id = icu_adt.hospitalization_id.unique()
    df_icu = df_sub[df_sub['hospitalization_id'].isin(icu_hosp_id)].copy()

    # Add total icu days
    icu_adt['icu_days'] = (icu_adt['out_dttm'] - icu_adt['in_dttm']).dt.total_seconds() / (3600*24)
    icu_days = icu_adt.groupby('hospitalization_id')['icu_days'].sum().reset_index()
    df_icu = df_icu.merge(icu_days, on='hospitalization_id', how='left')

    # Create death flag for patient died with hospitalization
    df_icu['death_flag'] = df_icu['discharge_category'].eq('Expired').astype(int)

    # Demographic summary
    df_demo = df_icu[['hospitalization_id', 'hospitalization_days', 'icu_days','age_at_admission', 
                      'race_category', 'ethnicity_category', 'sex_category', 'death_flag']].drop_duplicates(subset='hospitalization_id')

    # Add year
    df_demo['year'] = y

    # Add ventilator flag
    vent_flag = vent_sub[['hospitalization_id']].drop_duplicates()
    vent_flag['vent_flag'] = 1
    df_demo = df_demo.merge(vent_flag, on='hospitalization_id', how='left')
    df_demo['vent_flag'] = df_demo['vent_flag'].fillna(0)

    # Add vasopressor flag
    vas_flag = med_vas_sub[['hospitalization_id']].drop_duplicates()
    vas_flag['vasopressor_flag'] = 1
    df_demo = df_demo.merge(vas_flag, on='hospitalization_id', how='left')
    df_demo['vasopressor_flag'] = df_demo['vasopressor_flag'].fillna(0)

    # Combine into all year summary
    df_summary_all = pd.concat([df_summary_all, df_demo], ignore_index=True)



# Fill patient with missing demographic data with 'Unknown'
df_summary_all['race_category'] = df_summary_all['race_category'].fillna('Unknown')
df_summary_all['sex_category'] = df_summary_all['sex_category'].fillna('Unknown')
df_summary_all['ethnicity_category'] = df_summary_all['ethnicity_category'].fillna('Unknown')

year: 2018
year: 2019
year: 2020
year: 2021
year: 2022
year: 2023
year: 2024


### table one

In [98]:
categorical_cols = ['race_category', 'ethnicity_category', 'sex_category', 'death_flag', 'vent_flag', 'vasopressor_flag']
numerical_cols = ['age_at_admission', 'hospitalization_days', 'icu_days']
cols = [col for col in df_summary_all.columns if col not in ['hospitalization_id']]

# Define the desired order
race_order = [
    'White',
    'Black or African American',
    'Asian',
    'American Indian or Alaska Native',
    'Native Hawaiian or Other Pacific Islander',
    'Other',
    'Unknown',
]

# Convert Race column to a categorical type with the specified order
df_summary_all['race_category'] = pd.Categorical(
    df_summary_all['race_category'],
    categories=race_order,
    ordered=True
)

# Convert boolean columns to Yes/No (can change or ignore this)
boolean_order = ['Yes', 'No']
flags = ['vent_flag', 'vasopressor_flag', 'death_flag']

for flag in flags:
    df_summary_all[flag] = df_summary_all[flag].map({1: 'Yes', 0: 'No'})
    df_summary_all[flag] = pd.Categorical(df_summary_all[flag], categories=boolean_order, ordered=True)



# Column rename
rename_cols = {
    'race_category': 'Race',
    'ethnicity_category': 'Ethnicity',
    'sex_category': 'Gender',
    'age_at_admission': 'Age (years)',
    'hospitalization_days': 'Hospitalization Days',
    'icu_days': 'Total ICU Days',
    'death_flag': 'Hospital Mortality',
    'vent_flag': 'Received Ventilation During Admission',
    'vasopressor_flag': 'Received Vasopressor During Admission'
}


tb1_summary = TableOne(
    df_summary_all, 
    columns=cols, 
    categorical=categorical_cols, 
    nonnormal = numerical_cols,
    groupby='year',
    pval=False,
    rename=rename_cols,
    label_suffix=True)
tb1_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by year,Grouped by year,Grouped by year,Grouped by year,Grouped by year,Grouped by year,Grouped by year,Grouped by year,Grouped by year
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,2018,2019,2020,2021,2022,2023,2024
n,,,214960,24620,28394,28810,30736,29270,36358,36772
"Hospitalization Days, median [Q1,Q3]",,0.0,"6.0 [3.2,11.1]","5.4 [3.0,9.9]","5.4 [3.0,10.1]","6.1 [3.1,11.6]","6.1 [3.2,11.6]","6.0 [3.2,11.5]","6.1 [3.2,11.3]","6.2 [3.3,11.4]"
"Total ICU Days, median [Q1,Q3]",,0.0,"2.0 [1.1,4.1]","1.9 [1.0,3.7]","1.8 [1.0,3.5]","2.0 [1.1,4.1]","2.0 [1.1,4.2]","2.1 [1.1,4.2]","2.2 [1.2,4.3]","2.1 [1.1,4.3]"
"Age (years), median [Q1,Q3]",,0.0,"66.0 [53.0,76.0]","64.0 [51.0,75.0]","65.0 [52.0,75.0]","65.0 [53.0,75.0]","66.0 [53.0,76.0]","66.0 [54.0,76.0]","67.0 [54.0,76.0]","67.0 [54.0,76.0]"
"Race, n (%)",White,,157710 (73.4),18480 (75.1),20868 (73.5),21186 (73.5),22394 (72.9),21802 (74.5),26616 (73.2),26364 (71.7)
"Race, n (%)",Black or African American,,28374 (13.2),3432 (13.9),4172 (14.7),3866 (13.4),4248 (13.8),3488 (11.9),4530 (12.5),4638 (12.6)
"Race, n (%)",Asian,,7650 (3.6),904 (3.7),1022 (3.6),990 (3.4),990 (3.2),1102 (3.8),1322 (3.6),1320 (3.6)
"Race, n (%)",American Indian or Alaska Native,,466 (0.2),56 (0.2),52 (0.2),56 (0.2),84 (0.3),80 (0.3),68 (0.2),70 (0.2)
"Race, n (%)",Native Hawaiian or Other Pacific Islander,,386 (0.2),50 (0.2),60 (0.2),42 (0.1),36 (0.1),50 (0.2),66 (0.2),82 (0.2)
"Race, n (%)",Other,,14512 (6.8),1218 (4.9),1590 (5.6),1954 (6.8),2262 (7.4),1952 (6.7),2594 (7.1),2942 (8.0)


In [100]:
# Prepare table for great-table
table1_df = tb1_summary.tableone
table1_df = table1_df.reset_index(drop=False)
table1_df.columns = table1_df.columns.droplevel(0)

# Drop missing columns
table1_df.drop(columns=['Missing'], inplace=True)
table1_df.columns.values[0:2] = ['Variable', 'Subcategory']

# Remove unecessary rows
idx = (
    table1_df['Variable'].eq('Gender, n (%)') & table1_df['Subcategory'].isin(['Male', 'Unknown']) |        # just keep female percentage
    table1_df['Variable'].eq('Hospital Mortality, n (%)') & table1_df['Subcategory'].isin(['No']) |
    table1_df['Variable'].eq('Received Ventilation During Admission, n (%)') & table1_df['Subcategory'].isin(['No']) |
    table1_df['Variable'].eq('Received Vasopressor During Admission, n (%)') & table1_df['Subcategory'].isin(['No'])
    )
table1_df = table1_df[~idx]


# Move Variable to Subcategory where Subcategory is empty to fit great-table format
variable_counts = table1_df['Variable'].value_counts()
mask = mask = (table1_df['Subcategory'].isna() | table1_df['Subcategory'].eq('')) | (table1_df['Variable'].map(variable_counts) == 1) # copy value if only has one category
table1_df.loc[mask, 'Subcategory'] = table1_df.loc[mask, 'Variable']
table1_df.loc[mask, 'Variable'] = ''

In [None]:
# Create great-table
gt = (
    GT(table1_df)
    .tab_header(title="ICU Patient Yearly Summary", subtitle=f"{year_start}-01-01 to {year_end}-12-31")
    .opt_horizontal_padding(scale=2)
    .tab_stub(groupname_col="Variable", rowname_col="Subcategory",)
    .opt_vertical_padding(scale=0.8)
    # .tab_style(
    #     style=[
    #         style.text(size=10),
    #     ],
    #     locations=loc.stub()
    # )
)
gt.show()

# save table
# gt.write_raw_html("../data/yearly_summary.html")

ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary,ICU Patient Yearly Summary
2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31,2018-01-01 to 2024-12-31
Unnamed: 0_level_2,Overall,2018,2019,2020,2021,2022,2023,2024
,,,,,,,,
n,214960,24620,28394,28810,30736,29270,36358,36772
"Hospitalization Days, median [Q1,Q3]","6.0 [3.2,11.1]","5.4 [3.0,9.9]","5.4 [3.0,10.1]","6.1 [3.1,11.6]","6.1 [3.2,11.6]","6.0 [3.2,11.5]","6.1 [3.2,11.3]","6.2 [3.3,11.4]"
"Total ICU Days, median [Q1,Q3]","2.0 [1.1,4.1]","1.9 [1.0,3.7]","1.8 [1.0,3.5]","2.0 [1.1,4.1]","2.0 [1.1,4.2]","2.1 [1.1,4.2]","2.2 [1.2,4.3]","2.1 [1.1,4.3]"
"Age (years), median [Q1,Q3]","66.0 [53.0,76.0]","64.0 [51.0,75.0]","65.0 [52.0,75.0]","65.0 [53.0,75.0]","66.0 [53.0,76.0]","66.0 [54.0,76.0]","67.0 [54.0,76.0]","67.0 [54.0,76.0]"
"Gender, n (%)",95240 (44.3),11206 (45.5),12896 (45.4),12462 (43.3),13600 (44.2),13056 (44.6),15774 (43.4),16246 (44.2)
"Hospital Mortality, n (%)",20352 (9.5),1868 (7.6),2216 (7.8),2908 (10.1),3164 (10.3),3048 (10.4),3580 (9.8),3568 (9.7)
"Received Ventilation During Admission, n (%)",60696 (28.2),6652 (27.0),7682 (27.1),7926 (27.5),8184 (26.6),7940 (27.1),10890 (30.0),11422 (31.1)
"Received Vasopressor During Admission, n (%)",107658 (50.1),10692 (43.4),12958 (45.6),13126 (45.6),14584 (47.4),14650 (50.1),20302 (55.8),21346 (58.0)
"Race, n (%)","Race, n (%)","Race, n (%)","Race, n (%)","Race, n (%)","Race, n (%)","Race, n (%)","Race, n (%)","Race, n (%)"
