In [None]:
import sys 
import os
import polars as pl 
project_root = os.path.dirname(os.getcwd())
sys.path.insert(0, project_root)
from utils.config import config
from utils.io import read_data

In [None]:
# Load the analysis cohort from an intermediate parquet file (path is relative to
# this notebook's working directory). Later cells read `patient_id`,
# `hospitalization_id` and `encounter_block` from this frame.
# Alternative input, kept for reference:
# cohort_df = pl.read_parquet("../output/intermediate/relevant_cohort_with_deathage.parquet")
cohort_df = pl.read_parquet("../output/intermediate/calc_final_df_with_desc.parquet")

In [None]:
# Site-specific settings come from the shared project config.
site_name = config['site_name']
tables_path = config['tables_path']
file_type = config['file_type']


def _clif_table_path(table_name):
    """Return the path of the CLIF table `table_name` under `tables_path`."""
    return f"{tables_path}/clif_{table_name}.{file_type}"


# One path per CLIF table used in this notebook.
adt_filepath = _clif_table_path("adt")
hospitalization_filepath = _clif_table_path("hospitalization")
patient_filepath = _clif_table_path("patient")
hospital_dx_filepath = _clif_table_path("hospital_diagnosis")
labs_filepath = _clif_table_path("labs")
patient_assess_filepath = _clif_table_path("patient_assessments")
respiratory_filepath = _clif_table_path("respiratory_support")

# Read every table with the project reader, in the same order as the paths above.
adt_df = read_data(adt_filepath, file_type)
hospitalization_df = read_data(hospitalization_filepath, file_type)
patient_df = read_data(patient_filepath, file_type)
hospital_dx_df = read_data(hospital_dx_filepath, file_type)
labs_df = read_data(labs_filepath, file_type)
pat_assess_df = read_data(patient_assess_filepath, file_type)
resp_df = read_data(respiratory_filepath, file_type)


In [None]:
# Per encounter_block: first admission location, hospital length of stay, and the
# first ICU stay (in/out timestamps and length of stay). Lengths of stay are in days.
# ==============================================================================
adt_cohort = (
    cohort_df
    .join(adt_df, on="hospitalization_id", how="left")
    # Put rows in chronological order so that `.first()` below really picks the
    # earliest location. Rows without an in_dttm (no ADT match) are sorted last
    # so they cannot win over a real ADT row.
    .sort('in_dttm', nulls_last=True)
)

hosp_admission_summary = (
    adt_cohort
    .group_by('encounter_block')
    .agg([
        pl.col('in_dttm').min().alias('min_in_dttm'),
        pl.col('out_dttm').max().alias('max_out_dttm'),
        # Row order inside each group follows the sorted frame, so this is the
        # location with the earliest in_dttm (original casing is kept here).
        pl.col('location_category').first().alias('first_admission_location')
    ])
)
# NOTE(review): dt.total_days() yields whole days, so stays shorter than 24h are 0.
hosp_admission_summary = hosp_admission_summary.with_columns(
    (pl.col('max_out_dttm') - pl.col('min_in_dttm'))
    .dt.total_days()
    .alias('hospital_length_of_stay_days')
)

# Normalise casing before matching on "icu".
adt_cohort = adt_cohort.with_columns(
    pl.col("location_category").str.to_lowercase()
)

# Restrict to ICU rows.
icu_df = adt_cohort.filter(pl.col('location_category') == "icu")

# Earliest ICU in_dttm per encounter_block.
first_in = (
    icu_df
    .group_by('encounter_block')
    .agg(
        pl.col('in_dttm').min().alias('first_icu_in_dttm')
    )
)

# Join back to pick up the out_dttm that belongs to that first ICU row.
icu_summary = (
    first_in.join(
        icu_df.select(['hospitalization_id', 'in_dttm', 'out_dttm', 'encounter_block']),
        left_on=['encounter_block', 'first_icu_in_dttm'],
        right_on=['encounter_block', 'in_dttm'],
        how='left',
    )
    .rename({'out_dttm': 'first_icu_out_dttm'})
    # Several ICU rows can share the same first in_dttm (e.g. duplicated ADT rows).
    # Collapse them to a single row per encounter_block, keeping the latest
    # out_dttm, so the later join on encounter_block cannot multiply rows.
    .group_by(['encounter_block', 'first_icu_in_dttm'])
    .agg(pl.col('first_icu_out_dttm').max())
)

# ICU length of stay in whole days (out - in).
icu_summary = icu_summary.with_columns(
    (pl.col('first_icu_out_dttm') - pl.col('first_icu_in_dttm'))
    .dt.total_days()
    .alias('first_icu_los_days')
)

# Keep only the columns needed downstream.
icu_summary = icu_summary.select([
    'encounter_block', 'first_icu_in_dttm', 'first_icu_out_dttm', 'first_icu_los_days'
])


In [None]:
# Earliest IMV recorded_dttm per encounter_block (null when the block has no IMV rows).
imv_df = resp_df.filter(pl.col("device_category") == "IMV")
imv_cohort = cohort_df.join(imv_df, on="hospitalization_id", how="left")
first_recorded_dttm = imv_cohort.group_by("encounter_block").agg(
    pl.col("recorded_dttm").min().alias("first_recorded_dttm")
)

# Cohort labs restricted to the order categories of interest.
target_lab_order_categories = ["blood_gas", "cbc"]
labs_cohort = cohort_df.join(labs_df, on="hospitalization_id", how="left")
_is_target_lab = pl.col("lab_order_category").is_in(target_lab_order_categories)
labs_of_interest = labs_cohort.filter(_is_target_lab)

# Attach each encounter_block's first IMV timestamp to its labs.
labs_with_first = labs_of_interest.join(
    first_recorded_dttm, on="encounter_block", how="inner"
)

# Keep labs collected in the half-open window [first IMV, first IMV + timeframe hours).
timeframe = 24
_collected_at = pl.col("lab_collect_dttm")
_window_start = pl.col("first_recorded_dttm")
_window_end = _window_start + pl.duration(hours=timeframe)
labs_within_timeframe = labs_with_first.filter(
    (_collected_at >= _window_start) & (_collected_at < _window_end)
).unique()

In [None]:
# Cohort rows joined to patient assessments, restricted to the categories below;
# exact duplicate rows are dropped.
assessment_interest = ["gcs_total", "RASS"]
assessments_cohort = cohort_df.join(pat_assess_df, on="hospitalization_id", how="left")
_is_assessment_of_interest = pl.col("assessment_category").is_in(assessment_interest)
assessments_of_interest = assessments_cohort.filter(_is_assessment_of_interest).unique()


In [None]:
# Build the wide frame behind Table One by left-joining each source onto the cohort.
# Column names that collide with the left side receive the suffix given in each join.

# Patient-level columns (joined on patient_id).
cohort_df_1 = cohort_df.join(patient_df, on='patient_id', how='left')
# Hospitalization-level columns.
cohort_df_2 = cohort_df_1.join(
    hospitalization_df,
    on=['patient_id', 'hospitalization_id'],
    how='left',
    suffix="_hosp"
)
# Diagnoses, joined on hospitalization_id only.
# NOTE(review): if hospital_dx_df holds several rows per hospitalization, this join
# yields one row per (cohort row, diagnosis) — TODO confirm.
cohort_df_3 = cohort_df_2.join(
    hospital_dx_df,
    on=['hospitalization_id'],
    how='left',
    suffix="_dx"
)
# Labs collected in the window after first IMV.
# NOTE(review): labs_within_timeframe can hold several lab rows per encounter_block;
# stacked on the diagnosis join above, rows may multiply as diagnoses x labs — TODO confirm.
cohort_df_4 = cohort_df_3.join(
    labs_within_timeframe,
    on=['patient_id', 'hospitalization_id', 'encounter_block'],
    how='left',
    suffix="_labs"
)
# First-ICU-stay summary and hospital admission summary, both keyed on encounter_block.
cohort_df_5 = (
    cohort_df_4
    .join(icu_summary, on='encounter_block', how='left', suffix="_icu")
    .join(hosp_admission_summary, on='encounter_block', how='left', suffix="_hadm")
)
# Encounters without a first admission location are labelled 'Missing'.
cohort_df_6 = cohort_df_5.with_columns(
    pl.col('first_admission_location').fill_null('Missing')
)

# Keep only the Table One columns and drop exact duplicate rows.
# `.unique()` is called without maintain_order, so the resulting row order is not guaranteed.
final_tableone_df = cohort_df_6.select([
    'patient_id', 'hospitalization_id', 'encounter_block', 'admission_dttm',
    'discharge_dttm', 'age_at_admission', 'age_at_death', 'discharge_category', 'first_admission_location', 'admission_type_category',
    'race_category', 'ethnicity_category', 'sex_category', 'language_category', 'first_icu_los_days', 'hospital_length_of_stay_days',
    'diagnosis_code', 'diagnosis_code_format', 'diagnosis_primary', 'poa_present', 'lab_category', 'lab_value_numeric'
]).unique()
# Calendar year of admission, used for the by-year table.
final_tableone_df = final_tableone_df.with_columns(
    pl.col('admission_dttm').dt.year().alias('admission_year')
)

In [None]:
# Number of rows whose encounter_block occurs more than once (every member of a
# duplicated group is counted, not only the extra rows).
duplicates = final_tableone_df['encounter_block'].is_duplicated().sum()
print(f"Duplicate encounter_blocks: {duplicates}")

if duplicates > 0:
    print("Multiple rows per encounter_block found. Keeping last value...")
    # final_tableone_df is the output of `.unique()` without maintain_order, so its
    # row order is not guaranteed and "last" would otherwise name a different row
    # from run to run. Sorting on every column first makes the retained row
    # reproducible.
    # NOTE(review): only one row per encounter_block survives this step, so at most
    # one diagnosis / lab row per encounter reaches Table One — confirm this is intended.
    tableone_df = (
        final_tableone_df
        .sort(final_tableone_df.columns)
        .unique(subset=['encounter_block'], keep='last', maintain_order=True)
    )
else:
    tableone_df = final_tableone_df

print(f"Final tableone_df shape: {tableone_df.shape}")

In [None]:
# One row per patient with the demographic columns reported in Table One.
# `unique(subset=...)` uses the default `keep`, so which of a patient's rows is
# retained is not specified.
_patient_level_columns = [
    'patient_id',
    'race_category',
    'ethnicity_category',
    'sex_category',
    'age_at_admission',
]
patient_cohort = tableone_df.select(_patient_level_columns).unique(subset=['patient_id'])
print(f"Unique patients: {len(patient_cohort):,}")

In [None]:
# Load ICD-10 descriptions - CORRECTED VERSION
def load_icd_descriptions(filepath):
    """
    Read an ICD-10 code/description text file into a dictionary.

    Each non-blank line is expected to look like ``CODE  DESCRIPTION``: the first
    whitespace-delimited token is the code and the remainder of the line is the
    description. Lines with no description are ignored. If a code appears more
    than once, the last occurrence wins.

    Parameters
    ----------
    filepath : str
        Path to the UTF-8 encoded text file.

    Returns
    -------
    dict
        Mapping of code (exactly as written in the file) to description.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        # split(None, 1) gives [] for a blank line and a single field for a
        # code-only line; both are dropped by the length check below.
        fields_per_line = (raw_line.strip().split(None, 1) for raw_line in handle)
        return {
            fields[0].strip(): fields[1].strip()
            for fields in fields_per_line
            if len(fields) >= 2
        }

# Helper function to clean ICD codes (remove dots)
def clean_icd_code(code):
    """
    Normalise an ICD code for lookup in the dot-less description dictionary.

    Surrounding whitespace and dots are removed and letters are upper-cased, so
    inputs such as ``" a41.9 "`` and ``"A41.9"`` both become ``"A419"``.

    Parameters
    ----------
    code : Any
        The ICD code; non-string values are converted with ``str()``.

    Returns
    -------
    str or None
        The normalised code, or ``None`` when `code` is ``None``.
    """
    if code is None:
        return None
    return str(code).strip().replace(".", "").upper()

# Load the code -> description lookup once; make_table_one_optimized reads this
# notebook-level dictionary when labelling diagnoses.
icd_descriptions = load_icd_descriptions("../icd10orderfiles/icd10cm_codes_2026.txt")
print(f"Loaded {len(icd_descriptions):,} ICD-10 code descriptions")

# Smoke test: a dotted code should resolve after cleaning; prints 'Not found' otherwise.
test_code = "A41.9"
clean_test = clean_icd_code(test_code)
print(f"Test: {test_code} -> {clean_test} -> {icd_descriptions.get(clean_test, 'Not found')}")

In [None]:

# Table One for Patient Assessments (e.g., for assessments_of_interest)
def make_table_one_patient_assessments(df, value_col="numerical_value", category_col="assessment_category"):
    """
    Summarise patient assessments by category as a two-column table.

    Rows where either `category_col` or `value_col` is null are ignored. For each
    remaining category (most frequent first) two rows are emitted:
      - ``N: <category>`` with the row count
      - ``  <category> value, median [Q1, Q3]`` computed on `value_col`

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing `value_col` and `category_col`.
    value_col : str
        Column holding the assessment value to summarise.
    category_col : str
        Column holding the assessment category.

    Returns
    -------
    pl.DataFrame
        String columns "Variable" and "Overall". The schema is fixed so that an
        empty result can still be concatenated vertically with other tables.
    """
    has_category_and_value = pl.col(category_col).is_not_null() & pl.col(value_col).is_not_null()
    grouped = (
        df
        .filter(has_category_and_value)
        .group_by(category_col, maintain_order=True)
        .agg([
            # pl.len() replaces the deprecated pl.count() for counting rows per group.
            pl.len().alias("n"),
            pl.col(value_col).median().alias("median"),
            pl.col(value_col).quantile(0.25).alias("q1"),
            pl.col(value_col).quantile(0.75).alias("q3"),
        ])
        .sort("n", descending=True)
    )

    # Two output rows per category: the count, then median [Q1, Q3].
    rows = []
    for cat, n, median, q1, q3 in grouped.rows():
        cat_str = str(cat)
        rows.append((f"N: {cat_str}", f"{n:,}"))
        rows.append((f"  {cat_str} value, median [Q1, Q3]", f"{median:.1f} [{q1:.1f}, {q3:.1f}]"))

    # Without an explicit schema, empty lists would produce Null-typed columns.
    return pl.DataFrame(
        {"Variable": [r[0] for r in rows], "Overall": [r[1] for r in rows]},
        schema={"Variable": pl.Utf8, "Overall": pl.Utf8},
    )

# Build the assessments summary from the filtered assessments frame, using the
# function's default value and category columns.

tbl_patient_assessments = make_table_one_patient_assessments(assessments_of_interest)

# It has the same "Variable"/"Overall" layout as the main Table One, so the two
# can be stacked, e.g.:
# tbl_final = pl.concat([tbl_overall, tbl_patient_assessments], how="vertical")



In [None]:
# Display the assessments summary table.
tbl_patient_assessments

In [None]:

def make_table_one_optimized(df, patient_demographics, id_col='encounter_block', lab_window_hours=None):
    """
    Build a two-column ("Variable", "Overall") Table One as a Polars DataFrame.

    Sections, in output order: cohort size, age, race/ethnicity/sex, top-5 primary
    and secondary diagnoses, first admission location and admission type, lengths
    of stay, and per-category lab summaries.

    Parameters
    ----------
    df : pl.DataFrame
        Encounter-level frame; its height is reported as the number of encounter
        blocks. `first_admission_location` is required. The other columns used
        here (`hospital_id`, the continuous columns, `diagnosis_code` /
        `diagnosis_primary`, `admission_type_category`, `lab_category` /
        `lab_value_numeric`) are optional; the matching section is skipped, or
        reported as "N/A" for labs, when they are absent.
    patient_demographics : pl.DataFrame
        Patient-level frame; its height is the patient count and the denominator
        for race, ethnicity and sex. `age_at_admission` is summarised from this
        frame rather than from `df`.
    id_col : str
        Currently unused; kept so existing callers keep working.
    lab_window_hours : int or float, optional
        Number of hours shown in the lab section label. Defaults to the
        notebook-level `timeframe` variable.

    Returns
    -------
    pl.DataFrame
        One row per reported statistic, with string columns "Variable" and "Overall".

    Notes
    -----
    Reads the notebook-level `icd_descriptions` dictionary and `clean_icd_code`
    helper to label diagnosis codes.
    """
    if lab_window_hours is None:
        # Fall back to the notebook-level window (in hours) used to select the labs.
        lab_window_hours = timeframe

    rows = []

    N_enc = df.height
    N_pat = patient_demographics.height
    N_hospitals = df['hospital_id'].n_unique() if "hospital_id" in df.columns else None

    # Pre-compute median and quartiles for every continuous column that is present.
    continuous_stats = {}
    for col in ['age_at_admission', 'age_at_death', 'first_icu_los_days', 'hospital_length_of_stay_days']:
        if col in df.columns:
            # Age at admission is a patient-level statistic; the rest are encounter-level.
            data = (
                patient_demographics[col].drop_nulls()
                if col == 'age_at_admission'
                else df[col].drop_nulls()
            )
            if data.len() > 0:
                continuous_stats[col] = {
                    'median': data.median(),
                    'q1': data.quantile(0.25),
                    'q3': data.quantile(0.75)
                }

    def cat_n_pct_fast(data, col, title, denominator):
        """Append one "n (pct%)" row per level of `col`, most frequent first."""
        level_counts = (
            data
            .group_by(col, maintain_order=True)
            .len()
            # maintain_order keeps tied levels in first-appearance order.
            .sort("len", descending=True, maintain_order=True)
        )
        for lvl, cnt in level_counts.rows():
            pct = 100 * cnt / denominator
            # Null and NaN levels are both reported as "Missing".
            is_missing = lvl is None or (isinstance(lvl, float) and lvl != lvl)
            lvl_str = "Missing" if is_missing else str(lvl)
            rows.append((f"  {title}: {lvl_str}", f"{cnt:,} ({pct:.1f}%)"))

    def add_top_diagnoses(primary_flag, heading, prefix, top_n=5):
        """Append the `top_n` most frequent diagnosis codes with the given primary flag."""
        dx_counts = (
            df.filter(pl.col("diagnosis_primary") == primary_flag)
              .group_by("diagnosis_code", maintain_order=True)
              .len()
              .sort("len", descending=True, maintain_order=True)
        )
        if dx_counts.height == 0:
            rows.append((heading, "N/A"))
            return
        # Percentages are relative to all rows carrying this flag, not only the top N.
        total = int(dx_counts["len"].sum())
        rows.append((heading, ""))
        for rank, (dx_code, dx_cnt) in enumerate(dx_counts.head(top_n).rows(), start=1):
            pct = 100 * dx_cnt / max(1, total)
            desc = icd_descriptions.get(clean_icd_code(dx_code), "Description not found")
            rows.append((f"  {prefix} #{rank}: {dx_code} - {desc}", f"{dx_cnt:,} ({pct:.1f}%)"))

    # -------------------------------------------------------------------------
    # 1. Size
    # -------------------------------------------------------------------------
    rows.append(("N: Encounter blocks", f"{N_enc:,}"))
    rows.append(("N: Unique patients", f"{N_pat:,}"))
    if N_hospitals is not None:
        rows.append(("N: Hospitals", f"{N_hospitals:,}"))

    # -------------------------------------------------------------------------
    # 2. Demographics
    # -------------------------------------------------------------------------
    for col, label in [
        ('age_at_admission', 'Age at admission'),
        ('age_at_death', 'Age at death'),
    ]:
        if col in continuous_stats:
            s = continuous_stats[col]
            rows.append((
                f"{label}, median [Q1, Q3]",
                f"{s['median']:.0f} [{s['q1']:.0f}, {s['q3']:.0f}]"
            ))

    cat_n_pct_fast(patient_demographics, 'race_category', 'Race', N_pat)
    cat_n_pct_fast(patient_demographics, 'ethnicity_category', 'Ethnicity', N_pat)
    cat_n_pct_fast(patient_demographics, 'sex_category', 'Sex', N_pat)

    # -------------------------------------------------------------------------
    # 3. Diagnoses (primary: diagnosis_primary == 1, secondary: diagnosis_primary == 0)
    # -------------------------------------------------------------------------
    if "diagnosis_code" in df.columns and "diagnosis_primary" in df.columns:
        add_top_diagnoses(1, "Top 5 primary diagnoses", "PrimDx")
        add_top_diagnoses(0, "Top 5 secondary diagnoses", "SecDx")

    # -------------------------------------------------------------------------
    # 4. Admission and Location
    # -------------------------------------------------------------------------
    cat_n_pct_fast(df, 'first_admission_location', 'First admission location', N_enc)
    if 'admission_type_category' in df.columns:
        cat_n_pct_fast(df, 'admission_type_category', 'Admission type', N_enc)

    # -------------------------------------------------------------------------
    # 5. Length of Stay (uses the pre-computed stats)
    # -------------------------------------------------------------------------
    for col, label in [
        ('first_icu_los_days', 'ICU length of stay (days)'),
        ('hospital_length_of_stay_days', 'Hospital length of stay (days)')
    ]:
        if col in continuous_stats:
            s = continuous_stats[col]
            rows.append((
                f"{label}, median [Q1, Q3]",
                f"{s['median']:.1f} [{s['q1']:.1f}, {s['q3']:.1f}]"
            ))

    # -------------------------------------------------------------------------
    # 6. Labs within the time window after IMV (per lab category)
    # -------------------------------------------------------------------------
    lab_heading = f"Lab values within {lab_window_hours} hours of IMV"
    if 'lab_category' in df.columns and 'lab_value_numeric' in df.columns:
        lab_cat = "lab_category"
        lab_stats = (
            df
            # Only rows that have both a category and a numeric value contribute.
            .filter(pl.col(lab_cat).is_not_null() & pl.col("lab_value_numeric").is_not_null())
            .group_by(lab_cat, maintain_order=True)
            .agg([
                # pl.len() replaces the deprecated pl.count().
                pl.len().alias("n"),
                pl.col("lab_value_numeric").median().alias("median"),
                pl.col("lab_value_numeric").quantile(0.25, "nearest").alias("q1"),
                pl.col("lab_value_numeric").quantile(0.75, "nearest").alias("q3")
            ])
            .sort("n", descending=True, maintain_order=True)
        )

        if lab_stats.height > 0:
            rows.append((lab_heading, ""))
            for cat, n, median, q1, q3 in lab_stats.rows():
                rows.append((f"  {cat}: count", f"{n:,}"))
                rows.append((f"  {cat}: median [Q1, Q3]", f"{median:.1f} [{q1:.1f}, {q3:.1f}]"))
        else:
            rows.append((lab_heading, "N/A"))
    else:
        rows.append((lab_heading, "N/A"))

    # Assemble the Polars DataFrame.
    return pl.DataFrame({"Variable": [r[0] for r in rows], "Overall": [r[1] for r in rows]})



# Overall Table One, with the patient-assessment rows stacked underneath.
tbl_overall = make_table_one_optimized(tableone_df, patient_cohort)
tbl_final = pl.concat([tbl_overall, tbl_patient_assessments], how="vertical")

# Persist the combined table.
tbl_final.write_csv("../output/final/table_one_overall_calc.csv")


# ============================================================================
# Table One by admission year
# ============================================================================

if 'admission_year' in tableone_df.columns:
    _banner = "=" * 80
    print("\n" + _banner)
    print("GENERATING TABLE ONE BY YEAR (OPTIMIZED)")
    print(_banner)

    # Distinct non-null admission years, ascending.
    years = sorted(
        y for y in tableone_df['admission_year'].unique().to_list() if y is not None
    )
    print(f"Years found: {years}")

    # Row labels of the overall table define the row order of the wide table.
    var_order = tbl_overall["Variable"].to_list()
    results = {"Overall": tbl_overall["Overall"].to_list()}

    for yr in years:
        # Encounters admitted in this year and the patients behind them.
        df_year = tableone_df.filter(pl.col("admission_year") == yr)
        pat_year = patient_cohort.filter(pl.col('patient_id').is_in(df_year['patient_id'].implode()))

        tbl_year = make_table_one_optimized(df_year, pat_year)
        # Align this year's values to the overall row labels; labels that do not
        # occur in this year's table are left blank.
        yr_map = dict(zip(tbl_year["Variable"].to_list(), tbl_year["Overall"].to_list()))
        results[str(int(yr))] = [yr_map.get(v, "") for v in var_order]

    # One column per entry in `results`, preceded by the row labels.
    table_by_year = pl.DataFrame({"Variable": var_order, **results})

    # Saving the by-year table is currently disabled:
    # table_by_year.write_csv("../output/final/table_one_by_year.csv")
    # print("Saved: ../output/final/table_one_by_year.csv")



In [None]:

# gcs & rass 
