To do:

* Inputs - 
* Hospitalization_ids (required csv path)
* If cohort_ids is not given, build the base cohort from all hospitalization_ids in the ADT table that have at least one ICU stay.
* Base path to all the clif tables
* Optional start and stop times in the cohort_ids.csv (dates)
* Output: n_clinical_events --> the number of times each _category variable is recorded.

In [None]:
import os
import sys
import logging
import duckdb
import pandas as pd

In [15]:
# ---------------------------------------------------------------------
# Logging Configuration
# ---------------------------------------------------------------------
# Every record at INFO level or above is sent to two places: the
# console (stdout) and 'tableone.log', which is opened in append mode
# so earlier runs are kept.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(stream=sys.stdout),
        logging.FileHandler(filename='tableone.log', mode='a'),
    ],
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Logger used by the functions and cells below.
logger = logging.getLogger(__name__)

In [4]:
# -------------------------------------------------------------
# 2. Helper functions to read tables via DuckDB
# -------------------------------------------------------------
def load_clif_table(con, table_name, base_path, file_format='parquet'):
    """
    Load one CLIF table from disk into the given DuckDB connection.

    The file ``<base_path>/<table_name>.<file_format>`` is read and
    materialised as a table named ``table_name`` (replacing any existing
    table with that name).

    Args:
        con: Open DuckDB connection that will own the new table.
        table_name: Name of the table; also the file's base name.
        base_path: Directory that contains the table files.
        file_format: File extension to look for (default ``'parquet'``).

    Raises:
        FileNotFoundError: If the expected file does not exist.
        Exception: Any error raised by DuckDB while reading the file is
            logged and re-raised unchanged.
    """
    try:
        file_path = os.path.join(base_path, f"{table_name}.{file_format}")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"{file_path} not found.")

        logger.info(f"Loading table {table_name} from {file_path}")

        # Double any embedded quote characters so that a path such as
        # "/data/o'brien/..." (or an unusual table name) cannot end the
        # SQL literal/identifier early and break the statement.
        sql_path = file_path.replace("'", "''")
        sql_table = table_name.replace('"', '""')
        con.execute(f"""
            CREATE OR REPLACE TABLE "{sql_table}" AS
            SELECT * FROM '{sql_path}'
        """)
        logger.info(f"Successfully loaded {table_name}")
    except Exception as e:
        # exc_info keeps the traceback in the log; the caller still
        # receives the original exception.
        logger.error(f"Failed to load {table_name}: {str(e)}", exc_info=True)
        raise

In [18]:
# ---------------------------------------------------------------------
# Main Function: create_table_one (and its private helpers)
# ---------------------------------------------------------------------
_TABLEONE_COLUMNS = ["Variable", "Count", "Percent", "Mean[SD]", "Median", "IQR"]

# Tables that must be present under base_path.
_TABLES_NEEDED = (
    "clif_adt",
    "clif_hospitalization",
    "clif_labs",
    "clif_medication_admin_continuous",
    "clif_patient",
    "clif_respiratory_support",
    "clif_vitals",
)

# Tables that are restricted to the cohort through hospitalization_id.
_TABLES_WITH_HOSP_ID = (
    "clif_adt",
    "clif_labs",
    "clif_medication_admin_continuous",
    "clif_respiratory_support",
    "clif_vitals",
)


def _tableone_row(variable, count=None, percent=None, mean_sd=None,
                  median=None, iqr=None):
    """Return one Table One row as a dict keyed by the output columns."""
    return {
        "Variable": variable,
        "Count": count,
        "Percent": percent,
        "Mean[SD]": mean_sd,
        "Median": median,
        "IQR": iqr,
    }


def _empty_tableone():
    """Return the one-row placeholder used when the cohort has no matches."""
    return pd.DataFrame(
        [_tableone_row("No hospitalizations found!")],
        columns=_TABLEONE_COLUMNS,
    )


def _percent(count, denominator):
    """Format count/denominator as a percentage string with one decimal."""
    return f"{(count / denominator) * 100.0:.1f}"


def _categorical_rows(con, label, query, denominator):
    """Run a (category, count) query and return one n(%) row per category."""
    df = con.execute(query).fetchdf()
    out = []
    # zip over the columns (instead of iterrows) so the integer counts are
    # not upcast to float and written out as e.g. "12.0".
    for category, count in zip(df.iloc[:, 0], df.iloc[:, 1]):
        count = int(count)
        out.append(_tableone_row(
            f"{label}: {category}",
            count=count,
            percent=_percent(count, denominator),
        ))
    return out


def _first_value_rows(con, label, table, category_col, time_col, value_col,
                      extra_filter=""):
    """Summarise the first recorded value per hospitalization and category.

    For every (hospitalization_id, category) pair the earliest row by
    ``time_col`` is kept; the kept values are then summarised per category
    as count, median and IQR.
    """
    query = f"""
        WITH ranked AS (
            SELECT
                hospitalization_id,
                {category_col} AS category,
                {value_col} AS value,
                ROW_NUMBER() OVER (
                    PARTITION BY hospitalization_id, {category_col}
                    ORDER BY {time_col}
                ) AS rn
            FROM {table}
            WHERE {category_col} IS NOT NULL
              AND {value_col} IS NOT NULL
              {extra_filter}
        )
        SELECT category, value
        FROM ranked
        WHERE rn = 1
    """
    df_first = con.execute(query).fetchdf()
    if df_first.empty:
        return []

    grouped = df_first.groupby("category")["value"]
    summary = pd.DataFrame({
        "n_obs": grouped.count(),
        "median": grouped.median(),
        "iqr": grouped.quantile(0.75) - grouped.quantile(0.25),
    })
    return [
        _tableone_row(
            f"{label}: {category}",
            count=int(n_obs),
            median=f"{med:.2f}",
            iqr=f"{spread:.2f}",
        )
        for category, n_obs, med, spread in zip(
            summary.index, summary["n_obs"], summary["median"], summary["iqr"]
        )
    ]


def _respiratory_rows(con):
    """Summarise every numeric respiratory-support column over all rows.

    Best effort: any failure is logged as a warning and yields no rows.
    """
    try:
        df_resp = con.execute(
            "SELECT * FROM clif_respiratory_support_cohort"
        ).fetchdf()
        out = []
        for col in df_resp.columns:
            # hospitalization_id is an identifier, not a measurement, so it
            # is skipped even if it happens to be stored as a number.
            if col == "hospitalization_id":
                continue
            if not pd.api.types.is_numeric_dtype(df_resp[col]):
                continue
            values = df_resp[col].dropna()
            if values.empty:
                continue
            spread = values.quantile(0.75) - values.quantile(0.25)
            out.append(_tableone_row(
                f"Resp: {col}",
                count=int(values.count()),
                median=f"{values.median():.2f}",
                iqr=f"{spread:.2f}",
            ))
        return out
    except Exception as e:
        logger.warning(f"Error summarizing respiratory support numeric columns: {str(e)}")
        return []


def _create_cohort_tables(con, hospitalization_ids):
    """Create the ``*_cohort`` tables; return False if the base cohort fails."""
    # The IDs are handed to DuckDB as data (a registered DataFrame) rather
    # than spliced into the SQL text, so quotes inside an ID cannot break or
    # alter the statement. Both sides are compared as text.
    id_frame = pd.DataFrame(
        {"hospitalization_id": [str(h) for h in hospitalization_ids]}
    )
    try:
        con.register("cohort_id_input", id_frame)
        con.execute("""
            CREATE OR REPLACE TABLE clif_hospitalization_cohort AS
            SELECT *
            FROM clif_hospitalization
            WHERE CAST(hospitalization_id AS VARCHAR) IN (
                SELECT c.hospitalization_id
                FROM cohort_id_input c
            )
        """)
    except Exception as e:
        logger.error(f"Could not create clif_hospitalization_cohort: {str(e)}")
        return False

    for tbl in _TABLES_WITH_HOSP_ID:
        try:
            con.execute(f"""
                CREATE OR REPLACE TABLE {tbl}_cohort AS
                SELECT *
                FROM {tbl}
                WHERE hospitalization_id IN (
                    SELECT hospitalization_id
                    FROM clif_hospitalization_cohort
                )
            """)
        except Exception as e:
            # Kept as a warning: the sections that need this table will
            # raise when they query it.
            logger.warning(f"Could not create {tbl}_cohort: {str(e)}")

    try:
        con.execute("""
            CREATE OR REPLACE TABLE clif_patient_cohort AS
            SELECT p.*
            FROM clif_patient p
            JOIN (
                SELECT DISTINCT patient_id
                FROM clif_hospitalization_cohort
            ) h ON p.patient_id = h.patient_id
        """)
        logger.info("Successfully created clif_patient_cohort via patient_id join.")
    except Exception as e:
        logger.warning(f"Could not create clif_patient_cohort: {str(e)}")

    return True


def _build_table_one(con, hospitalization_ids, base_path):
    """Load the tables, build the cohort and assemble the Table One rows."""
    try:
        for tbl in _TABLES_NEEDED:
            load_clif_table(con, tbl, base_path)
    except Exception as e:
        logger.error(f"Error loading tables: {str(e)}")
        return None

    if not _create_cohort_tables(con, hospitalization_ids):
        return None

    # A) Basic cohort size -- the denominator for every percentage below.
    total_n = con.execute("""
        SELECT COUNT(DISTINCT hospitalization_id) AS total_n
        FROM clif_hospitalization_cohort
    """).fetchone()[0]
    if total_n == 0:
        logger.warning("No matching hospitalizations found in the data after filtering.")
        return _empty_tableone()

    # B) Age. fetchdf() turns SQL NULL into NaN, which still formats
    #    (as "nan") when no age / only one age is available.
    age_stats = con.execute("""
        SELECT
            AVG(age_at_admission) AS mean_age,
            STDDEV_SAMP(age_at_admission) AS sd_age
        FROM clif_hospitalization_cohort
        WHERE age_at_admission IS NOT NULL
    """).fetchdf()
    mean_age = age_stats["mean_age"].iloc[0]
    sd_age = age_stats["sd_age"].iloc[0]

    # C) Female n(%)
    female_count = con.execute("""
        WITH female AS (
            SELECT h.hospitalization_id
            FROM clif_hospitalization_cohort h
            JOIN clif_patient_cohort p
              ON h.patient_id = p.patient_id
            WHERE p.sex_category = 'Female'
        )
        SELECT COUNT(DISTINCT hospitalization_id) AS female_count
        FROM female
    """).fetchone()[0]

    # D) / E) Race and ethnicity distributions
    race_rows = _categorical_rows(con, "Race", """
        SELECT p.race_category, COUNT(DISTINCT h.hospitalization_id) AS count_race
        FROM clif_hospitalization_cohort h
        JOIN clif_patient_cohort p
          ON h.patient_id = p.patient_id
        GROUP BY p.race_category
    """, total_n)
    ethnicity_rows = _categorical_rows(con, "Ethnicity", """
        SELECT p.ethnicity_category, COUNT(DISTINCT h.hospitalization_id) AS count_ethnicity
        FROM clif_hospitalization_cohort h
        JOIN clif_patient_cohort p
          ON h.patient_id = p.patient_id
        GROUP BY p.ethnicity_category
    """, total_n)

    # F) Hospital mortality. Counted per distinct hospitalization so the
    #    numerator is on the same basis as total_n.
    mortality_count = con.execute("""
        SELECT COUNT(DISTINCT hospitalization_id) AS mortality_count
        FROM clif_hospitalization_cohort
        WHERE discharge_category = 'Expired'
    """).fetchone()[0]

    # G) First location (earliest ADT row per hospitalization)
    first_location_rows = _categorical_rows(con, "First Location", """
        WITH first_adt AS (
            SELECT
                hospitalization_id,
                location_category,
                ROW_NUMBER() OVER (
                    PARTITION BY hospitalization_id
                    ORDER BY in_dttm
                ) AS rn
            FROM clif_adt_cohort
        )
        SELECT location_category, COUNT(DISTINCT hospitalization_id) AS count_loc
        FROM first_adt
        WHERE rn = 1
        GROUP BY location_category
    """, total_n)

    # H) Mechanically ventilated (any IMV row)
    vent_count = con.execute("""
        SELECT COUNT(DISTINCT hospitalization_id) AS vent_count
        FROM clif_respiratory_support_cohort
        WHERE device_category = 'IMV'
    """).fetchone()[0]

    # I) Initial mode among the ventilated; percentages use vent_count.
    #    When vent_count is 0 the query returns no rows, so nothing divides.
    initial_mode_rows = _categorical_rows(con, "Initial Mode", """
        WITH vent AS (
            SELECT
                hospitalization_id,
                mode_category,
                recorded_dttm,
                ROW_NUMBER() OVER (
                    PARTITION BY hospitalization_id
                    ORDER BY recorded_dttm
                ) AS rn
            FROM clif_respiratory_support_cohort
            WHERE device_category = 'IMV'
        )
        SELECT mode_category, COUNT(DISTINCT hospitalization_id) AS count_mode
        FROM vent
        WHERE rn = 1
        GROUP BY mode_category
    """, vent_count)

    # J) / K) / L) First recorded lab, continuous medication and vital values
    lab_rows = _first_value_rows(
        con, "Lab", "clif_labs_cohort",
        "lab_category", "lab_collect_dttm", "lab_value_numeric",
    )
    med_rows = _first_value_rows(
        con, "Med", "clif_medication_admin_continuous_cohort",
        "med_category", "admin_dttm", "med_dose",
        extra_filter="AND med_dose > 0",
    )
    vital_rows = _first_value_rows(
        con, "Vital", "clif_vitals_cohort",
        "vital_category", "recorded_dttm", "vital_value",
    )

    # M) Respiratory support numeric summaries over all rows
    resp_rows = _respiratory_rows(con)

    # Assemble the rows in the order they appear in the output table.
    rows = [
        _tableone_row("Total N", count=total_n),
        _tableone_row("Age (years)", mean_sd=f"{mean_age:.2f} [{sd_age:.2f}]"),
        _tableone_row("Female", count=female_count,
                      percent=_percent(female_count, total_n)),
    ]
    rows.extend(race_rows)
    rows.extend(ethnicity_rows)
    rows.append(_tableone_row("Hospital Mortality (Expired)",
                              count=mortality_count,
                              percent=_percent(mortality_count, total_n)))
    rows.append(_tableone_row("Mechanically Ventilated (IMV)",
                              count=vent_count,
                              percent=_percent(vent_count, total_n)))
    rows.extend(first_location_rows)
    rows.extend(initial_mode_rows)
    rows.extend(lab_rows)
    rows.extend(med_rows)
    rows.extend(vital_rows)
    rows.extend(resp_rows)

    return pd.DataFrame(rows, columns=_TABLEONE_COLUMNS)


def create_table_one(hospitalization_ids, base_path):
    """
    Generate Table One for the given hospitalization_ids from the Parquet
    tables in base_path, keeping the 'clif_' prefix in table names.

    1. Labs, continuous medications and vitals are summarised on the
       first recorded value per hospitalization and category.
    2. The respiratory support table gets an all-rows numeric summary.
    3. Every summary is computed on the cohort-filtered tables only, so it
       is restricted to the input hospitalization_ids.

    Args:
        hospitalization_ids: Iterable of hospitalization IDs. IDs are
            matched as text (``str(id)`` against the column cast to
            VARCHAR).
        base_path: Directory containing the ``clif_*.parquet`` files.

    Returns:
        pd.DataFrame | None: One DataFrame holding the whole Table One in
        row-wise format with columns Variable, Count, Percent, Mean[SD],
        Median and IQR. A single "No hospitalizations found!" row is
        returned when no ID matches. ``None`` is returned when no IDs are
        given, a table cannot be loaded, or the hospitalization cohort
        cannot be created.

    Raises:
        Exception: Errors raised by DuckDB while running the summary
            queries propagate to the caller.
    """
    # list() makes the emptiness test safe for arrays/Series as well, where
    # plain truthiness raises for more than one element.
    ids = list(hospitalization_ids) if hospitalization_ids is not None else []
    if not ids:
        logger.error("No hospitalization_ids provided.")
        return None

    # Connect to DuckDB in memory; always release it, also on errors.
    con = duckdb.connect(database=':memory:')
    try:
        return _build_table_one(con, ids, base_path)
    finally:
        con.close()

In [6]:
# Read the cohort definition from the working directory; the CSV must
# provide a 'hospitalization_id' column.
cohort_df = pd.read_csv("cohort_ids.csv")
# Plain Python list of the IDs in file order (duplicates and missing values
# are kept as read).
# NOTE(review): not referenced by the other cells visible here -- confirm
# whether it is still needed.
sample_cohort_ids = cohort_df["hospitalization_id"].tolist()

In [17]:
# Input locations. The defaults are the original hard-coded values; set the
# CLIF_BASE_PATH / CLIF_COHORT_IDS_CSV environment variables to run the
# notebook on another machine without editing this cell.
path_to_clif = os.environ.get(
    "CLIF_BASE_PATH",
    "/Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif-2.0.0",
)
path_to_ids = os.environ.get("CLIF_COHORT_IDS_CSV", "cohort_ids.csv")

In [None]:
# 1) Resolve the inputs. For command-line use, the notebook values can be
#    replaced with:
# hosp_csv = sys.argv[1]       # e.g. "/path/to/hosp_ids.csv"
# base_path = sys.argv[2]      # e.g. "/path/to/clif_data"
hosp_csv = path_to_ids
base_path = path_to_clif

# 2) Read the CSV of hospitalization IDs
try:
    # Read the IDs as text: with dtype inference, IDs such as "00123" lose
    # their leading zeros, and a single blank cell turns the whole column
    # into floats ("123.0"), so the IDs no longer match the CLIF tables.
    df_hids = pd.read_csv(hosp_csv, dtype={"hospitalization_id": str})
    if "hospitalization_id" not in df_hids.columns:
        raise ValueError("CSV must have a column named 'hospitalization_id'.")
    hospitalization_ids = df_hids["hospitalization_id"].dropna().unique().tolist()
except Exception as e:
    # exc_info keeps the traceback in the log before exiting.
    logger.error(f"Could not read hospitalization_ids from {hosp_csv}: {str(e)}", exc_info=True)
    sys.exit(1)

# 3) Call create_table_one
try:
    df_tableone = create_table_one(hospitalization_ids, base_path)
    if df_tableone is not None and not df_tableone.empty:
        # 4) Save to CSV
        out_csv = "tableone.csv"
        df_tableone.to_csv(out_csv, index=False)
        logger.info(f"Table One saved to {out_csv}")
    else:
        logger.warning("Table One returned empty or None.")
except Exception as exc:
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    sys.exit(1)