In [None]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; the connection
# settings are read from os.environ further down.
load_dotenv()

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import os

# Connection settings come from the environment; a missing variable raises
# KeyError right here rather than later at connect time.
HOST_IP, DATABASE_USER, DATABASE_PASSWORD, DATABASE_PORT = (
    os.environ['HOST_IP'],
    os.environ['DATABASE_USER'],
    os.environ['DATABASE_PASSWORD'],
    os.environ['DATABASE_PORT'],
)

# PostgreSQL (psycopg2 driver) URL for the "mimiciv" database.
connection_url = URL.create(
    drivername="postgresql+psycopg2",
    database="mimiciv",
    host=HOST_IP,
    port=DATABASE_PORT,
    username=DATABASE_USER,
    password=DATABASE_PASSWORD,
)

# Shared engine used by every query in this notebook.
engine = create_engine(connection_url)

In [None]:
import pandas as pd
from sqlalchemy import text

# Preview ten rows of the admissions table (no ORDER BY, so which ten is up to the database).
query = text("SELECT * FROM mimiciv.mimiciv_hosp.admissions LIMIT 10")
pd.read_sql_query(query, engine)

In [None]:
# Introspect the database layout.
# The inner query lists, for every table outside the information_schema and
# pg_catalog schemas, its column names ordered by ordinal position. The outer
# query groups those tables by schema, so each result row carries one JSON
# object shaped like {schema_name: [{table_name: [column, ...]}, ...]}.
query = text("""
SELECT json_build_object(
    schema_name, json_agg(
        json_build_object(
            table_name, column_names
        )
    )
)
FROM (
    SELECT 
        t.table_schema as schema_name, 
        t.table_name, 
        json_agg(c.column_name ORDER BY c.ordinal_position) as column_names
    FROM information_schema.tables t
    INNER JOIN information_schema.columns c 
        ON t.table_schema = c.table_schema AND t.table_name = c.table_name
    WHERE t.table_schema NOT IN ('information_schema', 'pg_catalog')
    GROUP BY t.table_schema, t.table_name
) AS sub
GROUP BY schema_name;
""")

# Execute the query; `result` holds one row per schema (see GROUP BY schema_name above).
with engine.connect() as con:
    result = con.execute(query).fetchall()

In [None]:
# Each result row has a single column: a {schema: [{table: [columns]}, ...]} object.
schemas_with_tables = [row[0] for row in result]

# Collapse everything into {schema: {table: [columns]}}.
database_structure = {}
for schema_entry in schemas_with_tables:
    for schema_name, table_entries in schema_entry.items():
        # Create the schema bucket on first sight, then fold every
        # single-table mapping into it.
        tables_in_schema = database_structure.setdefault(schema_name, {})
        for table_entry in table_entries:
            tables_in_schema.update(table_entry)

database_structure

In [None]:
def to_clean_records(dataframe):
    """Convert a DataFrame into a list of per-row dicts without null fields.

    Each row becomes a ``{column: value}`` dict; entries whose value is
    null (as judged by ``Series.dropna``) are left out of that row's dict.
    """
    def _row_to_dict(row):
        return row.dropna().to_dict()

    per_row_dicts = dataframe.apply(_row_to_dict, axis=1)
    return per_row_dicts.tolist()

In [None]:
subject_id = 10000032
hadm_id = 29079034
# subject_id = 10005909
# hadm_id = 20199380

# Base row: the admission record for this subject / admission pair.
query = text(
    "select * from mimiciv.mimiciv_hosp.admissions where subject_id = :subject and hadm_id = :hadm;"
).bindparams(subject=subject_id, hadm=hadm_id)
hospital_stay = pd.read_sql_query(query, engine)

# Tables that have a hadm_id column but are not attached to the stay.
tables_to_ignore = ['admissions', 'drgcodes', 'emar', 'hcpcsevents', 'pharmacy']

# Columns stripped from every fetched table and from the admission row itself.
hosp_cols_to_remove = [
    'subject_id', 'hadm_id', 'seq_num',
    'emar_id', 'emar_seq', 'pharmacy_id',
    'enter_provider_id', 'labevent_id', 'order_provider_id', 'admit_provider_id',
    'storetime', 'insurance', 'edregtime', 'edouttime',
    'anchor_year_group', 'hospital_expire_flag',
    'specimen_id', 'status', 'entertime', 'verifiedtime',
    'expiration_value', 'expiration_unit', 'expiration_date',
    'order_status',
    'gsn', 'ndc', 'formulary_drug_cd', 'poe_seq',
    'microevent_id', 'micro_specimen_id', 'spec_itemid', 'test_seq', 'test_itemid',
]

for table_name, table_columns in database_structure['mimiciv_hosp'].items():
    # Only tables keyed by hadm_id (and not explicitly ignored) are attached.
    if 'hadm_id' not in table_columns or table_name in tables_to_ignore:
        continue

    print(table_name)
    table_query = text(
        f"select * from mimiciv.mimiciv_hosp.{table_name} where hadm_id = :hadm_id"
    ).bindparams(hadm_id=hadm_id)
    table_df = pd.read_sql(table_query, engine)
    table_df = table_df.drop(
        columns=[name for name in hosp_cols_to_remove if name in table_df.columns],
        errors='ignore',
    )

    # Store the table's rows as one list-valued cell on the single admission row.
    hospital_stay[table_name] = [to_clean_records(table_df)]

hospital_stay = hospital_stay.drop(
    columns=[name for name in hosp_cols_to_remove if name in hospital_stay.columns],
    errors='ignore',
)

In [None]:
# additional tables from subject id
additional_patient_tables = ['omr', 'patients']

for table in additional_patient_tables:
    print(table)
    patient_query = text(
        f"select * from mimiciv.mimiciv_hosp.{table} where subject_id = :subject_id"
    ).bindparams(subject_id=subject_id)
    patient_df = pd.read_sql(patient_query, engine)
    patient_df = patient_df.drop(
        columns=[name for name in hosp_cols_to_remove if name in patient_df.columns],
        errors='ignore',
    )

    # Same layout as the hadm_id tables: one list-valued cell per table.
    hospital_stay[table] = [to_clean_records(patient_df)]

In [None]:
def convert_icd_to_text(icd_list, icd_type):
    """Look up the long title of each ICD code in ``icd_list``.

    Args:
        icd_list: list of dicts carrying ``'icd_code'`` and ``'icd_version'``.
        icd_type: ``'diagnoses'`` or ``'procedures'``; selects the
            ``d_icd_<icd_type>`` dictionary table.

    Returns:
        A list of titles with exactly one entry per input item, in input
        order. Repeated codes get their title repeated, and a code with no
        dictionary row yields ``None``, so the result can be zipped safely
        with ``icd_list``. An empty ``icd_list`` returns ``[]`` without
        touching the database.

    Raises:
        ValueError: if ``icd_type`` is not one of the two supported values.
    """
    # The table name cannot be a bound parameter, so restrict it explicitly.
    if icd_type not in ('diagnoses', 'procedures'):
        raise ValueError(f"unsupported icd_type: {icd_type!r}")

    if len(icd_list) == 0:
        # "IN ()" is not valid SQL; nothing to look up anyway.
        return []

    # Codes are stripped of leading/trailing spaces before comparison.
    keys = [(str(item['icd_code']).strip(), int(item['icd_version'])) for item in icd_list]
    unique_keys = list(dict.fromkeys(keys))

    placeholders = ", ".join(f"(:code_{i}, :version_{i})" for i in range(len(unique_keys)))
    params = {}
    for i, (code, version) in enumerate(unique_keys):
        params[f"code_{i}"] = code
        params[f"version_{i}"] = version

    sql_query = text(f"""
    SELECT icd_code, icd_version, long_title
    FROM mimiciv.mimiciv_hosp.d_icd_{icd_type}
    WHERE (icd_code, icd_version) IN ({placeholders})
    """).bindparams(**params)

    titles_df = pd.read_sql(sql_query, engine)

    # Ordering is done here rather than in SQL so that every input item maps
    # to its own title regardless of duplicates or missing dictionary rows.
    title_by_key = {
        (str(code).strip(), int(version)): title
        for code, version, title in zip(
            titles_df['icd_code'], titles_df['icd_version'], titles_df['long_title']
        )
    }
    return [title_by_key.get(key) for key in keys]

def convert_lab_id_to_info(labs):
    """Attach lab-item dictionary information to lab event records.

    Args:
        labs: list of dicts, each carrying an ``'itemid'`` key (and a
            ``'charttime'`` key used for ordering).

    Returns:
        A list of null-free record dicts: the input records merged with
        the matching ``d_labitems`` rows on ``itemid``, with ``itemid``
        dropped and rows sorted by ``charttime``. An empty ``labs`` list
        returns ``[]`` without touching the database.
    """
    if len(labs) == 0:
        # "IN ()" is not valid SQL and there would be no charttime to sort by.
        return []

    original = pd.DataFrame.from_dict(labs)

    # Each distinct itemid is looked up once; int() guarantees that only
    # numeric literals are interpolated into the statement.
    item_ids = sorted({int(item['itemid']) for item in labs})
    lab_conditions = ", ".join(str(item_id) for item_id in item_ids)

    # No ORDER BY: the merge and the charttime sort below decide the final order.
    sql_query = text(f"""
    SELECT *
    FROM mimiciv.mimiciv_hosp.d_labitems
    WHERE itemid IN ({lab_conditions})
    """)

    returned = pd.read_sql(sql_query, engine)

    merged = original.merge(returned, on='itemid', how='outer').drop('itemid', axis=1)
    return to_clean_records(merged.sort_values(by=['charttime']))

In [None]:
# post processing

for column in hospital_stay.columns:
    print(column)
    match column:
        case 'diagnoses_icd':
            if len(hospital_stay[column][0]) > 0:
                ordered_diagnoses = convert_icd_to_text(hospital_stay[column][0], 'diagnoses')
                hospital_stay[column] = [ordered_diagnoses]
        case 'labevents':
            hospital_stay[column] = [convert_lab_id_to_info(hospital_stay[column][0])]
        case 'procedures_icd':
            if len(hospital_stay[column][0]) > 0:
                ordered_procedures = convert_icd_to_text(hospital_stay[column][0], 'procedures')
                for name, d in zip(ordered_procedures, hospital_stay[column][0]):
                    d['title'] = name
                    
                procedures_df = pd.DataFrame(hospital_stay[column][0])
                procedures_df = procedures_df.drop(['icd_code', 'icd_version'], axis=1).sort_values(by=['chartdate'])
                hospital_stay[column] = [to_clean_records(procedures_df)]
        case 'prescriptions':
            poe_df = pd.DataFrame(hospital_stay['poe'][0])
            prescriptions_df = pd.DataFrame(hospital_stay['prescriptions'][0])
        
            # Get all the prescriptions that have a non-null value in the `poe_id` column
            poe_prescriptions = prescriptions_df[prescriptions_df['poe_id'].notnull()]
            poe_prescriptions = poe_prescriptions.merge(poe_df, on='poe_id', how='outer')
            
            # if a row in poe_prescriptions has both a starttime and an ordertime, set ordertime as null
            poe_prescriptions.loc[(poe_prescriptions['starttime'].notnull()) & (poe_prescriptions['ordertime'].notnull()), 'ordertime'] = None
            
            poe_prescriptions['temp'] = poe_prescriptions['starttime'].combine_first(poe_prescriptions['ordertime'])
            
            poe_prescriptions = poe_prescriptions.sort_values(by=['temp'])
            poe_prescriptions = poe_prescriptions.drop('temp', axis=1)
        
            # Get all the prescriptions that have a null value in the `poe_id` column
            non_poe_prescriptions = prescriptions_df[prescriptions_df['poe_id'].isnull()]
        
            hospital_stay['poe'] = [to_clean_records(poe_prescriptions)]
        
            if len(non_poe_prescriptions) > 0:
                hospital_stay['prescriptions'] = [to_clean_records(non_poe_prescriptions.sort_values())]
            else:
                hospital_stay['prescriptions'] = [[]]
        case 'poe':
            poe_df = pd.DataFrame(hospital_stay['poe'][0])
            poe_df = poe_df[(poe_df['order_type'] != 'ADT orders') & (poe_df['order_type'] != 'Lab')]
            
            hospital_stay['poe'] = [to_clean_records(poe_df)]
        case 'microbiologyevents':
            micro_df = pd.DataFrame(hospital_stay[column][0])
            micro_df = micro_df.drop('chartdate', axis=1)
            hospital_stay[column] = [to_clean_records(micro_df)]

In [None]:
# rename columns to be more readable
readable_hospital_columns = {
    # admission row fields
    'admittime': 'admission time',
    'dischtime': 'discharge time',
    'deathtime': 'death time',
    'admission_type': 'admission type',
    'admission_location': 'admission location',
    'discharge_location': 'discharge location',
    'marital_status': 'marital status',
    # attached tables
    'diagnoses_icd': 'diagnoses',
    'labevents': 'lab tests',
    'microbiologyevents': 'microbiology tests',
    'poe': 'provider orders',
    'procedures_icd': 'procedures',
    'services': 'hospital services',
    'omr': 'other patient information',
    'patients': 'patient information',
}
hospital_stay = hospital_stay.rename(columns=readable_hospital_columns)

In [None]:
hospital_stay

In [None]:
ed_stay_query = text("select * from mimiciv.mimiciv_ed.edstays where hadm_id = :hadm_id").bindparams(hadm_id=hadm_id)

# stay_id = 32522732
# ed_stay_query = text("select * from mimiciv.mimiciv_ed.edstays where stay_id = :stay_id").bindparams(stay_id=stay_id)

ed_stay_df = pd.read_sql(ed_stay_query, engine)

ed_stays = []

ed_cols_to_remove = [
    'subject_id',
    'stay_id',
    'seq_num',
    'icd_code',
    'icd_version',
    'gsn',
    'ndc',
    'etc_rn',
    'etccode',
    'med_rn',
    'gsn',
    'gsn_rn'
]

for row in ed_stay_df.iterrows():
    row_info = pd.DataFrame(row[1]).transpose()
    stay_id = row_info['stay_id'].values[0]

    for column, value in database_structure['mimiciv_ed'].items():
        if 'stay_id' in value:
            if column == 'edstays':
                continue

            print(column)
            sql_query = text(f"select * from mimiciv.mimiciv_ed.{column} where stay_id = :stay_id").bindparams(
                stay_id=stay_id)
            sql_df = pd.read_sql(sql_query, engine)
            
            removed_cols = [col for col in ed_cols_to_remove if col in sql_df.columns]
            sql_df = sql_df.drop(removed_cols, axis=1, errors='ignore')
            
            match column:
                case 'diagnosis':
                    # convert to a list of icd_title column
                    sql_df = sql_df['icd_title'].tolist()
                case 'medrecon':
                    sql_df = sql_df.drop('charttime', axis=1, errors='ignore')
                    sql_df = sql_df.rename(columns={'etcdescription': 'enhanced therapeutic class'})
                case 'pyxis':
                    # remove duplicate rows if `name` AND `charttime` are the same
                    sql_df = sql_df.drop_duplicates(subset=['name', 'charttime']) 
            
            if isinstance(sql_df, pd.DataFrame):
                sql_df = to_clean_records(sql_df)

            row_info[column] = [sql_df]
    ed_stays.append(row_info)

ed_stays = pd.concat(ed_stays, ignore_index=True)

# rename columns to be more readable
ed_stays = ed_stays.rename(columns={
    'intime': 'arrival time',
    'outtime': 'exit time',
    'arrival_transport': 'arrival transport',
    'medrecon': 'medication reconciliation',
    'pyxis': 'dispensed medications',
    'vitalsign': 'vital signs'
})


# reorder columns
ed_stays = ed_stays[[
    'subject_id', 'hadm_id', 'stay_id',
    'race', 'gender',
    'arrival time', 'arrival transport',
    'triage',
    'vital signs',
    'medication reconciliation',
    'dispensed medications',
    'exit time', 'disposition',
    'diagnosis'
]]

In [None]:
ed_stays

In [None]:
# Build one row per ICU stay linked to this admission, pulling in every
# stay_id-keyed table of the mimiciv_icu schema.
icu_stay_query = text("select * from mimiciv.mimiciv_icu.icustays where hadm_id = :hadm_id").bindparams(hadm_id=hadm_id)

icu_stay_df = pd.read_sql(icu_stay_query, engine)

icu_stays = []

for _, icu_stay in icu_stay_df.iterrows():
    # One-row frame for this stay.
    row_info = pd.DataFrame(icu_stay).transpose()
    stay_id = row_info['stay_id'].values[0]

    for table_name, table_columns in database_structure['mimiciv_icu'].items():
        # Only tables keyed by stay_id are attached; icustays is the base row itself.
        if 'stay_id' not in table_columns or table_name == 'icustays':
            continue

        print(table_name)
        sql_query = text(f"select * from mimiciv.mimiciv_icu.{table_name} where stay_id = :stay_id").bindparams(
            stay_id=stay_id)
        sql_df = pd.read_sql(sql_query, engine).drop('subject_id', axis=1, errors='ignore')

        # Prefix column with table name (stay_id stays as-is so it can be the merge key)
        sql_df = sql_df.rename(columns=lambda x: f"{table_name}_{x}" if x != 'stay_id' else x)

        # if more than one row, convert to list of dictionaries
        if len(sql_df) > 1:
            sql_df = sql_df.drop('stay_id', axis=1, errors='ignore')
            sql_df = to_clean_records(sql_df)

            # store all records as a single list-valued cell in a new column
            row_info[table_name] = [sql_df]
        else:
            # zero or one row: widen the stay row with the table's prefixed columns
            row_info = row_info.merge(sql_df, on='stay_id', how='outer')

    icu_stays.append(row_info)

if icu_stays:
    icu_stays = pd.concat(icu_stays, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; an admission without any
    # ICU stay keeps the (empty) icustays frame and its columns.
    icu_stays = icu_stay_df.copy()

In [None]:
icu_stays

In [None]:
# Free-text notes attached to this admission: discharge summaries and radiology reports.
discharge_note_query = text(
    "select * from mimiciv.mimiciv_note.discharge where hadm_id = :hadm_id"
).bindparams(hadm_id=hadm_id)
radiology_note_query = text(
    "select * from mimiciv.mimiciv_note.radiology where hadm_id = :hadm_id"
).bindparams(hadm_id=hadm_id)

discharge_note_df = pd.read_sql(discharge_note_query, engine)
radiology_note_df = pd.read_sql(radiology_note_query, engine)

In [None]:
discharge_note_df

In [None]:
radiology_note_df

In [None]:
def format_df_to_text(df):
    """Render every row of ``df`` as indented plain text.

    For each row, columns are emitted in frame order, skipping
    ``subject_id`` and ``hadm_id``:

    * a scalar cell becomes ``"<column>: <value>"``;
    * a non-empty list cell becomes a ``"<column>:"`` header followed by
      one `` - `` bullet per element, or, when the first element is a
      dict, a numbered `` - i of n`` entry per dict with its key/value
      pairs listed underneath;
    * empty lists are skipped.

    Values whose string form is ``none``, ``nan`` or ``nat`` (any case)
    are omitted. Each row is terminated by a blank line.
    """
    null_markers = ('none', 'nan', 'nat')
    skipped_columns = ('subject_id', 'hadm_id')

    def is_blank(candidate):
        return str(candidate).lower() in null_markers

    pieces = []
    for _, record in df.iterrows():
        for column in df.columns:
            if column in skipped_columns:
                continue

            value = record[column]

            # Scalar cell: a single "column: value" line.
            if not isinstance(value, list):
                if not is_blank(value):
                    pieces.append(f"{column}: {value}\n")
                continue

            if len(value) == 0:
                continue

            pieces.append(f"{column}:\n")
            if isinstance(value[0], dict):
                total = len(value)
                for position, entry in enumerate(value, start=1):
                    pieces.append(f" - {position} of {total}\n")
                    pieces.extend(
                        f"   - {k}: {v}\n" for k, v in entry.items() if not is_blank(v)
                    )
            else:
                pieces.extend(f" - {v}\n" for v in value if not is_blank(v))

        pieces.append("\n")

    return "".join(pieces)


def patient_info_to_text(hosp_df, ed_df, icu_df, discharge_df, radiology_df):
    """Unfinished stub.

    As written, this only initializes an empty local string; none of the
    parameters are used and the function implicitly returns ``None``.
    NOTE(review): presumably meant to combine the hospital, ED, ICU and
    note frames into one text description - TODO confirm and implement.
    """
    patient_info = ""

In [None]:
format_df_to_text(ed_stays)

In [None]:
format_df_to_text(hospital_stay)