In [None]:
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment. The
# connection cell reads HOST_IP, DATABASE_USER, DATABASE_PASSWORD and
# DATABASE_PORT from os.environ, so this must run first and keeps credentials
# out of the notebook itself.
load_dotenv()

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import os

# Connection settings are read with os.environ[...] (not .get), so a missing
# variable fails right here with a KeyError naming the variable.
HOST_IP = os.environ['HOST_IP']
DATABASE_USER = os.environ['DATABASE_USER']
DATABASE_PASSWORD = os.environ['DATABASE_PASSWORD']
DATABASE_PORT = os.environ['DATABASE_PORT']

# Build the URL from its components instead of formatting a connection string
# by hand, so special characters in the password need no manual escaping.
connection_url = URL.create(
    "postgresql+psycopg2",
    username=DATABASE_USER,
    password=DATABASE_PASSWORD,
    host=HOST_IP,
    port=DATABASE_PORT,
    database="mimiciv"
)

# Shared engine used by every query in the cells below.
engine = create_engine(connection_url)

In [None]:
import pandas as pd
from sqlalchemy import text

# Smoke test for the connection: fetch ten rows from the admissions table and
# display them as the cell output.
query = text("SELECT * FROM mimiciv.mimiciv_hosp.admissions LIMIT 10")
pd.read_sql_query(query, engine)

In [None]:
# Introspect the database layout through information_schema.
# The inner query yields one row per (schema, table) with the table's column
# names as a JSON array ordered by ordinal position; the 'information_schema'
# and 'pg_catalog' schemas are excluded. The outer query then emits one row per
# schema holding a single JSON object shaped like
#   {schema_name: [{table_name: [column, ...]}, ...]}
query = text("""
SELECT json_build_object(
    schema_name, json_agg(
        json_build_object(
            table_name, column_names
        )
    )
)
FROM (
    SELECT 
        t.table_schema as schema_name, 
        t.table_name, 
        json_agg(c.column_name ORDER BY c.ordinal_position) as column_names
    FROM information_schema.tables t
    INNER JOIN information_schema.columns c 
        ON t.table_schema = c.table_schema AND t.table_name = c.table_name
    WHERE t.table_schema NOT IN ('information_schema', 'pg_catalog')
    GROUP BY t.table_schema, t.table_name
) AS sub
GROUP BY schema_name;
""")

# Execute the query on a short-lived connection; `result` holds all rows
# (one per schema) and is consumed by the next cell.
with engine.connect() as con:
    result = con.execute(query).fetchall()

In [None]:
# The first column of every result row is the JSON object built by the
# introspection query: {schema_name: [{table_name: [columns]}, ...]}.
schemas_with_tables = [row[0] for row in result]

# Collapse those per-schema objects into one nested mapping:
#   database_structure[schema_name][table_name] -> list of column names
database_structure = {}
for schema_mapping in schemas_with_tables:
    for schema_name, tables in schema_mapping.items():
        # Reuse the schema's entry if it already exists, otherwise start a new one.
        columns_by_table = database_structure.setdefault(schema_name, {})
        for table in tables:
            # Each `table` is a single-key dict {table_name: [columns]}.
            columns_by_table.update(table)

database_structure

In [None]:
# Admission used throughout the rest of the notebook.
subject_id = 10000032
hadm_id = 29079034

# Start from the admissions row; every other hosp table keyed by hadm_id is
# folded into this frame below.
query = text(
    "select * from mimiciv.mimiciv_hosp.admissions where subject_id = :subject and hadm_id = :hadm;"
).bindparams(subject=subject_id, hadm=hadm_id)

hospital_stay = pd.read_sql_query(query, engine)

# Columns stripped from nested records and, at the end, from the combined row.
cols_to_remove = [
    'subject_id', 'hadm_id', 'seq_num', 'emar_id', 'emar_seq', 'pharmacy_id',
    'enter_provider_id', 'labevent_id', 'order_provider_id', 'value_num',
    'admit_provider_id', 'storetime',
]

for table_name, column_names in database_structure['mimiciv_hosp'].items():
    # Only tables that can be filtered by hadm_id are relevant, and the
    # admissions table is already the base row.
    if table_name == 'admissions' or 'hadm_id' not in column_names:
        continue

    print(table_name)
    sql_query = text(
        f"select * from mimiciv.mimiciv_hosp.{table_name} where hadm_id = :hadm_id"
    ).bindparams(hadm_id=hadm_id)
    sql_df = pd.read_sql(sql_query, engine).drop(columns='subject_id', errors='ignore')

    if len(sql_df) > 1:
        # Several rows: store them as a list of dicts in a single cell of a
        # column named after the table.
        records = sql_df.drop(columns=cols_to_remove, errors='ignore').to_dict('records')
        hospital_stay[table_name] = [records]
    else:
        # Zero or one row: prefix the columns with the table name (keeping the
        # join key untouched) and merge them into the admission row.
        prefixed = {col: f"{table_name}_{col}" for col in sql_df.columns if col != 'hadm_id'}
        hospital_stay = hospital_stay.merge(
            sql_df.rename(columns=prefixed), on='hadm_id', how='outer'
        )

# errors='ignore' makes the drop a no-op for names that are not present.
hospital_stay = hospital_stay.drop(columns=cols_to_remove, errors='ignore')

In [None]:
def convert_icd_to_text(icd_list, icd_type):
    """Look up the long titles for a list of ICD codes, keeping the input order.

    Parameters
    ----------
    icd_list : list of dict
        Each item must provide ``'icd_code'`` (str; surrounding whitespace is
        stripped) and ``'icd_version'`` (int-like).
    icd_type : str
        ``'diagnoses'`` or ``'procedures'``; selects the
        ``mimiciv_hosp.d_icd_<icd_type>`` dictionary table.

    Returns
    -------
    list of str
        ``long_title`` values ordered by the first position at which each
        (code, version) pair appears in ``icd_list``. Pairs that are missing
        from the dictionary table are omitted, and a repeated pair yields a
        single title. An empty (or ``None``) ``icd_list`` returns ``[]``
        without touching the database.

    Raises
    ------
    ValueError
        If ``icd_type`` is not one of the supported table suffixes.
    """
    # The table suffix cannot be sent as a bound parameter, so restrict it to
    # the known dictionary tables before it is interpolated into the SQL.
    if icd_type not in ('diagnoses', 'procedures'):
        raise ValueError(f"Unsupported icd_type: {icd_type!r}")

    # `IN ()` and `CASE END` are both invalid SQL, so short-circuit here.
    if not icd_list:
        return []

    params = {}
    pair_placeholders = []
    order_cases = []
    for index, item in enumerate(icd_list):
        code_key = f"code_{index}"
        version_key = f"version_{index}"
        params[code_key] = item['icd_code'].strip()  # Remove leading/trailing spaces
        # Plain int so the driver receives a native Python value.
        params[version_key] = int(item['icd_version'])
        pair_placeholders.append(f"(:{code_key}, :{version_key})")
        order_cases.append(
            f"WHEN icd_code = :{code_key} AND icd_version = :{version_key} THEN {index}"
        )

    # Values travel as bound parameters; only generated placeholder names and
    # the validated table suffix are formatted into the statement.
    sql_query = text(f"""
    SELECT long_title
    FROM mimiciv.mimiciv_hosp.d_icd_{icd_type}
    WHERE (icd_code, icd_version) IN ({", ".join(pair_placeholders)})
    ORDER BY CASE {" ".join(order_cases)} END;
    """).bindparams(**params)

    # Execute the query
    return pd.read_sql(sql_query, engine)['long_title'].tolist()

In [None]:
# For each nested ICD column that exists, replace the stored records with
# their long titles (in stored order) and give the column its final name.
for icd_column, icd_type, final_name in (
    ('diagnoses_icd', 'diagnoses', 'prioritized_diagnoses'),
    ('procedures_icd', 'procedures', 'prioritized_procedures'),
):
    if icd_column not in hospital_stay.columns:
        continue

    # The admission frame has a single row, so the records live at label 0.
    ordered_titles = convert_icd_to_text(hospital_stay[icd_column][0], icd_type)
    hospital_stay[icd_column] = [ordered_titles]
    hospital_stay = hospital_stay.rename(columns={icd_column: final_name})

In [None]:
hospital_stay

In [None]:
# Gather every ED stay linked to this admission, together with the rows of each
# mimiciv_ed table that is keyed by stay_id.
ed_stay_query = text("select * from mimiciv.mimiciv_ed.edstays where hadm_id = :hadm_id").bindparams(hadm_id=hadm_id)

ed_stay_df = pd.read_sql(ed_stay_query, engine)

# One single-row frame per ED stay; combined into `ed_stays` after the loop.
ed_stay_frames = []

for _, stay_row in ed_stay_df.iterrows():
    # One-row frame for this stay; the detail tables are attached to it below.
    row_info = stay_row.to_frame().transpose()
    # Bind a plain Python int: psycopg2 does not adapt numpy integer scalars
    # out of the box.
    stay_id = int(stay_row['stay_id'])

    for k, v in database_structure['mimiciv_ed'].items():
        # Skip tables without a stay_id column and the base edstays table.
        if k == 'edstays' or 'stay_id' not in v:
            continue

        print(k)
        sql_query = text(f"select * from mimiciv.mimiciv_ed.{k} where stay_id = :stay_id").bindparams(
            stay_id=stay_id)
        sql_df = pd.read_sql(sql_query, engine).drop('subject_id', axis=1, errors='ignore')

        # Prefix column with table name
        sql_df = sql_df.rename(columns=lambda x: f"{k}_{x}" if x != 'stay_id' else x)

        # if more than one row, convert to list of dictionaries
        if len(sql_df) > 1:
            sql_df = sql_df.drop('stay_id', axis=1, errors='ignore')

            # store the records as a single cell in a column named after the table
            row_info[k] = [sql_df.to_dict('records')]
        else:
            row_info = row_info.merge(sql_df, on='stay_id', how='outer')

    ed_stay_frames.append(row_info)

# pd.concat raises ValueError on an empty list, which happens whenever the
# admission has no ED stay; fall back to the (empty) edstays frame in that case.
ed_stays = pd.concat(ed_stay_frames, ignore_index=True) if ed_stay_frames else ed_stay_df.copy()

In [None]:
ed_stays

In [None]:
# Gather every ICU stay linked to this admission, together with the rows of
# each mimiciv_icu table that is keyed by stay_id.
icu_stay_query = text("select * from mimiciv.mimiciv_icu.icustays where hadm_id = :hadm_id").bindparams(hadm_id=hadm_id)

icu_stay_df = pd.read_sql(icu_stay_query, engine)

# One single-row frame per ICU stay; combined into `icu_stays` after the loop.
icu_stay_frames = []

for _, stay_row in icu_stay_df.iterrows():
    # One-row frame for this stay; the detail tables are attached to it below.
    row_info = stay_row.to_frame().transpose()
    # Bind a plain Python int: psycopg2 does not adapt numpy integer scalars
    # out of the box.
    stay_id = int(stay_row['stay_id'])

    for k, v in database_structure['mimiciv_icu'].items():
        # Skip tables without a stay_id column and the base icustays table.
        if k == 'icustays' or 'stay_id' not in v:
            continue

        print(k)
        sql_query = text(f"select * from mimiciv.mimiciv_icu.{k} where stay_id = :stay_id").bindparams(
            stay_id=stay_id)
        sql_df = pd.read_sql(sql_query, engine).drop('subject_id', axis=1, errors='ignore')

        # Prefix column with table name
        sql_df = sql_df.rename(columns=lambda x: f"{k}_{x}" if x != 'stay_id' else x)

        # if more than one row, convert to list of dictionaries
        if len(sql_df) > 1:
            sql_df = sql_df.drop('stay_id', axis=1, errors='ignore')

            # store the records as a single cell in a column named after the table
            row_info[k] = [sql_df.to_dict('records')]
        else:
            row_info = row_info.merge(sql_df, on='stay_id', how='outer')

    icu_stay_frames.append(row_info)

# pd.concat raises ValueError on an empty list, which happens whenever the
# admission has no ICU stay; fall back to the (empty) icustays frame in that case.
icu_stays = pd.concat(icu_stay_frames, ignore_index=True) if icu_stay_frames else icu_stay_df.copy()

In [None]:
icu_stays

In [None]:
# Both note tables are filtered by the same admission id: build the two
# statements first, then load each result set into its own frame.
discharge_note_query = text(
    "select * from mimiciv.mimiciv_note.discharge where hadm_id = :hadm_id"
).bindparams(hadm_id=hadm_id)
radiology_note_query = text(
    "select * from mimiciv.mimiciv_note.radiology where hadm_id = :hadm_id"
).bindparams(hadm_id=hadm_id)

discharge_note_df = pd.read_sql(discharge_note_query, engine)
radiology_note_df = pd.read_sql(radiology_note_query, engine)

In [None]:
discharge_note_df

In [None]:
radiology_note_df

In [None]:
def format_df_to_text(df):
    """Render every row of ``df`` as plain ``column: value`` text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are scalars, lists of scalars, or lists of dicts.

    Returns
    -------
    str
        One line per non-missing scalar cell (``"column: value"``). A list cell
        becomes a ``"column:"`` header followed by `` - item`` lines; a list of
        dicts becomes numbered entries (`` - i of n``) with one
        ``   - key: value`` line per non-missing field. The ``subject_id`` and
        ``hadm_id`` columns are skipped, and so are empty list cells. An empty
        frame yields ``""``.
    """

    def _is_missing(value):
        # Original rule: anything whose text form is 'none' or 'nan'.
        if str(value).lower() in ('none', 'nan'):
            return True
        # Also treat NaT / pd.NA as missing. pd.isna returns an array for
        # list-likes, so only a plain bool True counts.
        flag = pd.isna(value)
        return isinstance(flag, bool) and flag

    lines = []

    for _, row in df.iterrows():
        for column in df.columns:
            if column in ('subject_id', 'hadm_id'):
                continue

            value = row[column]

            if isinstance(value, list):
                # An empty list has nothing to report, and peeking at value[0]
                # below would raise IndexError.
                if not value:
                    continue

                lines.append(f"{column}:\n")

                # The first element decides how the whole list is rendered.
                if isinstance(value[0], dict):
                    total = len(value)
                    for position, record in enumerate(value, start=1):
                        lines.append(f" - {position} of {total}\n")
                        for k, v in record.items():
                            if not _is_missing(v):
                                lines.append(f"   - {k}: {v}\n")
                else:
                    for v in value:
                        if not _is_missing(v):
                            lines.append(f" - {v}\n")
            elif not _is_missing(value):
                lines.append(f"{column}: {value}\n")

    return "".join(lines)


def patient_info_to_text(hosp_df, ed_df, icu_df, discharge_df, radiology_df):
    """Unfinished stub for combining the patient frames into one text.

    NOTE(review): as written, the body only initializes an empty string; none
    of the five parameters is read and there is no return statement, so the
    function returns None. Presumably it is meant to combine the per-source
    frames into text — confirm intent before relying on it.
    """
    patient_info = ""

In [None]:
format_df_to_text(hospital_stay)