In [None]:
import sqlite3
import pandas as pd
from pathlib import Path

In [None]:
# --- Run configuration ---
# Path of the SQLite database file that is opened below.
DB_PATH       = "mimic_iv_hosp.db"
# Number of hadm_id values to draw for the sample.
SAMPLE_SIZE   = 1000
# Folder that receives the exported CSVs. Derived from SAMPLE_SIZE so the
# folder name cannot drift out of sync when the sample size is changed.
OUTPUT_FOLDER = Path(f"./mimic_sample_{SAMPLE_SIZE}")

In [None]:
# Create the export folder up front. `parents=True` also creates any missing
# intermediate directories if OUTPUT_FOLDER points at a nested path, and
# `exist_ok=True` keeps the cell safe to re-run.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [None]:
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as confusing "no such table"
# errors. Fail fast with a clear message instead.
if not Path(DB_PATH).is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

conn   = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [None]:
# Collect the name of every table registered in the SQLite catalog.
table_rows = cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table';"
).fetchall()
all_tables = [name for (name,) in table_rows]

In [None]:
# Scratch table this notebook creates further down to hold the sampled ids.
# It has an hadm_id column too, so if an interrupted run left it behind it
# must be skipped here or it would be exported as if it were source data.
SCRATCH_TABLE = "sample_adm"

# Keep every table that exposes an hadm_id column.
linking_tables = []
for tbl in all_tables:
    if tbl == SCRATCH_TABLE:
        continue
    # Double any embedded quote so the identifier stays valid inside "...".
    quoted = tbl.replace('"', '""')
    # PRAGMA table_info yields one row per column; index 1 is the column name.
    cursor.execute(f'PRAGMA table_info("{quoted}");')
    cols = [r[1] for r in cursor.fetchall()]
    if "hadm_id" in cols:
        linking_tables.append(tbl)

print(f"Found {len(linking_tables)} tables with hadm_id: {linking_tables}\n")

In [None]:
# Draw the sample in pandas with a fixed seed rather than with SQLite's
# ORDER BY RANDOM(), which cannot be seeded and therefore produced a
# different set of ids on every run.
SAMPLE_SEED = 42

# Fetch ids in a stable order so the seeded draw is repeatable.
sample_sql = """
    SELECT hadm_id
      FROM [admissions.csv]
     ORDER BY hadm_id;
"""
all_hadm_ids = pd.read_sql_query(sample_sql, conn)["hadm_id"]

# Cap at the population size so a SAMPLE_SIZE larger than the table still
# returns every row (as LIMIT did) instead of raising.
sample_ids = all_hadm_ids.sample(
    n=min(SAMPLE_SIZE, len(all_hadm_ids)),
    random_state=SAMPLE_SEED,
).tolist()
print(f"Sampled {len(sample_ids)} hadm_ids (e.g. {sample_ids[:5]}…)\n")

In [None]:
# Stage the sampled ids as a one-column table named sample_adm inside the
# database (replacing any previous copy) so later queries can filter on it.
sample_df = pd.DataFrame(data={"hadm_id": sample_ids})
sample_df.to_sql(name="sample_adm", con=conn, index=False, if_exists="replace")

In [None]:
# For every table that carries hadm_id, pull only the rows whose hadm_id is in
# sample_adm and write them to one CSV per table inside OUTPUT_FOLDER.
for tbl in linking_tables:
    print(f"Exporting table `{tbl}`…", end=" ")

    export_sql = f'SELECT * FROM "{tbl}" WHERE hadm_id IN (SELECT hadm_id FROM sample_adm);'
    df = pd.read_sql_query(export_sql, conn)

    out_path = OUTPUT_FOLDER / f"{tbl}_sample{SAMPLE_SIZE}.csv"
    df.to_csv(out_path, index=False)

    print(f"{len(df):,} rows → {out_path.name}")

In [None]:
# Drop the scratch table of sampled ids, commit, and close the connection.
conn.execute("DROP TABLE IF EXISTS sample_adm;")
conn.commit()
conn.close()

Re-opening the database to export the other tables, which are not linked by `hadm_id`.

In [None]:
import sqlite3
# Open via a URI with mode=rw: unlike a plain path, this refuses to create a
# new, empty database when the file is missing, so a wrong path fails here
# rather than later as a "no such table" error.
conn = sqlite3.connect("file:mimic_iv_hosp.db?mode=rw", uri=True)

In [None]:
import pandas as pd
from pathlib import Path
# Folder that receives the CSVs written by this section.
OUTPUT = Path("./mimic_sample_1000")
# Make sure the folder exists so this section can also run on its own
# (e.g. after a kernel restart) without to_csv failing on a missing directory.
OUTPUT.mkdir(parents=True, exist_ok=True)

In [None]:
# Tables that are exported in full (no filtering on the sampled ids).
small_defs = [
    "d_icd_diagnoses.csv",
    "d_icd_procedures.csv",
    "d_labitems.csv",
    "d_hcpcs.csv",
    "provider.csv",
]
for tbl in small_defs:
    # NOTE(review): the table names already end in ".csv", so the files are
    # written as "<name>.csv.csv". Kept unchanged so existing outputs and the
    # skip check below keep matching; confirm before renaming.
    out_file = OUTPUT / f"{tbl}.csv"
    if out_file.exists():
        print(f"Skipping {tbl} (already exists).")
        continue

    print(f"Exporting {tbl}…", end=" ")
    # The names contain a dot, so they must be quoted: unquoted, SQLite reads
    # `d_labitems.csv` as <schema>.<table> and the query fails. Embedded
    # double quotes are doubled to keep the identifier valid.
    quoted = tbl.replace('"', '""')
    df = pd.read_sql_query(f'SELECT * FROM "{quoted}";', conn)
    df.to_csv(out_file, index=False)
    print(f"{len(df):,} rows → {out_file.name}")

In [None]:
# List every table name in the database, alphabetically, and print the result.
list_tables_sql = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
tables_df = pd.read_sql_query(list_tables_sql, conn)
print(tables_df)