In [1]:
import sqlite3
import pandas as pd
from pathlib import Path

In [2]:
# --- Notebook configuration -------------------------------------------------
# SQLite database file that the notebook connects to.
DB_PATH = "mimic_iv_hosp.db"

# Number of hadm_ids drawn for the sample (also embedded in output file names).
SAMPLE_SIZE = 1_000

# Directory that receives the exported CSV files.
OUTPUT_FOLDER = Path("./mimic_sample_1000")

In [3]:
# Create the export directory if it is missing. parents=True keeps this cell
# working when OUTPUT_FOLDER is pointed at a nested path whose parent
# directories do not exist yet; exist_ok=True makes the cell safe to re-run.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [4]:
# sqlite3.connect() silently creates a brand-new, empty database when the file
# does not exist, which would only surface later as a confusing
# "no such table" error. Fail early with a clear message instead.
if not Path(DB_PATH).is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

# Shared connection and cursor used by the cells below.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [5]:
# Collect the name of every table registered in the database schema.
table_name_rows = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
all_tables = [name for (name,) in table_name_rows.fetchall()]

In [None]:
def _column_names(table_name):
    """Return the column names of `table_name` as reported by PRAGMA table_info."""
    info_rows = cursor.execute(f'PRAGMA table_info("{table_name}");').fetchall()
    # Index 1 of each table_info row holds the column name.
    return [info[1] for info in info_rows]


# Keep only the tables that expose an hadm_id column; these are the ones that
# can be filtered down to the sampled admissions.
linking_tables = [
    table_name
    for table_name in all_tables
    if "hadm_id" in _column_names(table_name)
]

print(f"Found {len(linking_tables)} tables with hadm_id: {linking_tables}\n")

Found 13 tables with hadm_id: ['admissions.csv', 'diagnoses_icd.csv', 'drgcodes.csv', 'emar.csv', 'hcpcsevents.csv', 'microbiologyevents.csv', 'pharmacy.csv', 'poe.csv', 'prescriptions.csv', 'procedures_icd.csv', 'services.csv', 'transfers.csv', 'labevents.csv']



In [9]:
# Draw a reproducible random sample of admissions.
#
# SQLite's RANDOM() cannot be seeded, so `ORDER BY RANDOM() LIMIT n` returns a
# different cohort on every run. Instead, read the hadm_ids in a stable order
# and sample them in pandas with a fixed seed, so that Restart & Run All
# always exports the same admissions.
SAMPLE_SEED = 42  # change to draw a different (but still repeatable) sample

sample_sql = """
    SELECT hadm_id
      FROM [admissions.csv]
     ORDER BY hadm_id;
"""
admission_ids = pd.read_sql_query(sample_sql, conn)["hadm_id"]

# Cap the request at the number of available admissions: Series.sample raises
# when n exceeds the population, whereas the SQL LIMIT used previously simply
# returned fewer rows.
n_to_draw = min(SAMPLE_SIZE, len(admission_ids))
sample_ids = admission_ids.sample(n=n_to_draw, random_state=SAMPLE_SEED).tolist()
print(f"Sampled {len(sample_ids)} hadm_ids (e.g. {sample_ids[:5]}…)\n")

Sampled 1000 hadm_ids (e.g. [20700725, 22562915, 21933384, 28996579, 29463780]…)



In [10]:
# Stage the sampled ids in a helper table named sample_adm so the export
# queries can filter with a sub-select. if_exists="replace" overwrites any
# table left behind by an earlier run.
sample_df = pd.DataFrame(dict(hadm_id=sample_ids))
sample_df.to_sql(name="sample_adm", con=conn, if_exists="replace", index=False)

1000

In [12]:
# For each table that carries hadm_id, export only the rows belonging to the
# sampled admissions into its own CSV file inside OUTPUT_FOLDER.
for tbl in linking_tables:
    # Announce the table before the query runs, since large tables take a while.
    print(f"Exporting table `{tbl}`…", end=" ")

    subset_sql = f'SELECT * FROM "{tbl}" WHERE hadm_id IN (SELECT hadm_id FROM sample_adm);'
    subset = pd.read_sql_query(subset_sql, conn)

    csv_path = OUTPUT_FOLDER.joinpath(f"{tbl}_sample{SAMPLE_SIZE}.csv")
    subset.to_csv(csv_path, index=False)

    row_count = len(subset)
    print(f"{row_count:,} rows → {csv_path.name}")

Exporting table `admissions.csv`… 1,000 rows → admissions.csv_sample1000.csv
Exporting table `diagnoses_icd.csv`… 11,517 rows → diagnoses_icd.csv_sample1000.csv
Exporting table `drgcodes.csv`… 1,378 rows → drgcodes.csv_sample1000.csv
Exporting table `emar.csv`… 83,500 rows → emar.csv_sample1000.csv
Exporting table `hcpcsevents.csv`… 378 rows → hcpcsevents.csv_sample1000.csv
Exporting table `microbiologyevents.csv`… 3,768 rows → microbiologyevents.csv_sample1000.csv
Exporting table `pharmacy.csv`… 33,725 rows → pharmacy.csv_sample1000.csv
Exporting table `poe.csv`… 97,453 rows → poe.csv_sample1000.csv
Exporting table `prescriptions.csv`… 38,271 rows → prescriptions.csv_sample1000.csv
Exporting table `procedures_icd.csv`… 1,533 rows → procedures_icd.csv_sample1000.csv
Exporting table `services.csv`… 1,096 rows → services.csv_sample1000.csv
Exporting table `transfers.csv`… 3,690 rows → transfers.csv_sample1000.csv
Exporting table `labevents.csv`… 158,695 rows → labevents.csv_sample1000.csv

In [13]:
# Remove the helper table so the source database is left without it,
# persist that change, then release the connection.
conn.execute("DROP TABLE IF EXISTS sample_adm;")
conn.commit()
conn.close()

Re-opening the database to export the remaining tables that are not linked by hadm_id.

In [1]:
import sqlite3

# Open the existing database through a URI with mode=rw: unlike a plain
# sqlite3.connect(path), this refuses to create a new empty database when the
# file is missing, so a wrong working directory fails here instead of later
# with a confusing "no such table" error.
conn = sqlite3.connect("file:mimic_iv_hosp.db?mode=rw", uri=True)

In [2]:
import pandas as pd
from pathlib import Path

# Directory that receives the exported CSV files.
OUTPUT = Path("./mimic_sample_1000")

# This session starts from a fresh kernel, so make sure the export directory
# exists rather than relying on an earlier session having created it.
OUTPUT.mkdir(parents=True, exist_ok=True)

In [5]:
# Tables exported in full (no hadm_id filtering is applied to them).
small_defs = [
    "d_icd_diagnoses.csv",
    "d_icd_procedures.csv",
    "d_labitems.csv",
    "d_hcpcs.csv",
    "provider.csv",
]
for tbl in small_defs:
    # NOTE: the table names already end in ".csv", so the files end in
    # ".csv.csv". The naming is kept as-is so the existence check below still
    # recognises files written by earlier runs.
    out_file = OUTPUT / f"{tbl}.csv"
    if out_file.exists():
        print(f"Skipping {tbl} (already exists).")
        continue

    print(f"Exporting {tbl}…", end=" ")
    # The table name must be quoted: unquoted, SQLite parses the dot in
    # `d_labitems.csv` as a schema.table separator and the query fails.
    df = pd.read_sql_query(f'SELECT * FROM "{tbl}";', conn)
    df.to_csv(out_file, index=False)
    print(f"{len(df):,} rows → {out_file.name}")

Skipping d_icd_diagnoses.csv (already exists).
Skipping d_icd_procedures.csv (already exists).
Skipping d_labitems.csv (already exists).
Skipping d_hcpcs.csv (already exists).
Skipping provider.csv (already exists).


In [13]:
# Show every table in the database, alphabetically, to check what is
# available for export.
table_listing_sql = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
tables_df = pd.read_sql_query(table_listing_sql, conn)
print(tables_df)

                      name
0           admissions.csv
1              d_hcpcs.csv
2      d_icd_diagnoses.csv
3     d_icd_procedures.csv
4           d_labitems.csv
5        diagnoses_icd.csv
6             drgcodes.csv
7                 emar.csv
8          emar_detail.csv
9          hcpcsevents.csv
10           labevents.csv
11  microbiologyevents.csv
12                 omr.csv
13            patients.csv
14            pharmacy.csv
15                 poe.csv
16          poe_detail.csv
17       prescriptions.csv
18      procedures_icd.csv
19            provider.csv
20            services.csv
21           transfers.csv
