# `Create Database for MIMIC Dataset`

Creates a SQLite database from the MIMIC-IV v3.1 CSV files.

### Access Data Files
- [ ] file1
- [ ] file2

### Create Databases
- [ ] table1
- [ ] table2

### Read Acquisition Parameters
- [ ] table1





## 1. `Define Data Files`

In [None]:
# Add project root to sys.path
#
# The project root is the nearest directory, starting at the current working
# directory and walking upwards, that contains a "config" sub-directory.
# If none exists, the current working directory is used as a fallback.

import sys
from pathlib import Path

start_dir = Path.cwd()
ROOT = next(
    (candidate for candidate in [start_dir, *start_dir.parents]
     if (candidate / "config").is_dir()),
    None,
)
root_found = ROOT is not None
if not root_found:
    # Best effort: keep going with the cwd so an installed package can still be imported.
    ROOT = start_dir

# Put the root first on sys.path so project packages win over same-named installed ones.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

if root_found:
    print(f"[X] Root found: {ROOT}")
else:
    print(f"[!] No 'config' directory found at or above {start_dir}; using it as project root")

In [None]:
# Read settings and verify CSV files exist

from xrh.settings import load_settings


settings = load_settings()

# (label used in the error message, attribute name on the settings object)
required_csvs = [
    ("Admission", "admission_csv"),
    ("Patient", "patient_csv"),
    ("Transfer", "transfer_csv"),
    ("D HCPCS", "d_hcpcs_csv"),
    ("D ICD Diagnoses", "d_icd_diagnoses_csv"),
    ("D ICD Procedures", "d_icd_procedures_csv"),
    ("D Lab Items", "d_labitems_csv"),
    ("Diagnoses ICD", "diagnoses_icd_csv"),
    ("DRG Codes", "drgcodes_csv"),
    ("EMAR", "emar_csv"),
    ("EMAR Detail", "emar_detail_csv"),
    ("HCPCS Events", "hcpcsevents_csv"),
    ("Lab Events", "labevents_csv"),
    ("Microbiology Events", "microbiologyevents_csv"),
    ("Pharmacy", "pharmacy_csv"),
    ("POE", "poe_csv"),
    ("POE Detail", "poe_detail_csv"),
    ("Prescriptions", "prescriptions_csv"),
    ("Procedures ICD", "procedures_icd_csv"),
    ("Services", "services_csv"),
]

# Check every file before failing so that all missing inputs are reported
# in a single error instead of one per re-run.
missing_csvs = []
for csv_label, csv_attr in required_csvs:
    csv_file = getattr(settings, csv_attr)
    if not csv_file.exists():
        missing_csvs.append(f"{csv_label} CSV file not found at {csv_file}")

if missing_csvs:
    raise FileNotFoundError("\n".join(missing_csvs))

print( '[X] All required CSV files found' )

# 2. `Create Database`

### 2.1.01 `Hosp: Admission`

In [None]:
import os
import sqlite3
import pandas as pd

# Build the SQLite database file and load the admissions CSV into an
# explicitly typed `admissions` table. Requires `settings` from the cell above.

# === CONFIG: paths are taken from the settings object ===
db_path  = settings.DB_path             # database filename
db_dir   = db_path.parent               # folder where the DB will live
csv_path = settings.admission_csv       # source CSV for this table

db_dir.mkdir(parents=True, exist_ok=True)

# === 1) Create the database and the admissions table ===
schema_sql = """
DROP TABLE IF EXISTS admissions;
CREATE TABLE admissions (
    subject_id             INTEGER NOT NULL,
    hadm_id                INTEGER NOT NULL,
    admittime              TEXT    NOT NULL,   -- store TIMESTAMP as ISO-8601 text
    dischtime              TEXT,
    deathtime              TEXT,
    admission_type         TEXT    NOT NULL,
    admit_provider_id      TEXT,
    admission_location     TEXT,
    discharge_location     TEXT,
    insurance              TEXT,
    language               TEXT,
    marital_status         TEXT,
    race                   TEXT,
    edregtime              TEXT,
    edouttime              TEXT,
    hospital_expire_flag   INTEGER
);
-- Optional helpful indexes (uncomment if you want them)
-- CREATE INDEX idx_admissions_hadm ON admissions(hadm_id);
-- CREATE INDEX idx_admissions_subject ON admissions(subject_id);
"""

conn = sqlite3.connect(db_path)
with conn:
    conn.executescript(schema_sql)

print(f"✅ Created database at: {db_path.resolve()}")
print("✅ Created table: admissions")

# === 2) Load CSV and insert into admissions ===
parse_date_cols = [
    "admittime", "dischtime", "deathtime", "edregtime", "edouttime"
]

# `infer_datetime_format` / `keep_date_col` are deprecated in pandas 2.x and
# have no effect for a flat list of date columns, so they are not passed.
try:
    df = pd.read_csv(csv_path, parse_dates=parse_date_cols)
except ValueError:
    # Some timestamp columns are missing; read plainly so the column check
    # below can report exactly which ones.
    df = pd.read_csv(csv_path)

# Columns of the target table, in table order
expected_cols = [
    "subject_id","hadm_id","admittime","dischtime","deathtime",
    "admission_type","admit_provider_id","admission_location","discharge_location",
    "insurance","language","marital_status","race","edregtime","edouttime",
    "hospital_expire_flag"
]

# If your CSV headers differ, map them here, e.g. {'SUBJECT_ID':'subject_id', 'HADM_ID':'hadm_id', ...}
rename_map = {}
if rename_map:
    df = df.rename(columns=rename_map)

# Keep only expected columns (and in the right order)
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"The CSV is missing required columns: {missing}")

df = df[expected_cols]

# Convert parsed datetime columns to text for SQLite; columns that were not
# parsed (already strings) are stored unchanged.
for c in parse_date_cols:
    if pd.api.types.is_datetime64_any_dtype(df[c]):
        df[c] = df[c].dt.strftime("%Y-%m-%d %H:%M:%S")

# Enforce nullable integer types so missing values do not turn ids into floats
int_cols = ["subject_id","hadm_id","hospital_expire_flag"]
for c in int_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

# The table was defined explicitly above, so rows are appended to it.
with conn:
    df.to_sql("admissions", conn, if_exists="append", index=False)

# Quick verification
row_count = conn.execute("SELECT COUNT(*) FROM admissions").fetchone()[0]
print(f"✅ Inserted {row_count} rows into admissions")

# Peek at a few rows.
# NOTE: `conn` is intentionally left open here; the following cells reuse it.
preview = pd.read_sql("SELECT * FROM admissions LIMIT 5;", conn)
preview


### 2.1.02 `Hosp: OMR`

In [None]:
# === CREATE TABLE: omr ===
# Creates the `omr` table and loads it from the CSV at settings.omr_csv.
# Requires `settings` and `pd` from earlier cells.
import sqlite3

# Resolve the CSV path before dropping/recreating the table, so a missing
# setting does not leave an empty table behind.
omr_csv_path = getattr(settings, "omr_csv", None)
if omr_csv_path is None:
    raise ValueError("⚠️ Please define settings.omr_csv with the path to your OMR CSV file.")

# Reuse the shared connection when it is still usable; otherwise (never
# created, or already closed) open a new one on the configured database.
try:
    conn.execute("SELECT 1")
except (NameError, sqlite3.ProgrammingError):
    conn = sqlite3.connect(settings.DB_path)

schema_sql_omr = """
DROP TABLE IF EXISTS omr;
CREATE TABLE omr (
    subject_id    INTEGER NOT NULL,
    chartdate     TEXT    NOT NULL,   -- store DATE as ISO-8601 'YYYY-MM-DD'
    seq_num       INTEGER NOT NULL,
    result_name   TEXT    NOT NULL,
    result_value  TEXT    NOT NULL
);
/* Optional indexes (uncomment if useful)
CREATE INDEX IF NOT EXISTS idx_omr_subject ON omr(subject_id);
CREATE INDEX IF NOT EXISTS idx_omr_subject_date ON omr(subject_id, chartdate);
*/
"""

with conn:
    conn.executescript(schema_sql_omr)
print("✅ Table 'omr' created.")

# === LOAD CSV -> omr ===
# Columns expected in the CSV, in table order
expected_cols = ["subject_id", "chartdate", "seq_num", "result_name", "result_value"]

# `infer_datetime_format` / `keep_date_col` are deprecated in pandas 2.x and
# have no effect for a single date column, so they are not passed.
try:
    df_omr = pd.read_csv(omr_csv_path, parse_dates=["chartdate"])
except ValueError:
    # 'chartdate' is absent or unparsable; read plainly so the column check
    # below can report what is missing.
    df_omr = pd.read_csv(omr_csv_path)

# If your CSV headers differ, map them here (example shown; edit/remove as needed)
# rename_map = {"SUBJECT_ID": "subject_id", "CHARTDATE": "chartdate", "SEQ_NUM": "seq_num",
#               "RESULT_NAME": "result_name", "RESULT_VALUE": "result_value"}
rename_map = {}
if rename_map:
    df_omr = df_omr.rename(columns=rename_map)

# Validate required columns, enforce order
missing = [c for c in expected_cols if c not in df_omr.columns]
if missing:
    raise ValueError(f"OMR CSV is missing required columns: {missing}")

df_omr = df_omr[expected_cols]

# Format chartdate to 'YYYY-MM-DD' strings (SQLite stores as TEXT)
chartdate = df_omr["chartdate"]
if pd.api.types.is_datetime64_any_dtype(chartdate):
    df_omr["chartdate"] = chartdate.dt.strftime("%Y-%m-%d")
else:
    # Already text: keep only the date part. Nulls are left as nulls (not the
    # string "nan") so the NOT NULL constraint can still catch them.
    df_omr["chartdate"] = chartdate.where(chartdate.isna(), chartdate.astype(str).str[:10])

# Clean integer columns
for c in ["subject_id", "seq_num"]:
    df_omr[c] = pd.to_numeric(df_omr[c], errors="coerce").astype("Int64")

# Insert into SQLite
with conn:
    df_omr.to_sql("omr", conn, if_exists="append", index=False)

# Quick verification
rows = conn.execute("SELECT COUNT(*) FROM omr;").fetchone()[0]
print(f"✅ Loaded {rows} rows into 'omr'.")
display(pd.read_sql("SELECT * FROM omr LIMIT 5;", conn))


### 2.1.03 `Hosp: Provider`

In [None]:
# === CREATE TABLE: provider ===
# Creates the single-column `provider` table and loads the distinct
# provider ids from the CSV at settings.provider_csv.
# Requires `settings` and `pd` from earlier cells.
import sqlite3

# Resolve the CSV path before dropping/recreating the table, so a missing
# setting does not leave an empty table behind.
provider_csv_path = getattr(settings, "provider_csv", None)
if provider_csv_path is None:
    raise ValueError("⚠️ Please define settings.provider_csv with the path to your provider CSV file.")

# Reuse the shared connection when it is still usable; otherwise (never
# created, or already closed) open a new one on the configured database.
try:
    conn.execute("SELECT 1")
except (NameError, sqlite3.ProgrammingError):
    conn = sqlite3.connect(settings.DB_path)

schema_sql_provider = """
DROP TABLE IF EXISTS provider;
CREATE TABLE provider (
    provider_id TEXT NOT NULL
);
/* Optional index if provider_id is often used for joins */
-- CREATE UNIQUE INDEX IF NOT EXISTS idx_provider_id ON provider(provider_id);
"""

with conn:
    conn.executescript(schema_sql_provider)
print("✅ Table 'provider' created.")

# === LOAD CSV -> provider ===
df_provider = pd.read_csv(provider_csv_path, dtype=str)  # ensure provider_id stays as text

# Lower-case the headers so variants such as 'PROVIDER_ID' are accepted
df_provider = df_provider.rename(columns={c: c.lower() for c in df_provider.columns})

# Validate expected column
if "provider_id" not in df_provider.columns:
    raise ValueError("⚠️ The provider CSV must contain a 'provider_id' column.")

# Keep only that column and drop repeated ids
df_provider = df_provider[["provider_id"]].drop_duplicates()

# Insert into SQLite
with conn:
    df_provider.to_sql("provider", conn, if_exists="append", index=False)

# Quick verification
count = conn.execute("SELECT COUNT(*) FROM provider;").fetchone()[0]
print(f"✅ Loaded {count} rows into 'provider'.")
display(pd.read_sql("SELECT * FROM provider LIMIT 5;", conn))
