# 02 — Feature Engineering (MIMIC-IV Sepsis DRL)

> **Input:** `mimic_hourly_binned.parquet`  
> **Output:** `mimic_hourly_binned_feature_engineered.parquet`  

2024-2025 literatürüne uygun ~20-25 feature'lık standart state vektörü oluşturma:

| # | Adım | Yeni Feature(lar) |
|---|------|--------------------|
| 1 | Veri yükleme & kontrol | — |
| 2 | Norepinefrin eşdeğeri | `total_vaso_equiv` |
| 3 | Sıvı dengesi (4h) | `fluid_balance_4h` |
| 4 | SOFA skoru (6 organ) | `sofa_score` |
| 5 | Mek. ventilasyon & Şok indeksi | `mechanical_ventilation`, `shock_index` |
| 6 | Lag features | `prev_fluid_dose`, `prev_vaso_dose` |
| 7 | Final state vector & kayıt | ~20-25 feature → parquet |

In [1]:
import polars as pl
from pathlib import Path

# ─── Paths ─────────────────────────────────────────
PROJECT_ROOT = Path.cwd().parent  # notebooks/ → proje kökü
DATA_DIR     = PROJECT_ROOT / "data" / "processed"
INPUT_PATH   = DATA_DIR / "mimic_hourly_binned.parquet"
OUTPUT_PATH  = DATA_DIR / "mimic_hourly_binned_feature_engineered.parquet"

print(f"Input  : {INPUT_PATH}")
print(f"Output : {OUTPUT_PATH}")
print(f"Dosya mevcut: {INPUT_PATH.exists()}")

Input  : /Users/enesdemir/Documents/mimic-sepsis-drl/data/processed/mimic_hourly_binned.parquet
Output : /Users/enesdemir/Documents/mimic-sepsis-drl/data/processed/mimic_hourly_binned_feature_engineered.parquet
Dosya mevcut: True


## 1. Veri Yükleme & Şema Kontrolü

In [2]:
df = pl.read_parquet(INPUT_PATH)

print(f"Shape: {df.shape}")
print(f"Sütunlar ({len(df.columns)}):")
for col in df.columns:
    null_pct = df[col].null_count() / len(df) * 100
    print(f"  {col:30s}  dtype={str(df[col].dtype):12s}  null={null_pct:.1f}%")

Shape: (8808129, 43)
Sütunlar (43):
  stay_id                         dtype=Int64         null=0.0%
  hour_bin                        dtype=Datetime(time_unit='us', time_zone=None)  null=0.0%
  heart_rate                      dtype=Float64       null=0.4%
  sbp                             dtype=Float64       null=0.8%
  dbp                             dtype=Float64       null=0.8%
  mbp                             dtype=Float64       null=0.8%
  resp_rate                       dtype=Float64       null=0.5%
  spo2                            dtype=Float64       null=0.5%
  temp_c                          dtype=Float64       null=83.4%
  fio2                            dtype=Float64       null=32.4%
  lactate                         dtype=Float64       null=28.0%
  creatinine                      dtype=Float64       null=5.5%
  bilirubin_total                 dtype=Float64       null=37.3%
  platelet                        dtype=Float64       null=5.8%
  wbc                             dt

In [3]:
df.head(5)

stay_id,hour_bin,heart_rate,sbp,dbp,mbp,resp_rate,spo2,temp_c,fio2,lactate,creatinine,bilirubin_total,platelet,wbc,bun,glucose,sodium,potassium,hemoglobin,hematocrit,bicarbonate,chloride,anion_gap,inr,pao2,paco2,ph,urine_output,norepinephrine_dose,epinephrine_dose,phenylephrine_dose,vasopressin_dose,dopamine_dose,dobutamine_dose,crystalloid_ml,gcs_eye,gcs_motor,gcs_verbal,gcs_total,gender,age,admission_type
i64,datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,str
30000153,2174-09-29 12:00:00,100.0,136.0,74.0,89.0,18.0,100.0,,75.0,,,,,,,,,,,35.0,,,,,,,,280.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,3.0,5.0,1.0,9.0,"""M""",61,"""EW EMER."""
30000153,2174-09-29 13:00:00,104.0,132.0,74.5,84.0,16.0,100.0,,75.0,1.3,,,,,,,,,,35.0,,,,,221.0,45.0,7.3,280.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,3.0,5.0,1.0,9.0,"""M""",61,"""EW EMER."""
30000153,2174-09-29 14:00:00,83.0,131.0,61.0,80.0,16.0,100.0,,75.0,2.1,,,,,,,,,,35.0,,,,,263.0,45.0,7.3,45.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,3.0,5.0,1.0,9.0,"""M""",61,"""EW EMER."""
30000153,2174-09-29 15:00:00,92.0,123.0,65.0,84.0,14.0,100.0,,50.0,2.1,0.9,,173.0,17.0,22.0,192.0,142.0,4.4,10.8,31.7,19.0,115.0,12.0,1.1,263.0,45.0,7.3,50.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,3.0,5.0,1.0,9.0,"""M""",61,"""EW EMER."""
30000153,2174-09-29 16:00:00,83.0,109.0,55.0,71.0,16.0,100.0,,50.0,2.1,0.9,,173.0,17.0,22.0,192.0,142.0,4.4,10.8,31.7,19.0,115.0,12.0,1.1,215.0,42.0,7.31,50.0,0.0,0.0,0.0,0.0,0.0,0.0,941.299999,4.0,6.0,1.0,11.0,"""M""",61,"""EW EMER."""


## 2. Norepinefrin Eşdeğeri (Vazopressor Standardizasyonu)

Farklı vazopressorleri tek skalaya indirgeme (2024-2025 standardı):

| İlaç | Dönüşüm Oranı |
|------|---------------|
| Norepinefrin | ×1.0 |
| Epinefrin | ×0.1 |
| Phenylefrin | ×0.1 |
| Vasopressin | ×0.4 |
| Dopamin | ×0.01 |
| Dobutamin | ×0.0 (inotrop, ayrı tutulur) |

In [4]:
# ─── Norepinefrin eşdeğeri dönüşüm oranları ───────
VASO_CONVERSION = {
    "norepinephrine_dose": 1.0,
    "epinephrine_dose":    0.1,
    "phenylephrine_dose":  0.1,
    "vasopressin_dose":    0.4,
    "dopamine_dose":       0.01,
    "dobutamine_dose":     0.0,   # İnotrop etki — vazopressor değil
}

# Mevcut vazo kolonlarını kontrol et
available_vaso_cols = [c for c in VASO_CONVERSION if c in df.columns]
print(f"Mevcut vazopressor kolonları: {available_vaso_cols}")

# Her bir ilacın eşdeğerini hesapla
equiv_exprs = []
for col, ratio in VASO_CONVERSION.items():
    if col in df.columns:
        equiv_exprs.append(
            (pl.col(col).fill_null(0) * ratio).alias(col.replace("_dose", "_equiv"))
        )

df = df.with_columns(equiv_exprs)

# Toplam vazopressor eşdeğeri
equiv_cols = [col.replace("_dose", "_equiv") for col in available_vaso_cols]
df = df.with_columns(
    pl.sum_horizontal([pl.col(c) for c in equiv_cols]).alias("total_vaso_equiv")
)

print(f"\ntotal_vaso_equiv istatistikleri:")
df.select("total_vaso_equiv").describe()

Mevcut vazopressor kolonları: ['norepinephrine_dose', 'epinephrine_dose', 'phenylephrine_dose', 'vasopressin_dose', 'dopamine_dose', 'dobutamine_dose']

total_vaso_equiv istatistikleri:


statistic,total_vaso_equiv
str,f64
"""count""",8808129.0
"""null_count""",0.0
"""mean""",0.277242
"""std""",1.735346
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",0.0
"""max""",1100.354014


## 3. Sıvı Dengesi (Net Fluid Balance — 4h)

```
fluid_balance_4h = crystalloid_ml − urine_output
```

- **Pozitif:** Ödem riski  
- **Negatif:** Hipovolemi

In [5]:
df = df.with_columns(
    (
        pl.col("crystalloid_ml").fill_null(0) 
        - pl.col("urine_output").fill_null(0)
    ).alias("fluid_balance_4h")
)

print("fluid_balance_4h istatistikleri:")
df.select("fluid_balance_4h").describe()

fluid_balance_4h istatistikleri:


statistic,fluid_balance_4h
str,f64
"""count""",8808129.0
"""null_count""",0.0
"""mean""",31.625266
"""std""",1466.576125
"""min""",-876537.000001
"""25%""",-145.931016
"""50%""",-16.889828
"""75%""",114.250006
"""max""",1000300.0


## 4. SOFA Skoru (Sequential Organ Failure Assessment)

Vincent et al. (1996) — 6 organ, her biri 0-4 puan, toplam **0-24**.

| Organ | Metrik | Puan Aralığı |
|-------|--------|--------------|
| Solunum | PaO2/FiO2 | 0-4 |
| Kardiyovasküler | MAP + Vazo dozu | 0-4 |
| Böbrek | Kreatinin ∨ İdrar | 0-4 |
| Nörolojik | GCS | 0-4 |
| Koagülasyon | Trombosit | 0-4 |
| Karaciğer | Bilirubin | 0-4 |

In [6]:
# ═══════════════════════════════════════════════════
# 4A. SOFA — Solunum (PaO2/FiO2)
# ═══════════════════════════════════════════════════

# FiO2: chartevents'te % olarak (21-100), orana çevir
df = df.with_columns(
    pl.when(pl.col("fio2") > 1.0)
    .then(pl.col("fio2") / 100.0)   # 21% → 0.21
    .otherwise(pl.col("fio2"))        # Zaten oran ise olduğu gibi
    .alias("fio2_ratio")
)

# PF oranı
df = df.with_columns(
    (pl.col("pao2") / pl.col("fio2_ratio")).alias("pf_ratio")
)

# Mekanik ventilasyon flag (SOFA resp 3-4 için gerekli)
df = df.with_columns(
    pl.when(
        (pl.col("fio2").is_not_null() & (pl.col("fio2") > 21))
    )
    .then(pl.lit(1))
    .otherwise(pl.lit(0))
    .alias("mechanical_ventilation")
)

# SOFA Respiratory skoru
df = df.with_columns(
    pl.when(pl.col("pf_ratio").is_null())
    .then(pl.lit(None).cast(pl.Int32))
    .when(pl.col("pf_ratio") > 400)
    .then(pl.lit(0))
    .when(pl.col("pf_ratio") > 300)
    .then(pl.lit(1))
    .when(pl.col("pf_ratio") > 200)
    .then(pl.lit(2))
    .when((pl.col("pf_ratio") > 100) & (pl.col("mechanical_ventilation") == 1))
    .then(pl.lit(3))
    .when((pl.col("pf_ratio") <= 100) & (pl.col("mechanical_ventilation") == 1))
    .then(pl.lit(4))
    .otherwise(pl.lit(2))  # PF<=200 ama ventilasyon yoksa max 2
    .alias("sofa_resp")
)

print("SOFA Respiratory dağılımı:")
df.group_by("sofa_resp").len().sort("sofa_resp")

SOFA Respiratory dağılımı:


sofa_resp,len
i32,u32
,3338801
0.0,683094
1.0,842241
2.0,1496423
3.0,1714091
4.0,733479


In [7]:
# ═══════════════════════════════════════════════════
# 4B. SOFA — Kardiyovasküler
#     MAP + vazopressor dozu (norepinefrin eşdeğeri)
# ═══════════════════════════════════════════════════

df = df.with_columns(
    pl.when(pl.col("mbp").is_null() & pl.col("total_vaso_equiv").is_null())
    .then(pl.lit(None).cast(pl.Int32))
    .when((pl.col("mbp").fill_null(70) >= 70) & (pl.col("total_vaso_equiv").fill_null(0) == 0))
    .then(pl.lit(0))
    .when(pl.col("mbp").fill_null(70) < 70)
    .then(pl.lit(1))
    .when(pl.col("total_vaso_equiv") <= 0.1)
    .then(pl.lit(2))
    .when(pl.col("total_vaso_equiv") <= 0.5)
    .then(pl.lit(3))
    .otherwise(pl.lit(4))
    .alias("sofa_cardio")
)

print("SOFA Cardiovascular dağılımı:")
df.group_by("sofa_cardio").len().sort("sofa_cardio")

SOFA Cardiovascular dağılımı:


sofa_cardio,len
i32,u32
0,5795830
1,2309005
2,94269
3,239549
4,369476


In [8]:
# ═══════════════════════════════════════════════════
# 4C. SOFA — Böbrek (Kreatinin)
#     İdrar 24h kriteri burada uygulanmaz (saatlik veri),
#     yalnızca kreatinin kullanılır.
# ═══════════════════════════════════════════════════

df = df.with_columns(
    pl.when(pl.col("creatinine").is_null())
    .then(pl.lit(None).cast(pl.Int32))
    .when(pl.col("creatinine") < 1.2)
    .then(pl.lit(0))
    .when(pl.col("creatinine") < 2.0)
    .then(pl.lit(1))
    .when(pl.col("creatinine") < 3.5)
    .then(pl.lit(2))
    .when(pl.col("creatinine") < 5.0)
    .then(pl.lit(3))
    .otherwise(pl.lit(4))
    .alias("sofa_renal")
)

print("SOFA Renal dağılımı:")
df.group_by("sofa_renal").len().sort("sofa_renal")

SOFA Renal dağılımı:


sofa_renal,len
i32,u32
,480550
0.0,4975961
1.0,1686184
2.0,979055
3.0,378714
4.0,307665


In [9]:
# ═══════════════════════════════════════════════════
# 4D. SOFA — Nörolojik (GCS)
#     Sedasyonlu hasta: forward-fill zaten pipeline'da uygulandı
# ═══════════════════════════════════════════════════

df = df.with_columns(
    pl.when(pl.col("gcs_total").is_null())
    .then(pl.lit(None).cast(pl.Int32))
    .when(pl.col("gcs_total") >= 15)
    .then(pl.lit(0))
    .when(pl.col("gcs_total") >= 13)
    .then(pl.lit(1))
    .when(pl.col("gcs_total") >= 10)
    .then(pl.lit(2))
    .when(pl.col("gcs_total") >= 6)
    .then(pl.lit(3))
    .otherwise(pl.lit(4))
    .alias("sofa_neuro")
)

print("SOFA Neurological dağılımı:")
df.group_by("sofa_neuro").len().sort("sofa_neuro")

SOFA Neurological dağılımı:


sofa_neuro,len
i32,u32
,105917
0.0,3826558
1.0,1304030
2.0,1538324
3.0,1223074
4.0,810226


In [10]:
# ═══════════════════════════════════════════════════
# 4E. SOFA — Koagülasyon (Trombosit)
# ═══════════════════════════════════════════════════

df = df.with_columns(
    pl.when(pl.col("platelet").is_null())
    .then(pl.lit(None).cast(pl.Int32))
    .when(pl.col("platelet") > 150)
    .then(pl.lit(0))
    .when(pl.col("platelet") > 100)
    .then(pl.lit(1))
    .when(pl.col("platelet") > 50)
    .then(pl.lit(2))
    .when(pl.col("platelet") > 20)
    .then(pl.lit(3))
    .otherwise(pl.lit(4))
    .alias("sofa_coag")
)

print("SOFA Coagulation dağılımı:")
df.group_by("sofa_coag").len().sort("sofa_coag")

SOFA Coagulation dağılımı:


sofa_coag,len
i32,u32
,509026
0.0,5604087
1.0,1422701
2.0,882591
3.0,321949
4.0,67775


In [11]:
# ═══════════════════════════════════════════════════
# 4F. SOFA — Karaciğer (Bilirubin)
# ═══════════════════════════════════════════════════

df = df.with_columns(
    pl.when(pl.col("bilirubin_total").is_null())
    .then(pl.lit(None).cast(pl.Int32))
    .when(pl.col("bilirubin_total") < 1.2)
    .then(pl.lit(0))
    .when(pl.col("bilirubin_total") < 2.0)
    .then(pl.lit(1))
    .when(pl.col("bilirubin_total") < 6.0)
    .then(pl.lit(2))
    .when(pl.col("bilirubin_total") < 12.0)
    .then(pl.lit(3))
    .otherwise(pl.lit(4))
    .alias("sofa_liver")
)

print("SOFA Liver dağılımı:")
df.group_by("sofa_liver").len().sort("sofa_liver")

SOFA Liver dağılımı:


sofa_liver,len
i32,u32
,3284098
0.0,3892956
1.0,585822
2.0,592125
3.0,210916
4.0,242212


In [12]:
# ═══════════════════════════════════════════════════
# 4G. Toplam SOFA Skoru (0-24)
# ═══════════════════════════════════════════════════

sofa_components = [
    "sofa_resp", "sofa_cardio", "sofa_renal",
    "sofa_neuro", "sofa_coag", "sofa_liver"
]

df = df.with_columns(
    pl.sum_horizontal([pl.col(c).fill_null(0) for c in sofa_components])
    .alias("sofa_score")
)

print("SOFA Score istatistikleri:")
df.select("sofa_score").describe()

SOFA Score istatistikleri:


statistic,sofa_score
str,f64
"""count""",8808129.0
"""null_count""",0.0
"""mean""",4.734691
"""std""",3.566224
"""min""",0.0
"""25%""",2.0
"""50%""",4.0
"""75%""",7.0
"""max""",23.0


In [13]:
# Hızlı doğrulama
assert df["sofa_score"].min() >= 0, "SOFA min < 0!"
assert df["sofa_score"].max() <= 24, "SOFA max > 24!"
print(f"✅ SOFA skoru aralığı: [{df['sofa_score'].min()}, {df['sofa_score'].max()}]")

✅ SOFA skoru aralığı: [0, 23]


## 5. Mekanik Ventilasyon & Şok İndeksi

- **Mekanik Ventilasyon:** Zaten yukarıda (SOFA Resp) hesaplandı — `FiO2 > 21%` ise 1  
- **Şok İndeksi:** `HR / SBP` — Normal 0.5-0.7, Yüksek >1.0 (şok belirtisi)

In [14]:
# ─── Şok İndeksi ──────────────────────────────────
df = df.with_columns(
    (pl.col("heart_rate") / pl.col("sbp")).alias("shock_index")
)

print("Shock Index istatistikleri:")
df.select("shock_index").describe()

Shock Index istatistikleri:


statistic,shock_index
str,f64
"""count""",8739273.0
"""null_count""",68856.0
"""mean""",
"""std""",
"""min""",-3094.807692
"""25%""",0.589147
"""50%""",0.712121
"""75%""",0.86
"""max""",inf


## 6. Lag Features (Önceki Timestep Dozları)

Agent "şimdi ne yapmalıyım?" derken "az önce ne yaptım?" bilmeli.  
İlaç kümülasyonu nedeniyle zorunlu.

```
prev_fluid_dose(t) = crystalloid_ml(t-1)
prev_vaso_dose(t)  = total_vaso_equiv(t-1)
```

İlk timestep → `null` kalır (sonradan impute edilecek).

In [15]:
# Sıralama garanti
df = df.sort("stay_id", "hour_bin")

# Lag features (stay_id içinde shift)
df = df.with_columns([
    pl.col("crystalloid_ml").shift(1).over("stay_id").alias("prev_fluid_dose"),
    pl.col("total_vaso_equiv").shift(1).over("stay_id").alias("prev_vaso_dose"),
])

print("Lag features (ilk 10 satır, tek stay_id):")
sample_stay = df["stay_id"].drop_nulls()[0]
df.filter(pl.col("stay_id") == sample_stay).select(
    "stay_id", "hour_bin", "crystalloid_ml", "prev_fluid_dose",
    "total_vaso_equiv", "prev_vaso_dose"
).head(10)

Lag features (ilk 10 satır, tek stay_id):


stay_id,hour_bin,crystalloid_ml,prev_fluid_dose,total_vaso_equiv,prev_vaso_dose
i64,datetime[μs],f64,f64,f64,f64
30000153,2174-09-29 12:00:00,30.0,,0.0,
30000153,2174-09-29 13:00:00,30.0,30.0,0.0,0.0
30000153,2174-09-29 14:00:00,30.0,30.0,0.0,0.0
30000153,2174-09-29 15:00:00,30.0,30.0,0.0,0.0
30000153,2174-09-29 16:00:00,941.299999,30.0,0.0,0.0
30000153,2174-09-29 17:00:00,941.299999,941.299999,0.0,0.0
30000153,2174-09-29 18:00:00,941.299999,941.299999,0.0,0.0
30000153,2174-09-29 19:00:00,941.299999,941.299999,0.0,0.0
30000153,2174-09-29 20:00:00,941.299999,941.299999,0.0,0.0
30000153,2174-09-29 21:00:00,199.999995,941.299999,0.0,0.0


## 7. Final State Vector & Parquet Kayıt

2024-2025 MIMIC-IV Sepsis DRL standardı — ~20-25 feature:

| Kategori | Feature'lar |
|----------|-------------|
| **Lag** | `prev_fluid_dose`, `prev_vaso_dose` |
| **Vitals** | `heart_rate`, `sbp`, `dbp`, `mbp`, `resp_rate`, `spo2`, `temp_c` |
| **Labs** | `lactate`, `creatinine`, `platelet`, `bun`, `wbc`, `bilirubin_total` |
| **Organ** | `sofa_score`, `gcs_total`, `urine_output` |
| **Hemodinamik** | `shock_index`, `mechanical_ventilation` |
| **Sıvı** | `fluid_balance_4h` |
| **Demografi** | `age`, `gender` |

In [16]:
# ─── State vector tanımı ───────────────────────────
STATE_FEATURES = [
    # Lag
    "prev_fluid_dose", "prev_vaso_dose",
    # Vitals
    "heart_rate", "sbp", "dbp", "mbp", "resp_rate", "spo2", "temp_c",
    # Labs
    "lactate", "creatinine", "platelet", "bun", "wbc", "bilirubin_total",
    # Organ function
    "sofa_score", "gcs_total", "urine_output",
    # Hemodynamic indices
    "shock_index", "mechanical_ventilation",
    # Fluid
    "fluid_balance_4h",
    # Demographics
    "age", "gender",
]

# Meta sütunlar (ID + zaman)
META_COLS = ["stay_id", "hour_bin"]

# Mevcut olan feature'ları filtrele
available_features = [f for f in STATE_FEATURES if f in df.columns]
missing_features   = [f for f in STATE_FEATURES if f not in df.columns]

print(f"State vector boyutu: {len(available_features)} feature")
if missing_features:
    print(f"⚠️  Eksik feature'lar: {missing_features}")
else:
    print("✅ Tüm feature'lar mevcut!")

State vector boyutu: 23 feature
✅ Tüm feature'lar mevcut!


In [17]:
# ─── Gender encode (M=0, F=1) ─────────────────────
if "gender" in df.columns:
    df = df.with_columns(
        pl.when(pl.col("gender") == "M")
        .then(pl.lit(0))
        .when(pl.col("gender") == "F")
        .then(pl.lit(1))
        .otherwise(pl.lit(None))
        .cast(pl.Int32)
        .alias("gender")
    )

print("Gender dağılımı:")
df.group_by("gender").len().sort("gender")

Gender dağılımı:


gender,len
i32,u32
0,5074695
1,3733434


In [18]:
# ─── Final DataFrame oluştur ──────────────────────
df_final = df.select(META_COLS + available_features)

print(f"Final shape: {df_final.shape}")
print(f"Sütunlar ({len(df_final.columns)}): {df_final.columns}")
print("\nÖzet istatistikler:")
df_final.describe()

Final shape: (8808129, 25)
Sütunlar (25): ['stay_id', 'hour_bin', 'prev_fluid_dose', 'prev_vaso_dose', 'heart_rate', 'sbp', 'dbp', 'mbp', 'resp_rate', 'spo2', 'temp_c', 'lactate', 'creatinine', 'platelet', 'bun', 'wbc', 'bilirubin_total', 'sofa_score', 'gcs_total', 'urine_output', 'shock_index', 'mechanical_ventilation', 'fluid_balance_4h', 'age', 'gender']

Özet istatistikler:


statistic,stay_id,hour_bin,prev_fluid_dose,prev_vaso_dose,heart_rate,sbp,dbp,mbp,resp_rate,spo2,temp_c,lactate,creatinine,platelet,bun,wbc,bilirubin_total,sofa_score,gcs_total,urine_output,shock_index,mechanical_ventilation,fluid_balance_4h,age,gender
str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",8808129.0,"""8808129""",7117971.0,8713671.0,8770031.0,8740315.0,8740024.0,8741935.0,8760778.0,8764642.0,1465057.0,6340328.0,8327579.0,8299103.0,8326449.0,8295935.0,5524031.0,8808129.0,8702212.0,8293822.0,8739273.0,8808129.0,8808129.0,8808129.0,8808129.0
"""null_count""",0.0,"""0""",1690158.0,94458.0,38098.0,67814.0,68105.0,66194.0,47351.0,43487.0,7343072.0,2467801.0,480550.0,509026.0,481680.0,512194.0,3284098.0,0.0,105917.0,514307.0,68856.0,0.0,0.0,0.0,0.0
"""mean""",34974000.0,"""2153-10-15 03:38:30.535507""",251.733846,0.27857,87.819938,120.478139,65.184255,84.512153,21.169934,137.767256,38.354066,3.263103,1.481856,219.682421,30.943375,12.125175,2.209595,4.734691,11.923467,184.733394,,0.663667,31.625266,62.641503,0.423862
"""std""",2884300.0,,1580.41945,1.732863,3797.403494,491.228532,259.152687,4828.722159,2407.519375,19406.703678,9.811226,1433.422544,1.46838,132.416995,25.204517,8.458586,5.210624,3.566224,3.831584,364.241888,,0.472454,1466.576125,16.111648,0.494169
"""min""",30000153.0,"""2110-01-11 10:00:00""",0.0,0.0,-241395.0,-94.0,-40.0,-9806.0,0.0,-951234.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,-3765.0,-3094.807692,0.0,-876537.000001,18.0,0.0
"""25%""",32477246.0,"""2133-12-07 02:00:00""",47.900002,0.0,73.0,104.0,53.0,69.0,16.0,95.0,36.6,1.0,0.7,131.0,14.0,7.7,0.4,2.0,10.0,50.0,0.589147,0.0,-145.931016,53.0,0.0
"""50%""",34965363.0,"""2153-08-17 00:00:00""",100.0,0.0,85.0,118.0,62.0,78.0,19.25,97.0,37.1,1.4,1.0,196.0,23.0,10.6,0.6,4.0,14.0,120.0,0.712121,1.0,-16.889828,64.0,0.0
"""75%""",37460082.0,"""2173-11-27 22:00:00""",295.000009,0.0,98.0,134.0,73.0,89.0,24.0,99.0,37.6,1.9,1.7,280.0,39.0,14.5,1.4,7.0,15.0,250.0,0.86,1.0,114.250006,75.0,1.0
"""max""",39999858.0,"""2214-08-11 05:00:00""",1000400.0,1100.354014,10000000.0,1003110.0,114109.0,8999090.0,7000400.0,9900000.0,987.4,1276103.0,80.0,2385.0,305.0,572.5,87.2,23.0,15.0,876587.0,inf,1.0,1000300.0,91.0,1.0


In [19]:
# ─── Null yüzdeleri ────────────────────────────────
print("Null yüzdeleri (%):\n")
for col in available_features:
    null_pct = df_final[col].null_count() / len(df_final) * 100
    bar = "█" * int(null_pct // 2)
    print(f"  {col:30s} {null_pct:6.1f}%  {bar}")

Null yüzdeleri (%):

  prev_fluid_dose                  19.2%  █████████
  prev_vaso_dose                    1.1%  
  heart_rate                        0.4%  
  sbp                               0.8%  
  dbp                               0.8%  
  mbp                               0.8%  
  resp_rate                         0.5%  
  spo2                              0.5%  
  temp_c                           83.4%  █████████████████████████████████████████
  lactate                          28.0%  ██████████████
  creatinine                        5.5%  ██
  platelet                          5.8%  ██
  bun                               5.5%  ██
  wbc                               5.8%  ██
  bilirubin_total                  37.3%  ██████████████████
  sofa_score                        0.0%  
  gcs_total                         1.2%  
  urine_output                      5.8%  ██
  shock_index                       0.8%  
  mechanical_ventilation            0.0%  
  fluid_balance_4h         

In [20]:
# ─── Parquet'e yaz ─────────────────────────────────
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df_final.write_parquet(OUTPUT_PATH)

# Doğrulama
file_size_mb = OUTPUT_PATH.stat().st_size / 1024 / 1024
print(f"\n✅ Kaydedildi: {OUTPUT_PATH}")
print(f"   Boyut: {file_size_mb:.1f} MB")
print(f"   Satır: {df_final.shape[0]:,}")
print(f"   Sütun: {df_final.shape[1]}")


✅ Kaydedildi: /Users/enesdemir/Documents/mimic-sepsis-drl/data/processed/mimic_hourly_binned_feature_engineered.parquet
   Boyut: 176.1 MB
   Satır: 8,808,129
   Sütun: 25


In [21]:
# ─── Okuma doğrulaması ─────────────────────────────
df_check = pl.read_parquet(OUTPUT_PATH)
assert df_check.shape == df_final.shape, "Shape mismatch!"
assert df_check.columns == df_final.columns, "Column mismatch!"
print(f"✅ Okuma doğrulaması başarılı: {df_check.shape}")
df_check.head(5)

✅ Okuma doğrulaması başarılı: (8808129, 25)


stay_id,hour_bin,prev_fluid_dose,prev_vaso_dose,heart_rate,sbp,dbp,mbp,resp_rate,spo2,temp_c,lactate,creatinine,platelet,bun,wbc,bilirubin_total,sofa_score,gcs_total,urine_output,shock_index,mechanical_ventilation,fluid_balance_4h,age,gender
i64,datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,f64,f64,f64,i32,f64,i64,i32
30000153,2174-09-29 12:00:00,,,100.0,136.0,74.0,89.0,18.0,100.0,,,,,,,,3,9.0,280.0,0.735294,1,-250.0,61,0
30000153,2174-09-29 13:00:00,30.0,0.0,104.0,132.0,74.5,84.0,16.0,100.0,,1.3,,,,,,5,9.0,280.0,0.787879,1,-250.0,61,0
30000153,2174-09-29 14:00:00,30.0,0.0,83.0,131.0,61.0,80.0,16.0,100.0,,2.1,,,,,,4,9.0,45.0,0.633588,1,-15.0,61,0
30000153,2174-09-29 15:00:00,30.0,0.0,92.0,123.0,65.0,84.0,14.0,100.0,,2.1,0.9,173.0,22.0,17.0,,3,9.0,50.0,0.747967,1,-20.0,61,0
30000153,2174-09-29 16:00:00,30.0,0.0,83.0,109.0,55.0,71.0,16.0,100.0,,2.1,0.9,173.0,22.0,17.0,,2,11.0,50.0,0.761468,1,891.299999,61,0
