In [1]:
import gc
import gzip
import pandas as pd
import os
import numpy as np

In [2]:
# Load the first-24h stroke cohort export. The "row_count" column is dropped
# when it is present; errors="ignore" makes its absence a no-op.
file_path = r"CSV/Exports/Temp/10_keep_24h/merged_df_of_first_24h.csv"
stroke_df = pd.read_csv(file_path).drop(columns=["row_count"], errors="ignore")

# Filter arrhythmia diagnoses

In [3]:
# Input locations: two gzip-compressed MIMIC-IV hospital tables and the
# CSV listing the arrhythmia ICD codes of interest
compressed_file_path = r"../00_Datasets/mimic-iv-3_1/hosp/diagnoses_icd.csv.gz"
compressed_file_admission = r'../00_Datasets/mimic-iv-3_1/hosp/admissions.csv.gz'
arrhythmia_path = r"CSV/Imports/arrhythmia.csv"

# Load the diagnoses table first, then the admissions table
diagnosis_df, admission_df = (
    pd.read_csv(gz_path, compression='gzip')
    for gz_path in (compressed_file_path, compressed_file_admission)
)

# Load the arrhythmia code list (its "icd_code" column is used downstream)
icd_df = pd.read_csv(arrhythmia_path)

In [4]:
# Keep only the diagnosis rows whose ICD code appears in the arrhythmia code list.

# Whitespace-trimmed, de-duplicated string codes from the arrhythmia import
codes = icd_df["icd_code"].astype(str).str.strip().unique()

# Apply the same string/trim normalization on the diagnoses side for matching.
# The rows that are kept retain their original (un-normalized) icd_code values.
is_arrhythmia = diagnosis_df["icd_code"].astype(str).str.strip().isin(codes)
diagnosis_filtered = diagnosis_df.loc[is_arrhythmia].copy()

print(diagnosis_filtered.shape)
display(diagnosis_filtered.head())

(47162, 5)


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
713,10001667,22672901,5,I4891,10
750,10001843,21728396,11,I4891,10
766,10001843,26133978,12,I4892,10
778,10001843,26133978,24,I4891,10
853,10001884,22532141,4,I4891,10


In [5]:
# Restrict the arrhythmia diagnoses to patients that belong to the stroke cohort.
# Matching is on subject_id only, so every arrhythmia-coded admission of a
# cohort patient is kept, whichever hospital admission it belongs to.
keys = stroke_df[["subject_id"]].drop_duplicates()

diagnosis_filter_df = pd.merge(
    diagnosis_filtered,
    keys,
    how="inner",
    on="subject_id",
)

print(diagnosis_filter_df.shape)
display(diagnosis_filter_df.head())

(563, 5)


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10248191,21618503,18,I482,10
1,10259667,27045371,26,I4891,10
2,10277119,20500159,22,I4891,10
3,10277119,20693395,8,I4891,10
4,10374990,21447101,9,I4891,10


In [6]:
# Attach the hospital admission and discharge times to every arrhythmia
# diagnosis row, looked up by (subject_id, hadm_id).

keys = ["subject_id", "hadm_id"]

admission_cols = keys + ["admittime", "dischtime"]

# Drop any admittime/dischtime left over from an earlier run of this cell, so
# that re-running it does not create suffixed admittime_x / admittime_y columns
# (which would break the later cells that read "admittime" / "dischtime").
diagnosis_filter_df = (
    diagnosis_filter_df
    .drop(columns=["admittime", "dischtime"], errors="ignore")
    .merge(
        admission_df[admission_cols],
        on=keys,
        how="left",       # keep diagnosis rows even when no admission row matches
        validate="m:1",   # raise if admission_df repeats a (subject_id, hadm_id) pair
    )
)

In [7]:
# Classify every arrhythmia-coded hospital admission relative to the patient's
# ICU interval: BEFORE_ICU, AFTER_ICU, OVERLAP_OR_WITHIN, or a "no info" label.

# 1) Make sure the timestamp columns are datetime (unparseable values become NaT)
for c in ["admittime", "dischtime"]:
    diagnosis_filter_df[c] = pd.to_datetime(diagnosis_filter_df[c], errors="coerce")

for c in ["icu_intime", "icu_outtime"]:
    stroke_df[c] = pd.to_datetime(stroke_df[c], errors="coerce")

# 2) If stroke_df holds several ICU stays per subject_id, collapse them into a
#    single overall ICU interval (earliest intime -> latest outtime).
#    Consequence: an admission that falls between two ICU stays of the same
#    patient is labelled OVERLAP_OR_WITHIN.
stroke_subject = (stroke_df
                  .groupby("subject_id", as_index=False)
                  .agg(icu_intime=("icu_intime", "min"),
                       icu_outtime=("icu_outtime", "max")))

# 3) Bring icu_intime / icu_outtime onto the diagnosis rows.
#    Columns produced by an earlier run of this cell are dropped first so that
#    a re-run does not create icu_intime_x / icu_intime_y and then fail below.
df = (
    diagnosis_filter_df
    .drop(columns=["icu_intime", "icu_outtime", "admission_vs_icu"], errors="ignore")
    .merge(
        stroke_subject,
        on="subject_id",
        how="left",
        validate="m:1",   # stroke_subject has one row per subject_id by construction
    )
)

# 4) Interval comparison. Comparisons against NaT evaluate to False.
before_icu = df["dischtime"] < df["icu_intime"]
after_icu  = df["admittime"] > df["icu_outtime"]
no_icu_info = df["icu_intime"].isna() | df["icu_outtime"].isna()
no_admission_info = df["admittime"].isna() | df["dischtime"].isna()

# np.select takes the first matching condition. A row with a missing admission
# timestamp is only labelled NO_ADMISSION_INFO when neither BEFORE_ICU nor
# AFTER_ICU could be established from the timestamp that is present; without
# this condition such rows would silently fall into OVERLAP_OR_WITHIN.
df["admission_vs_icu"] = np.select(
    [no_icu_info, before_icu, after_icu, no_admission_info],
    ["NO_ICU_INFO", "BEFORE_ICU", "AFTER_ICU", "NO_ADMISSION_INFO"],
    default="OVERLAP_OR_WITHIN"
)

diagnosis_filter_df = df


In [8]:
# Number of rows per admission-vs-ICU category (missing labels are counted too)
category = diagnosis_filter_df["admission_vs_icu"]
counts = category.value_counts(dropna=False)
print(counts)

# (optional) the same breakdown as percentages, printed with 2 decimals
percent = category.value_counts(normalize=True, dropna=False).mul(100)
print(percent.round(2))

admission_vs_icu
AFTER_ICU    563
Name: count, dtype: int64
admission_vs_icu
AFTER_ICU    100.0
Name: proportion, dtype: float64


# Create columns with arrhythmia ICD codes

In [9]:
# Arrhythmia ICD codes; each one gets its own indicator column in stroke_df
icd_cols = [
    "I48", "I480", "I481", "I4811", "I4819", "I482", "I4820", "I4821",
    "I483", "I484", "I489", "I4891", "I4892", "I49", "I498", "I499",
]

# 1) Create the indicator columns that do not exist yet, initialised to 0
missing = [code for code in icd_cols if code not in stroke_df.columns]
stroke_df[missing] = 0

# (optional) make sure the indicator columns hold integers
stroke_df = stroke_df.astype({code: int for code in icd_cols})

# 2) Position the indicator columns right before hospital_expire_flag;
#    when that column does not exist they are simply placed at the end
flag_col = "hospital_expire_flag"

cols_wo_icd = [name for name in stroke_df.columns if name not in icd_cols]

insert_at = cols_wo_icd.index(flag_col) if flag_col in cols_wo_icd else len(cols_wo_icd)
new_cols = cols_wo_icd[:insert_at] + icd_cols + cols_wo_icd[insert_at:]

stroke_df = stroke_df[new_cols]

In [10]:
# Fill the arrhythmia indicator columns of stroke_df: a column is set to 1 for
# every row of a subject that has the corresponding ICD code in
# diagnosis_filter_df, and keeps its existing value otherwise.
#
# NOTE(review): codes are matched by subject_id only. The hadm_id and the
# admission_vs_icu classification on diagnosis_filter_df are not used here, so
# a diagnosis recorded in an admission after the ICU stay also sets the flag.
# Confirm this is intended before using these columns as predictors.

icd_cols = [
    "I48","I480","I481","I4811","I4819","I482","I4820","I4821",
    "I483","I484","I489","I4891","I4892","I49","I498","I499"
]

# 0) Make sure the indicator columns exist in stroke_df
missing = [c for c in icd_cols if c not in stroke_df.columns]
if missing:
    stroke_df[missing] = 0

# 1) Clean / filter diagnosis_filter_df
diag = diagnosis_filter_df[["subject_id", "icd_code"]].dropna().copy()
diag["icd_code"] = (
    diag["icd_code"].astype(str)
    .str.strip()                         # same trimming the arrhythmia filter applied when matching codes
    .str.upper()
    .str.replace(".", "", regex=False)   # in case some ICD codes contain a dot (e.g. I48.2)
)

# One row per (subject_id, icd_code) pair, restricted to the indicator codes
diag = diag[diag["icd_code"].isin(icd_cols)].drop_duplicates()

# 2) Build a wide matrix (1 when the code exists for the subject_id)
wide = (
    pd.crosstab(diag["subject_id"], diag["icd_code"])
    .reindex(columns=icd_cols, fill_value=0)   # add all-zero columns for codes never observed
    .clip(upper=1)
    .astype(int)
)

# 3) Merge into stroke_df and update the columns (max of old / new value)
stroke_df = stroke_df.merge(
    wide.reset_index(),
    on="subject_id",
    how="left",
    suffixes=("", "__diag"),
    validate="m:1",   # wide has one row per subject_id, so no stroke_df row is duplicated
)

for c in icd_cols:
    diag_c = f"{c}__diag"
    if diag_c in stroke_df.columns:
        # Subjects without any matching diagnosis have NaN in the __diag column;
        # max(axis=1) skips NaN, and fillna(0) covers the case where both are NaN.
        stroke_df[c] = (
            stroke_df[[c, diag_c]]
            .max(axis=1)
            .fillna(0)
            .astype(int)
        )

# 4) Drop the temporary __diag columns
drop_cols = [f"{c}__diag" for c in icd_cols if f"{c}__diag" in stroke_df.columns]
stroke_df = stroke_df.drop(columns=drop_cols)

# 5) (optional) put the ICD columns back right before hospital_expire_flag
flag_col = "hospital_expire_flag"
cols_wo_icd = [c for c in stroke_df.columns if c not in icd_cols]

if flag_col in cols_wo_icd:
    idx = cols_wo_icd.index(flag_col)
    stroke_df = stroke_df[cols_wo_icd[:idx] + icd_cols + cols_wo_icd[idx:]]

In [11]:
# Rename the bare ICD-code indicator columns to "<code>_<description>" headers.
# Two of the new headers contain a comma, so CSV writers will quote them.
rename_map = {
    "I48": "I48_Atrial_fibrillation_and_flutter",
    "I480": "I480_Paroxysmal_atrial_fibrillation",
    "I481": "I481_Persistent_atrial_fibrillation",
    "I4811": "I4811_Longstanding_persistent_atrial_fibrillation",
    "I4819": "I4819_Other_persistent_atrial_fibrillation",
    "I482": "I482_Chronic_atrial_fibrillation",
    "I4820": "I4820_Chronic_atrial_fibrillation,_unspecified",
    "I4821": "I4821_Permanent_atrial_fibrillation",
    "I483": "I483_Typical_atrial_flutter",
    "I484": "I484_Atypical_atrial_flutter",
    "I489": "I489_Unspecified_atrial_fibrillation_and_atrial_flutter",
    "I4891": "I4891_Unspecified_atrial_fibrillation",
    "I4892": "I4892_Unspecified_atrial_flutter",
    "I49": "I49_Other_cardiac_arrhythmias",
    "I498": "I498_Other_specified_cardiac_arrhythmias",
    "I499": "I499_Cardiac_arrhythmia,_unspecified",
}

# Keys that are not column names of stroke_df are ignored by rename
stroke_df = stroke_df.rename(columns=rename_map)

In [12]:
# Show stroke_df after the arrhythmia indicator columns were added and renamed
display(stroke_df)

Unnamed: 0,subject_id,hadm_id,Time_Zone,stay_id,icu_intime,icu_outtime,hosp_dischtime,dod,sofa,gender,...,I489_Unspecified_atrial_fibrillation_and_atrial_flutter,I4891_Unspecified_atrial_fibrillation,I4892_Unspecified_atrial_flutter,I49_Other_cardiac_arrhythmias,I498_Other_specified_cardiac_arrhythmias,"I499_Cardiac_arrhythmia,_unspecified",hospital_expire_flag,mort_30d,mort_180d,mort_360d
0,10004733,27411876,1,39635619,2174-12-04 11:28:24,2174-12-12 20:03:01,2174-12-27 14:00:00,,3,M,...,0,0,0,0,0,0,Survive,0,0,0
1,10004733,27411876,2,39635619,2174-12-04 11:28:24,2174-12-12 20:03:01,2174-12-27 14:00:00,,3,M,...,0,0,0,0,0,0,Survive,0,0,0
2,10004733,27411876,3,39635619,2174-12-04 11:28:24,2174-12-12 20:03:01,2174-12-27 14:00:00,,3,M,...,0,0,0,0,0,0,Survive,0,0,0
3,10004733,27411876,4,39635619,2174-12-04 11:28:24,2174-12-12 20:03:01,2174-12-27 14:00:00,,3,M,...,0,0,0,0,0,0,Survive,0,0,0
4,10004733,27411876,5,39635619,2174-12-04 11:28:24,2174-12-12 20:03:01,2174-12-27 14:00:00,,3,M,...,0,0,0,0,0,0,Survive,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29067,19999987,23865745,4,36195440,2145-11-02 22:59:00,2145-11-04 21:29:30,2145-11-11 12:57:00,,6,F,...,0,0,0,0,0,0,Survive,0,0,0
29068,19999987,23865745,5,36195440,2145-11-02 22:59:00,2145-11-04 21:29:30,2145-11-11 12:57:00,,6,F,...,0,0,0,0,0,0,Survive,0,0,0
29069,19999987,23865745,6,36195440,2145-11-02 22:59:00,2145-11-04 21:29:30,2145-11-11 12:57:00,,6,F,...,0,0,0,0,0,0,Survive,0,0,0
29070,19999987,23865745,7,36195440,2145-11-02 22:59:00,2145-11-04 21:29:30,2145-11-11 12:57:00,,6,F,...,0,0,0,0,0,0,Survive,0,0,0


In [13]:
# Percentage of missing values per column of stroke_df
missing_pct = stroke_df.isna().mean() * 100

# Render each percentage with exactly 2 decimals and a decimal comma
formatted_pct = [f"{pct:.2f}".replace(".", ",") for pct in missing_pct.round(2)]

# One row per column: its name and its formatted missing percentage
headers_missing_df = pd.DataFrame({
    "feature": stroke_df.columns,
    "missing_pct": formatted_pct,
})

headers_missing_df.to_csv(r'CSV/Exports/Temp/05_stroke_filtered_headers.csv', index=False)

In [14]:
# Write the enriched cohort to the step-11 export folder, creating it if needed
out_dir = "CSV/Exports/Temp/11_keep_24h"
os.makedirs(out_dir, exist_ok=True)

merged_csv_path = os.path.join(out_dir, "merged_df.csv")
stroke_df.to_csv(merged_csv_path, index=False)