In [1]:
import gc
import gzip
import pandas as pd
import os

# Input CSV files, one per stroke frame, located in the 06_renamed_columns export folder.
# NOTE(review): the variable names say "compressed", but every path ends in plain ".csv".
(
    compressed_file_path_01,
    compressed_file_path_02,
    compressed_file_path_03,
    compressed_file_path_04,
) = (
    r"CSV/Exports/Temp/06_renamed_columns/stroke01_df_small.csv",
    r"CSV/Exports/Temp/06_renamed_columns/stroke02_df_small.csv",
    r"CSV/Exports/Temp/06_renamed_columns/stroke03_df_small.csv",
    r"CSV/Exports/Temp/06_renamed_columns/stroke04_df_small.csv",
)

In [2]:
# Load each of the four input CSVs into its own DataFrame.
stroke01_df_small = pd.read_csv(filepath_or_buffer=compressed_file_path_01)
stroke02_df_small = pd.read_csv(filepath_or_buffer=compressed_file_path_02)
stroke03_df_small = pd.read_csv(filepath_or_buffer=compressed_file_path_03)
stroke04_df_small = pd.read_csv(filepath_or_buffer=compressed_file_path_04)

# Add GCS

In [3]:
def aggregate_gcs_sum(df: pd.DataFrame, new_col: str, prefix: str = "GCS") -> pd.DataFrame:
    """Collapse the three GCS component columns into one row-wise total.

    Columns whose string name, after stripping surrounding whitespace, starts
    with ``prefix`` are treated as the GCS components. They are coerced to
    numeric, summed per row into ``new_col`` and removed from the result.

    A total is produced only for rows where all three components are present.
    If any component is missing, or cannot be parsed as a number, the total is
    NaN instead of a partial sum (an all-missing row is NaN, not 0).

    Args:
        df: Input frame; it is not modified.
        new_col: Name of the column that receives the total. If it does not
            exist yet it is appended as the last column.
        prefix: Name prefix that identifies the component columns.

    Returns:
        A new DataFrame without the component columns and with ``new_col``.

    Raises:
        ValueError: If no column matches ``prefix`` or if the number of
            matching columns is not exactly three.
    """
    gcs_cols = [c for c in df.columns if isinstance(c, str) and c.strip().startswith(prefix)]

    if not gcs_cols:
        raise ValueError(f"Δεν βρέθηκαν στήλες που να ξεκινούν με '{prefix}'.")
    if len(gcs_cols) != 3:
        raise ValueError(f"Περίμενα 3 GCS στήλες, αλλά βρήκα {len(gcs_cols)}: {gcs_cols}")

    # Values may arrive as strings; anything unparseable becomes NaN.
    components = df[gcs_cols].apply(pd.to_numeric, errors="coerce")

    # skipna=False: a single missing component makes the whole total NaN,
    # so incomplete rows never masquerade as (too low) valid scores.
    total = components.sum(axis=1, skipna=False)

    # Drop the components first so the total survives even when new_col
    # happens to share its name with one of them.
    result = df.drop(columns=gcs_cols)
    result[new_col] = total
    return result


# Replace the GCS component columns of every frame with a single total column;
# each frame gets its own label for the new column.
stroke01_df_small = aggregate_gcs_sum(df=stroke01_df_small, new_col="GCS - Mean")
stroke02_df_small = aggregate_gcs_sum(df=stroke02_df_small, new_col="GCS - Median")
stroke03_df_small = aggregate_gcs_sum(df=stroke03_df_small, new_col="GCS - Min")
stroke04_df_small = aggregate_gcs_sum(df=stroke04_df_small, new_col="GCS - Max")

print(
    "Έτοιμο: δημιουργήθηκαν οι νέες GCS στήλες και αφαιρέθηκαν οι παλιές GCS columns."
)


Έτοιμο: δημιουργήθηκαν οι νέες GCS στήλες και αφαιρέθηκαν οι παλιές GCS columns.


In [4]:
# Move GCS
def move_col_after(df, col_to_move, after_col):
    """Return a new DataFrame with ``col_to_move`` placed right after ``after_col``.

    The input frame is not modified. Asking to move a column after itself is
    treated as a no-op and returns an unchanged copy.

    Args:
        df: Source DataFrame.
        col_to_move: Label of the column to relocate.
        after_col: Label of the column that should immediately precede it.

    Returns:
        A new DataFrame with the same columns in the requested order.

    Raises:
        KeyError: If either label is not a column of ``df``.
    """
    if after_col not in df.columns:
        raise KeyError(f"Η στήλη '{after_col}' δεν υπάρχει στο dataframe.")
    if col_to_move not in df.columns:
        raise KeyError(f"Η στήλη '{col_to_move}' δεν υπάρχει στο dataframe.")

    # Without this guard the column would be removed from the list and the
    # lookup of after_col below would fail with an unrelated ValueError.
    if col_to_move == after_col:
        return df.copy()

    cols = df.columns.tolist()
    cols.remove(col_to_move)
    cols.insert(cols.index(after_col) + 1, col_to_move)

    # Selecting with a list of labels already yields a new frame, so no
    # up-front copy of the whole input is needed.
    return df[cols]

# Put each frame's GCS total column directly after its "race" column.
stroke01_df_small = move_col_after(df=stroke01_df_small, col_to_move="GCS - Mean", after_col="race")
stroke02_df_small = move_col_after(df=stroke02_df_small, col_to_move="GCS - Median", after_col="race")
stroke03_df_small = move_col_after(df=stroke03_df_small, col_to_move="GCS - Min", after_col="race")
stroke04_df_small = move_col_after(df=stroke04_df_small, col_to_move="GCS - Max", after_col="race")

print(
    "Το GCS μπήκε μετά τη στήλη race σε όλα τα dataframes."
)


Το GCS μπήκε μετά τη στήλη race σε όλα τα dataframes.


In [5]:
# Write the four frames (GCS total added and repositioned) to the 07_GCS folder.
out_dir = "CSV/Exports/Temp/07_GCS"
os.makedirs(out_dir, exist_ok=True)

for file_name, frame in (
    ("GCS_stroke01_df_small.csv", stroke01_df_small),
    ("GCS_stroke02_df_small.csv", stroke02_df_small),
    ("GCS_stroke03_df_small.csv", stroke03_df_small),
    ("GCS_stroke04_df_small.csv", stroke04_df_small),
):
    # index=False: the row index is not written as a column.
    frame.to_csv(os.path.join(out_dir, file_name), index=False)

# Merge Dataframes

In [6]:
# Columns used as the merge key (Time_Zone included).
keys = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "Time_Zone",
]

# Columns shared by the frames that should appear only ONCE in the merged result.
common_cols = [
    "row_count",
    *keys,
    "icu_intime",
    "icu_outtime",
    "hosp_dischtime",
    "dod",
    "gender",
    "age",
    "language",
    "marital_status",
    "race",
    "los",
    "hospital_expire_flag",
]

def assert_unique_keys(df: pd.DataFrame, name: str, key_cols=None):
    """Raise if any combination of the key columns occurs more than once in ``df``.

    Args:
        df: Frame to check.
        name: Label for the frame, used only in the error message.
        key_cols: Columns that must jointly identify a row. When omitted, the
            module-level ``keys`` list is used. Pass it explicitly to stay
            independent of later cells that rebind ``keys``.

    Raises:
        ValueError: If duplicated key combinations exist; the message shows up
            to eight distinct offending combinations.
    """
    key_cols = list(keys if key_cols is None else key_cols)

    # keep=False flags every member of a duplicated group, not only the repeats.
    dup = df.duplicated(subset=key_cols, keep=False)
    if dup.any():
        examples = df.loc[dup, key_cols].drop_duplicates().head(8)
        raise ValueError(
            f"[{name}] Μη-unique γραμμές για keys={key_cols}. "
            f"Δείγμα:\n{examples}"
        )

def right_side(df: pd.DataFrame, key_cols=None, shared_cols=None) -> pd.DataFrame:
    """Return the key columns plus the columns of ``df`` that are not shared.

    The result is intended as the right-hand side of a merge: the shared
    columns are dropped so they are not duplicated in the merged frame.

    Args:
        df: Source frame.
        key_cols: Merge key columns, always kept and placed first. Defaults to
            the module-level ``keys``.
        shared_cols: Columns to leave out. Defaults to the module-level
            ``common_cols``.

    Returns:
        An independent copy holding ``key_cols`` followed by the remaining
        non-shared columns in their original order.
    """
    key_cols = list(keys if key_cols is None else key_cols)
    shared = common_cols if shared_cols is None else shared_cols

    # Keys are excluded here too, so a key that is absent from the shared
    # list is never selected twice.
    excluded = set(shared) | set(key_cols)
    feature_cols = [c for c in df.columns if c not in excluded]
    return df[key_cols + feature_cols].copy()

# 1) Every frame must have at most one row per
#    (subject_id, hadm_id, stay_id, Time_Zone) combination.
assert_unique_keys(df=stroke01_df_small, name="stroke01_df_small")
assert_unique_keys(df=stroke02_df_small, name="stroke02_df_small")
assert_unique_keys(df=stroke03_df_small, name="stroke03_df_small")
assert_unique_keys(df=stroke04_df_small, name="stroke04_df_small")

# 2) Merge: the common columns are taken from stroke01_df_small only; the other
#    frames contribute their keys plus non-common columns. The inner join keeps
#    only key combinations present in every frame, and validate="one_to_one"
#    makes pandas raise if a key is duplicated on either side.
merged_df = (
    stroke01_df_small.copy()
    .merge(right_side(stroke02_df_small), on=keys, how="inner", validate="one_to_one")
    .merge(right_side(stroke03_df_small), on=keys, how="inner", validate="one_to_one")
    .merge(right_side(stroke04_df_small), on=keys, how="inner", validate="one_to_one")
)

print("Merge ολοκληρώθηκε με keys:", keys)

Merge ολοκληρώθηκε με keys: ['subject_id', 'hadm_id', 'stay_id', 'Time_Zone']


In [7]:
# Make hospital_expire_flag the last column of merged_df.
col = "hospital_expire_flag"

if col in merged_df.columns:
    # All other columns keep their relative order; the flag is appended last.
    cols = [name for name in merged_df.columns if name != col]
    cols.append(col)
    merged_df = merged_df[cols]
else:
    raise KeyError(f"Η στήλη '{col}' δεν υπάρχει στο merged_df.")

print(
    "Το hospital_expire_flag μεταφέρθηκε στο τέλος στο merged_df."
)

Το hospital_expire_flag μεταφέρθηκε στο τέλος στο merged_df.


# Merge sofa

In [8]:
# Load the SOFA score table.
sofa_path = r'CSV/Exports/Temp/01_stroke_sofa_first_day.csv'
sofa_df = pd.read_csv(filepath_or_buffer=sofa_path)


In [9]:
# Join keys for the SOFA lookup (no Time_Zone here). A dedicated name is used
# so the module-level `keys` list, which assert_unique_keys / right_side read,
# is not silently replaced by this shorter list.
sofa_keys = ["subject_id", "hadm_id", "stay_id"]

# 1) Keep only the keys + sofa and resolve duplicate entries:
#    when a stay has several records, keep the maximum sofa value.
sofa_lookup = (
    sofa_df[sofa_keys + ["sofa"]]
    .groupby(sofa_keys, as_index=False)["sofa"].max()
)

# 2) Left merge, so no rows of merged_df are lost. A "sofa" column left over
#    from a previous run of this cell is dropped first; otherwise the merge
#    would produce sofa_x / sofa_y and the reordering below would fail.
merged_df = (
    merged_df
    .drop(columns="sofa", errors="ignore")
    .merge(sofa_lookup, on=sofa_keys, how="left", validate="m:1")
)

# 3) Place sofa right after dod, reusing the notebook's existing helper.
merged_df = move_col_after(merged_df, "sofa", "dod")

# Cell output: number of rows that did not receive a sofa value.
merged_df["sofa"].isna().sum()

np.int64(0)

In [10]:
# Persist the merged frame (now including sofa) as a CSV checkpoint.
out_dir = "CSV/Exports/Temp/08_merged_daframes"
os.makedirs(name=out_dir, exist_ok=True)

merged_df.to_csv(
    path_or_buf=os.path.join(out_dir, "merge_stroke01_df_small.csv"),
    index=False,
)

# mort_30d, mort_180d, mort_360d

In [11]:
# 1) Make sure both timestamp columns are datetime;
#    values that cannot be parsed become NaT (errors="coerce").
for ts_col in ("icu_intime", "dod"):
    merged_df[ts_col] = pd.to_datetime(merged_df[ts_col], errors="coerce")

def mort_within_days(df, days: int, intime_col="icu_intime", dod_col="dod"):
    """Flag rows whose date of death falls within ``days`` of the intime.

    Args:
        df: Frame holding the two datetime columns.
        days: Length of the window, in days, counted from ``intime_col``.
        intime_col: Name of the window-start column.
        dod_col: Name of the date-of-death column.

    Returns:
        Integer Series: 1 where ``dod_col`` is present and not later than
        ``intime_col`` plus ``days``, otherwise 0 (including rows where either
        value is missing).
    """
    death_date = df[dod_col]
    window_end = df[intime_col] + pd.Timedelta(days=days)

    # A comparison against NaT is False, so missing values end up as 0.
    died_in_window = death_date.notna() & death_date.le(window_end)
    return died_in_window.astype(int)

# Add one 0/1 mortality label per horizon: mort_30d, mort_180d, mort_360d.
for horizon in (30, 180, 360):
    merged_df[f"mort_{horizon}d"] = mort_within_days(merged_df, horizon)

print(
    "Δημιουργήθηκαν οι mort_30d, mort_180d, mort_360d."
)

Δημιουργήθηκαν οι mort_30d, mort_180d, mort_360d.


In [12]:
# Persist the frame with the mortality labels as a CSV checkpoint.
out_dir = "CSV/Exports/Temp/09_mort_label"
os.makedirs(name=out_dir, exist_ok=True)

merged_df.to_csv(
    path_or_buf=os.path.join(out_dir, "merged_df_with_mort_label.csv"),
    index=False,
)

# Keep the first 24 hours

In [13]:
# Keep only rows with Time_Zone in the inclusive range 1..8.
# NOTE(review): the section heading calls this "the first 24 hours"; the width
# of one Time_Zone bin is not visible here - confirm that 8 bins equal 24 h.
merged_df = merged_df.loc[merged_df["Time_Zone"].between(left=1, right=8)].copy()

In [14]:
# Persist the Time_Zone-filtered frame as a CSV checkpoint.
out_dir = "CSV/Exports/Temp/10_keep_24h"
os.makedirs(name=out_dir, exist_ok=True)

merged_df.to_csv(
    path_or_buf=os.path.join(out_dir, "merged_df_of_first_24h.csv"),
    index=False,
)

# Test

In [17]:
# Inspect only the rows of the last retained Time_Zone bin (value 8).
merged_df_24 = merged_df.loc[merged_df["Time_Zone"].eq(8)].copy()
display(merged_df_24)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,stay_id,icu_intime,icu_outtime,hosp_dischtime,dod,sofa,...,Differential-Monos - Max,Differential-Neuts - Max,Polys - Max,Metamyelocytes - Max,Myelocytes - Max,Nucleated Red Cells - Max,hospital_expire_flag,mort_30d,mort_180d,mort_360d
7,8,10004733,27411876,8,39635619,2174-12-04 11:28:24,2174-12-12 20:03:01,2174-12-27 14:00:00,NaT,3,...,5.3,81.8,,,,,Survive,0,0,0
23,24,10006277,25610553,8,30888848,2176-06-07 23:56:15,2176-06-08 20:47:56,2176-06-13 11:30:00,NaT,1,...,7.4,78.5,,,,,Survive,0,0,0
39,40,10008100,29402054,8,35997892,2181-12-10 19:41:59,2181-12-11 15:07:56,2181-12-12 13:40:00,NaT,1,...,,,,,,,Survive,0,0,0
55,56,10017492,27417763,8,39543480,2116-06-26 20:35:09,2116-06-27 20:26:18,2116-07-05 08:05:00,2116-07-05,5,...,,,,,,,Death,1,1,1
71,72,10025463,24470193,8,38275267,2137-10-09 02:51:25,2137-10-09 17:32:37,2137-10-09 15:30:00,2137-10-09,4,...,,,,,,,Death,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58071,58072,19994233,29338696,8,38059821,2184-02-10 22:47:00,2184-02-11 16:56:21,2184-02-16 17:35:00,2184-05-12,2,...,,,,,,,Survive,0,1,1
58087,58088,19997293,26366652,8,36403582,2123-11-17 05:47:01,2123-11-19 17:08:30,2123-11-19 16:00:00,2124-02-20,5,...,8.5,54.6,,,,,Survive,0,1,1
58103,58104,19997760,21257506,8,36265722,2190-12-03 22:07:00,2190-12-06 18:12:24,2190-12-06 18:12:00,2191-10-08,4,...,,,,,,,Survive,0,0,1
58119,58120,19999442,26785317,8,32336619,2148-11-19 14:23:43,2148-11-26 13:12:15,2148-12-04 16:25:00,NaT,3,...,,,,,,,Survive,0,0,0


In [16]:
# Share of missing values per column of merged_df, as a percentage.
missing_pct = merged_df.isna().mean() * 100

headers_missing_df = pd.DataFrame(
    {"feature": merged_df.columns, "missing_pct": missing_pct.to_numpy()}
)

# Render as text with two decimals and a comma as the decimal separator.
headers_missing_df["missing_pct"] = (
    headers_missing_df["missing_pct"]
    .round(2)
    .map("{:.2f}".format)
    .str.replace(".", ",", regex=False)
)

# One row per column of merged_df: its name and formatted missing percentage.
headers_missing_df.to_csv(
    r'CSV/Exports/Temp/04_merge_stroke_headers.csv',
    index=False,
)