In [1]:
import pandas as pd
import os

eicu_data_path = "../eicu_data"  

columns_to_load = {
    "patient.csv": ["patientunitstayid", "age", "gender", "admissionheight", "admissionweight", "hospitaldischargestatus","hospitaldischargeoffset","hospitaladmitoffset"],
    "apacheapsvar.csv": ["patientunitstayid", "albumin", "bilirubin", "bun", "creatinine", "glucose", "hematocrit", "meanbp", "pao2", "pco2", "ph", "respiratoryrate", "sodium", "urine", "wbc"],
    "apachepatientresult.csv": ["patientunitstayid", "actualhospitalmortality", "actualicumortality", "apachescore", "acutephysiologyscore", "predictedhospitalmortality", "predictedicumortality"],
    "apachepredvar.csv": ["patientunitstayid", "admitdiagnosis", "admitsource", "diabetes", "hepaticfailure", "immunosuppression", "leukemia", "metastaticcancer", "motor", "verbal"],
    "vitalperiodic.csv": ["patientunitstayid", "heartrate", "respiration", "sao2", "temperature", "systemicdiastolic", "systemicmean", "systemicsystolic"],
    "lab.csv": ["patientunitstayid", "labname", "labresult"],
    "intakeoutput.csv": ["patientunitstayid", "intaketotal", "outputtotal", "nettotal"],
    "respiratorycare.csv": ["patientunitstayid", "ventstartoffset", "ventendoffset"],
}

print(" Loading datasets...")
dataframes = {file.replace(".csv", ""): pd.read_csv(os.path.join(eicu_data_path, file), usecols=cols) for file, cols in columns_to_load.items()}

patient_df = dataframes["patient"]
apache_apsvar_df = dataframes["apacheapsvar"]
apache_result_df = dataframes["apachepatientresult"]
apache_predvar_df = dataframes["apachepredvar"]
vitalperiodic_df = dataframes["vitalperiodic"]
lab_df = dataframes["lab"]
intakeoutput_df = dataframes["intakeoutput"]
respiratorycare_df = dataframes["respiratorycare"]

print(" Successfully loaded all selected features, optimizing memory usage! ")


 Loading datasets...
 Successfully loaded all selected features, optimizing memory usage! 


In [2]:
print(" Reducing `intakeoutput_df`...")
intakeoutput_df = pd.read_csv(os.path.join(eicu_data_path, "intakeoutput.csv"), usecols=["patientunitstayid", "intaketotal", "outputtotal", "nettotal"])

intakeoutput_reduced = intakeoutput_df.groupby("patientunitstayid").agg({
    "intaketotal": ["mean", "max", "min", "last"],
    "outputtotal": ["mean", "max", "min", "last"],
    "nettotal": ["mean", "max", "min", "last"]
})

intakeoutput_reduced.columns = ["_".join(col).strip() for col in intakeoutput_reduced.columns]
intakeoutput_reduced.reset_index(inplace=True)

dataframes["intakeoutput"] = intakeoutput_reduced

print(f" `intakeoutput_df` reduced from {len(intakeoutput_df):,} to {len(intakeoutput_reduced):,} rows!")


 Reducing `intakeoutput_df`...
 `intakeoutput_df` reduced from 12,030,289 to 180,855 rows!


In [3]:
print(" Checking `lab` dataset size before transformation...")

lab_initial_rows = len(dataframes["lab"])
print(f" `lab` dataset size before transformation: {lab_initial_rows:,} rows")

print(" Transforming `lab` dataset...")

valid_lab_tests = [
    "glucose", "creatinine", "sodium", "potassium", "chloride", "BUN",
    "WBC x 1000", "Hgb", "platelets x 1000", "total bilirubin", "lactate",
    "pH", "paO2", "paCO2"
]  
lab_df = dataframes["lab"][dataframes["lab"]["labname"].isin(valid_lab_tests)]

lab_df = lab_df.groupby(["patientunitstayid", "labname"])["labresult"].agg(["mean", "min", "max"]).reset_index()

lab_df = lab_df.pivot(index="patientunitstayid", columns="labname", values=["mean", "min", "max"]).reset_index()

lab_df.columns = ["_".join(col).strip() for col in lab_df.columns]
lab_df.rename(columns={"patientunitstayid_": "patientunitstayid"}, inplace=True)

dataframes["lab"] = lab_df  

print(" `lab` dataset transformed successfully!")


lab_reduced_rows = len(dataframes["lab"])
print(f" `lab` dataset size after transformation: {lab_reduced_rows:,} rows")

reduction_percentage = ((lab_initial_rows - lab_reduced_rows) / lab_initial_rows) * 100
print(f" `lab` dataset reduced by {reduction_percentage:.2f}%")


 Checking `lab` dataset size before transformation...
 `lab` dataset size before transformation: 39,132,531 rows
 Transforming `lab` dataset...
 `lab` dataset transformed successfully!
 `lab` dataset size after transformation: 193,160 rows
 `lab` dataset reduced by 99.51%


In [4]:
lab_df.head()

Unnamed: 0,patientunitstayid,mean_BUN,mean_Hgb,mean_WBC x 1000,mean_chloride,mean_creatinine,mean_glucose,mean_lactate,mean_pH,mean_paCO2,...,max_creatinine,max_glucose,max_lactate,max_pH,max_paCO2,max_paO2,max_platelets x 1000,max_potassium,max_sodium,max_total bilirubin
0,141168,27.333333,12.566667,14.766667,101.333333,2.4,93.333333,12.2,7.16,39.5,...,2.95,131.0,12.2,7.2,46.0,121.0,213.0,4.3,140.0,5.2
1,141178,11.0,15.5,7.6,108.0,0.7,77.0,,,,...,0.7,77.0,,,,,273.0,3.6,146.0,0.4
2,141179,18.333333,12.5,8.1,107.333333,0.7,80.0,,,,...,0.7,96.0,,,,,219.0,4.2,146.0,
3,141194,27.357143,8.64,7.92,106.071429,2.108571,143.071429,1.5,7.31,26.0,...,2.94,168.0,1.9,7.31,26.0,100.0,298.0,4.6,141.0,0.4
4,141196,20.0,10.6,17.7,98.0,0.84,134.5,0.8,7.43,45.0,...,0.89,144.0,0.8,7.43,45.0,70.0,534.0,4.1,138.0,0.3


In [5]:
print(" Checking `vitalperiodic` dataset size before aggregation...")
vital_initial_rows = len(dataframes["vitalperiodic"])
print(f" `vitalperiodic` dataset size before aggregation: {vital_initial_rows:,} rows")

print(" Aggregating `vitalperiodic` (keeping only mean values)...")

vital_aggregated = dataframes["vitalperiodic"].groupby("patientunitstayid").agg({
    "heartrate": "mean",
    "respiration": "mean",
    "temperature": "mean",
    "sao2": "mean",
    "systemicdiastolic": "mean",
    "systemicmean": "mean",
    "systemicsystolic": "mean"
}).reset_index()

dataframes["vitalperiodic"] = vital_aggregated

print(" `vitalperiodic` successfully aggregated!")

vital_reduced_rows = len(dataframes["vitalperiodic"])
print(f" `vitalperiodic` dataset size after aggregation: {vital_reduced_rows:,} rows")

reduction_percentage = ((vital_initial_rows - vital_reduced_rows) / vital_initial_rows) * 100
print(f" `vitalperiodic` reduced by {reduction_percentage:.2f}%")

 Checking `vitalperiodic` dataset size before aggregation...
 `vitalperiodic` dataset size before aggregation: 146,671,642 rows
 Aggregating `vitalperiodic` (keeping only mean values)...
 `vitalperiodic` successfully aggregated!
 `vitalperiodic` dataset size after aggregation: 192,831 rows
 `vitalperiodic` reduced by 99.87%


In [6]:
dataframes["vitalperiodic"].head()

Unnamed: 0,patientunitstayid,heartrate,respiration,temperature,sao2,systemicdiastolic,systemicmean,systemicsystolic
0,141168,110.557416,,,86.325843,51.0,70.585366,92.707317
1,141178,88.0,,,92.0,,,
2,141179,91.233503,,,98.717391,,,
3,141194,85.401042,20.598958,,98.730689,50.636628,65.104348,92.828488
4,141196,82.305147,24.114815,,95.509434,,,


In [7]:
vitalperiodic_df['temperature'].isnull().sum()

133468353

In [8]:
lab_df = dataframes["lab"]  
intakeoutput_df = dataframes["intakeoutput"] 
vitalperiodic_df = dataframes["vitalperiodic"] 

In [9]:
print(" Merging datasets...")
merged_df = patient_df.copy()
datasets_to_merge = [apache_apsvar_df, apache_result_df, apache_predvar_df, lab_df, vitalperiodic_df, intakeoutput_df, respiratorycare_df]
for df in datasets_to_merge:
    print(" Merging dataset...")
    merged_df = merged_df.merge(df, on="patientunitstayid", how="left")

print(" Merging completed successfully!")
num_rows, num_cols = merged_df.shape
print(f" Merged dataset contains {num_rows:,} rows and {num_cols} columns.")

 Merging datasets...
 Merging dataset...
 Merging dataset...
 Merging dataset...
 Merging dataset...
 Merging dataset...
 Merging dataset...
 Merging dataset...
 Merging completed successfully!
 Merged dataset contains 1,908,519 rows and 100 columns.


In [10]:
merged_df.columns

Index(['patientunitstayid', 'gender', 'age', 'admissionheight',
       'hospitaladmitoffset', 'hospitaldischargeoffset',
       'hospitaldischargestatus', 'admissionweight', 'urine', 'wbc',
       'respiratoryrate', 'sodium', 'meanbp', 'ph', 'hematocrit', 'creatinine',
       'albumin', 'pao2', 'pco2', 'bun', 'glucose', 'bilirubin',
       'acutephysiologyscore', 'apachescore', 'predictedicumortality',
       'actualicumortality', 'predictedhospitalmortality',
       'actualhospitalmortality', 'admitsource', 'verbal', 'motor',
       'admitdiagnosis', 'hepaticfailure', 'metastaticcancer', 'leukemia',
       'immunosuppression', 'diabetes', 'mean_BUN', 'mean_Hgb',
       'mean_WBC x 1000', 'mean_chloride', 'mean_creatinine', 'mean_glucose',
       'mean_lactate', 'mean_pH', 'mean_paCO2', 'mean_paO2',
       'mean_platelets x 1000', 'mean_potassium', 'mean_sodium',
       'mean_total bilirubin', 'min_BUN', 'min_Hgb', 'min_WBC x 1000',
       'min_chloride', 'min_creatinine', 'min_glucose

In [11]:
print(" Remaining missing values:")
print(merged_df.isnull().sum().sum())  

 Remaining missing values:
14398145


In [12]:
missing_values = merged_df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(" Columns still with missing values:")
print(missing_values)

 Columns still with missing values:
gender                        157
age                           120
admissionheight             14588
hospitaldischargestatus     10954
admissionweight             27683
                            ...  
nettotal_max                49993
nettotal_min                49993
nettotal_last               49993
ventstartoffset            266081
ventendoffset              266081
Length: 97, dtype: int64


In [13]:
merged_df["admissionweight"].fillna(merged_df["admissionweight"].median(), inplace=True)
merged_df["admitsource"].fillna("Unknown", inplace=True)

lab_cols = ["wbc", "hematocrit", "creatinine", "albumin", "pao2", "pco2", "bun", "glucose", "bilirubin"]
merged_df[lab_cols] = merged_df[lab_cols].apply(lambda x: x.fillna(x.median()), axis=0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["admissionweight"].fillna(merged_df["admissionweight"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["admitsource"].fillna("Unknown", inplace=True)
  merged_df["admitsource"].fillna("Unknown", inplace=True)


In [14]:
print("Cleaning ICU mortality columns...")

# Define the ICU mortality columns
icu_cols = ["predictedicumortality", "actualicumortality", "predictedhospitalmortality", "actualhospitalmortality"]

for col in icu_cols:
    merged_df[col] = merged_df[col].replace({"EXPIRED": 1, "ALIVE": 0})
    merged_df[col] = pd.to_numeric(merged_df[col], errors="coerce")  # Convert to numeric

# Fill remaining NaN values with the median (should be 0 or 1)
merged_df[icu_cols] = merged_df[icu_cols].apply(lambda x: x.fillna(x.median()), axis=0)

print("ICU mortality columns cleaned and converted to numeric!")

Cleaning ICU mortality columns...


  merged_df[col] = merged_df[col].replace({"EXPIRED": 1, "ALIVE": 0})


ICU mortality columns cleaned and converted to numeric!


In [15]:
clinical_cols = ["hepaticfailure", "metastaticcancer", "leukemia", "immunosuppression", "diabetes"]
merged_df[clinical_cols] = merged_df[clinical_cols].apply(lambda x: x.fillna(x.mode()[0]), axis=0)

fluid_cols = ["intaketotal_max", "intaketotal_min", "intaketotal_last",
              "outputtotal_max", "outputtotal_min", "outputtotal_last",
              "nettotal_max", "nettotal_min", "nettotal_last"]
merged_df[fluid_cols] = merged_df[fluid_cols].apply(lambda x: x.fillna(x.median()), axis=0)


In [16]:
missing_values = merged_df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("Columns still with missing values:")
print(missing_values)

Columns still with missing values:
gender                        157
age                           120
admissionheight             14588
hospitaldischargestatus     10954
urine                       64676
                            ...  
intaketotal_mean            49993
outputtotal_mean            49993
nettotal_mean               49993
ventstartoffset            266081
ventendoffset              266081
Length: 68, dtype: int64


In [17]:
print("Handling missing values...")

# Fill missing temperature with median
merged_df["temperature"].fillna(merged_df["temperature"].median(), inplace=True)

# Fill missing blood pressure with mean per patient
bp_cols = [ "systemicdiastolic", "systemicsystolic", "systemicmean"]
merged_df[bp_cols] = merged_df[bp_cols].apply(lambda x: x.fillna(x.mean()), axis=0)

# Fill missing intake/output values with median
fluid_cols = ["intaketotal_mean", "outputtotal_mean", "nettotal_mean"]
merged_df[fluid_cols] = merged_df[fluid_cols].apply(lambda x: x.fillna(x.median()), axis=0)

# Fill missing lab results with median per test
lab_cols = [col for col in merged_df.columns if "mean_" in col or "min_" in col or "max_" in col]
merged_df[lab_cols] = merged_df[lab_cols].apply(lambda x: x.fillna(x.median()), axis=0)

# Fill missing demographic data
merged_df["admissionheight"].fillna(merged_df["admissionheight"].median(), inplace=True)
merged_df["hospitaldischargestatus"].fillna(merged_df["hospitaldischargestatus"].mode()[0], inplace=True)
merged_df["gender"].fillna(merged_df["gender"].mode()[0], inplace=True)

# Convert age column to numeric, replacing '> 89' with 89
merged_df["age"] = merged_df["age"].replace("> 89", "89")  # Convert '> 89' to '89'
merged_df["age"] = pd.to_numeric(merged_df["age"], errors="coerce")  # Convert column to float

# Fill missing values with median age
merged_df["age"] = pd.to_numeric(merged_df["age"], errors="coerce")
merged_df["age"].fillna(merged_df["age"].median(), inplace=True)

# Fill missing ICU scores with median
score_cols = ["apachescore", "acutephysiologyscore"]
merged_df[score_cols] = merged_df[score_cols].apply(lambda x: x.fillna(x.median()), axis=0)

# Fill missing diagnosis with "Unknown"
merged_df["admitdiagnosis"].fillna("Unknown", inplace=True)

print("Missing values handled successfully!")

Handling missing values...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["temperature"].fillna(merged_df["temperature"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["admissionheight"].fillna(merged_df["admissionheight"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method wi

Missing values handled successfully!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["admitdiagnosis"].fillna("Unknown", inplace=True)


In [18]:
print(" Handling final missing values...")

# Fill missing vital signs with median per patient
vital_cols = ["urine", "respiratoryrate", "sodium", "meanbp", "ph", "heartrate", "respiration", "sao2"]
merged_df[vital_cols] = merged_df[vital_cols].apply(lambda x: x.fillna(x.median()), axis=0)

# Fill missing Glasgow Coma Scale (GCS) values with median
gcs_cols = ["verbal", "motor"]
merged_df[gcs_cols] = merged_df[gcs_cols].apply(lambda x: x.fillna(x.median()), axis=0)

print(" Final missing values handled successfully!")

 Handling final missing values...
 Final missing values handled successfully!


In [19]:
print(" Converting categorical columns to string for Parquet compatibility...")

categorical_cols = ["admitsource", "admitdiagnosis"]

merged_df[categorical_cols] = merged_df[categorical_cols].astype(str)

print(" Categorical columns converted to string successfully!")

 Converting categorical columns to string for Parquet compatibility...
 Categorical columns converted to string successfully!


In [20]:
merged_df = merged_df.drop(columns=["actualicumortality","patientunitstayid",
                                    "predictedicumortality", "predictedhospitalmortality"
                                   ], errors="ignore")

In [21]:
features_to_remove_corr = [
    "max_creatinine", "intaketotal_max", "min_Hgb", "nettotal_mean", "nettotal_last",
     "hospitaldischargestatus", "outputtotal_last", "intaketotal_last",
    "max_pH", "nettotal_max", "max_total bilirubin", "nettotal_min",
     "outputtotal_max"
]


merged_df = merged_df.drop(columns=features_to_remove_corr, errors="ignore")

print(f" Final dataset shape after correlation-based feature selection: {merged_df.shape}")
print(f" Features retained: {list(merged_df.columns)}")

 Final dataset shape after correlation-based feature selection: (1908519, 83)
 Features retained: ['gender', 'age', 'admissionheight', 'hospitaladmitoffset', 'hospitaldischargeoffset', 'admissionweight', 'urine', 'wbc', 'respiratoryrate', 'sodium', 'meanbp', 'ph', 'hematocrit', 'creatinine', 'albumin', 'pao2', 'pco2', 'bun', 'glucose', 'bilirubin', 'acutephysiologyscore', 'apachescore', 'actualhospitalmortality', 'admitsource', 'verbal', 'motor', 'admitdiagnosis', 'hepaticfailure', 'metastaticcancer', 'leukemia', 'immunosuppression', 'diabetes', 'mean_BUN', 'mean_Hgb', 'mean_WBC x 1000', 'mean_chloride', 'mean_creatinine', 'mean_glucose', 'mean_lactate', 'mean_pH', 'mean_paCO2', 'mean_paO2', 'mean_platelets x 1000', 'mean_potassium', 'mean_sodium', 'mean_total bilirubin', 'min_BUN', 'min_WBC x 1000', 'min_chloride', 'min_creatinine', 'min_glucose', 'min_lactate', 'min_pH', 'min_paCO2', 'min_paO2', 'min_platelets x 1000', 'min_potassium', 'min_sodium', 'min_total bilirubin', 'max_BUN', 

In [22]:
features_to_remove = [
    "min_BUN", "max_BUN", "min_WBC x 1000", "max_WBC x 1000",
    "min_creatinine", "max_creatinine", "min_glucose", "max_glucose",
    "min_pH", "max_pH", "min_paO2", "max_paO2", "min_paCO2", "max_paCO2",
    "min_platelets x 1000", "max_platelets x 1000", "min_potassium", "max_potassium",
    "min_sodium", "max_sodium", "min_total bilirubin", "max_total bilirubin", "min_chloride", 
    "max_chloride", "min_lactate", "max_lactate", "max_Hgb",
    "intaketotal_min", "outputtotal_min"
]


merged_df = merged_df.drop(columns=features_to_remove, errors="ignore")

print(" Removed redundant min/max lab test features.")
print(f" Updated dataset shape: {merged_df.shape}")
print(f" Remaining columns: {list(merged_df.columns)}")

 Removed redundant min/max lab test features.
 Updated dataset shape: (1908519, 57)
 Remaining columns: ['gender', 'age', 'admissionheight', 'hospitaladmitoffset', 'hospitaldischargeoffset', 'admissionweight', 'urine', 'wbc', 'respiratoryrate', 'sodium', 'meanbp', 'ph', 'hematocrit', 'creatinine', 'albumin', 'pao2', 'pco2', 'bun', 'glucose', 'bilirubin', 'acutephysiologyscore', 'apachescore', 'actualhospitalmortality', 'admitsource', 'verbal', 'motor', 'admitdiagnosis', 'hepaticfailure', 'metastaticcancer', 'leukemia', 'immunosuppression', 'diabetes', 'mean_BUN', 'mean_Hgb', 'mean_WBC x 1000', 'mean_chloride', 'mean_creatinine', 'mean_glucose', 'mean_lactate', 'mean_pH', 'mean_paCO2', 'mean_paO2', 'mean_platelets x 1000', 'mean_potassium', 'mean_sodium', 'mean_total bilirubin', 'heartrate', 'respiration', 'temperature', 'sao2', 'systemicdiastolic', 'systemicmean', 'systemicsystolic', 'intaketotal_mean', 'outputtotal_mean', 'ventstartoffset', 'ventendoffset']


In [23]:
merged_df["LOS_days"] = (merged_df["hospitaldischargeoffset"] - merged_df["hospitaladmitoffset"]) / (60 * 24)

In [24]:
merged_df.columns

Index(['gender', 'age', 'admissionheight', 'hospitaladmitoffset',
       'hospitaldischargeoffset', 'admissionweight', 'urine', 'wbc',
       'respiratoryrate', 'sodium', 'meanbp', 'ph', 'hematocrit', 'creatinine',
       'albumin', 'pao2', 'pco2', 'bun', 'glucose', 'bilirubin',
       'acutephysiologyscore', 'apachescore', 'actualhospitalmortality',
       'admitsource', 'verbal', 'motor', 'admitdiagnosis', 'hepaticfailure',
       'metastaticcancer', 'leukemia', 'immunosuppression', 'diabetes',
       'mean_BUN', 'mean_Hgb', 'mean_WBC x 1000', 'mean_chloride',
       'mean_creatinine', 'mean_glucose', 'mean_lactate', 'mean_pH',
       'mean_paCO2', 'mean_paO2', 'mean_platelets x 1000', 'mean_potassium',
       'mean_sodium', 'mean_total bilirubin', 'heartrate', 'respiration',
       'temperature', 'sao2', 'systemicdiastolic', 'systemicmean',
       'systemicsystolic', 'intaketotal_mean', 'outputtotal_mean',
       'ventstartoffset', 'ventendoffset', 'LOS_days'],
      dtype='object')

In [25]:
merged_df_size = merged_df.memory_usage(deep=True).sum() / (1024 ** 3)
print(f" Estimated size of merged_df: {merged_df_size:.2f} GB")

 Estimated size of merged_df: 1.12 GB


In [26]:
merged_df_size = merged_df.memory_usage(deep=True).sum() / (1024 ** 3)
print(f" Estimated size of merged_df: {merged_df_size:.2f} GB")

output_path = "merged_eicu_data.parquet"

merged_df.to_parquet(output_path, index=False)
print(f" Merged dataset saved as Parquet: {output_path}")

 Estimated size of merged_df: 1.12 GB
 Merged dataset saved as Parquet: merged_eicu_data.parquet
