In [1]:
import csv
import pandas as pd
import numpy as np
import logging
import os

In [2]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("data_loading.log"),
        logging.StreamHandler()
    ]
)

In [3]:
# Read MIMICs CSV files
subfolder = 'o3_hour_overlap_window'

# Log the start of the process
logging.info(f"Starting the data loading process from {subfolder}...")

logging.info(f"Loading MIMIC-IV datasets")
try:
    mimic_mean_df = pd.read_csv(f"../01_MimicIV/CSV/Exports/datasets/whole_set/{subfolder}/o01_final_mean_with_los.csv", low_memory=False)
    logging.info("Loaded mimic_mean_df successfully.")
except Exception as e:
    logging.error(f"Error loading mimic_mean_df: {e}")

try:
    mimic_median_df = pd.read_csv(f"../01_MimicIV/CSV/Exports/datasets/whole_set/{subfolder}/o02_final_median_with_los.csv", low_memory=False)
    logging.info("Loaded mimic_median_df successfully.")
except Exception as e:
    logging.error(f"Error loading mimic_median_df: {e}")

try:
    mimic_min_df = pd.read_csv(f"../01_MimicIV/CSV/Exports/datasets/whole_set/{subfolder}/o03_final_min_with_los.csv", low_memory=False)
    logging.info("Loaded mimic_min_df successfully.")
except Exception as e:
    logging.error(f"Error loading mimic_min_df: {e}")

try:
    mimic_max_df = pd.read_csv(f"../01_MimicIV/CSV/Exports/datasets/whole_set/{subfolder}/o04_final_max_with_los.csv", low_memory=False)
    logging.info("Loaded mimic_max_df successfully.")
except Exception as e:
    logging.error(f"Error loading mimic_max_df: {e}")

# Read eICUs CSV files
logging.info("Loading eICU datasets")
try:
    eicu_mean_df = pd.read_csv(f"../02_eICU/CSV/Exports/datasets/whole_set/{subfolder}/o01_final_mean_table.csv", low_memory=False)
    logging.info("Loaded eicu_mean_df successfully.")
except Exception as e:
    logging.error(f"Error loading eicu_mean_df: {e}")

try:
    eicu_median_df = pd.read_csv(f"../02_eICU/CSV/Exports/datasets/whole_set/{subfolder}/o02_final_median_table.csv", low_memory=False)
    logging.info("Loaded eicu_median_df successfully.")
except Exception as e:
    logging.error(f"Error loading eicu_median_df: {e}")

try:
    eicu_min_df = pd.read_csv(f"../02_eICU/CSV/Exports/datasets/whole_set/{subfolder}/o03_final_min_table.csv", low_memory=False)
    logging.info("Loaded eicu_min_df successfully.")
except Exception as e:
    logging.error(f"Error loading eicu_min_df: {e}")

try:
    eicu_max_df = pd.read_csv(f"../02_eICU/CSV/Exports/datasets/whole_set/{subfolder}/o04_final_max_table.csv", low_memory=False)
    logging.info("Loaded eicu_max_df successfully.")
except Exception as e:
    logging.error(f"Error loading eicu_max_df: {e}")

logging.info("Data loading process completed.")

2025-03-22 22:58:54,523 - INFO - Starting the data loading process from o3_hour_overlap_window...
2025-03-22 22:58:54,527 - INFO - Loading MIMIC-IV datasets
2025-03-22 22:59:01,336 - INFO - Loaded mimic_mean_df successfully.
2025-03-22 22:59:08,644 - INFO - Loaded mimic_median_df successfully.
2025-03-22 22:59:15,686 - INFO - Loaded mimic_min_df successfully.
2025-03-22 22:59:22,705 - INFO - Loaded mimic_max_df successfully.
2025-03-22 22:59:22,706 - INFO - Loading eICU datasets
2025-03-22 22:59:25,073 - INFO - Loaded eicu_mean_df successfully.
2025-03-22 22:59:27,170 - INFO - Loaded eicu_median_df successfully.
2025-03-22 22:59:29,176 - INFO - Loaded eicu_min_df successfully.
2025-03-22 22:59:31,292 - INFO - Loaded eicu_max_df successfully.
2025-03-22 22:59:31,294 - INFO - Data loading process completed.


In [4]:
display (mimic_mean_df)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,language,marital_status,race,Base Excess - Mean,...,Other - Mean.2,"Triglycerides, Pleural - Mean",Thoracic Fluid Content (TFC) (NICOM) - Mean,Head of Bed Measurement (Degree) - Mean,ARCH-1 - Mean,Factor VII - Mean,"Creatinine, Body Fluid - Mean",Ethanol - Mean,hospital_expire_flag,los
0,1,10004733,27411876,1,M,51,English,SINGLE,UNKNOWN,0.0,...,,,,,,,,,Survive,8.357373
1,2,10004733,27411876,2,M,51,English,SINGLE,UNKNOWN,0.0,...,,,,,,,,,Survive,8.357373
2,3,10004733,27411876,3,M,51,English,SINGLE,UNKNOWN,0.0,...,,,,,,,,,Survive,8.357373
3,4,10004733,27411876,4,M,51,English,SINGLE,UNKNOWN,0.0,...,,,,,,,,,Survive,8.357373
4,5,10004733,27411876,5,M,51,English,SINGLE,UNKNOWN,0.0,...,,,,,,,,,Survive,8.357373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58139,58140,19999987,23865745,12,F,57,English,,UNKNOWN,1.0,...,,,,,,,,,Survive,1.937847
58140,58141,19999987,23865745,13,F,57,English,,UNKNOWN,1.0,...,,,,,,,,,Survive,1.937847
58141,58142,19999987,23865745,14,F,57,English,,UNKNOWN,1.0,...,,,,,,,,,Survive,1.937847
58142,58143,19999987,23865745,15,F,57,English,,UNKNOWN,1.0,...,,,,,,,,,Survive,1.937847


In [5]:
# columns to keep
mimic_columns_to_keep = pd.read_csv('CSV/import/mimic_features.csv')
eicu_columns_to_keep = pd.read_csv('CSV/import/eicu_features.csv')

# Mimic IV Ver 3.1

In [6]:
logging.info("Starting the chunk-based merging process for MIMIC-IV dataframes.")

try:
    chunk_size = 2000
    merged_chunks = []  # List to store merged chunks

    # Iteratively process chunks from mimic_mean_df
    for i in range(0, len(mimic_mean_df), chunk_size):
        logging.info(f"Processing chunk {i // chunk_size + 1}...")

        # Slice the chunk from mimic_mean_df
        chunk = mimic_mean_df.iloc[i:i + chunk_size]

        # Merge with mimic_median_df
        logging.info("Merging with mimic_median_df...")
        chunk = chunk.merge(
            mimic_median_df,
            on=['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'gender', 'age', 'language', 'marital_status', 'race', 'hospital_expire_flag', 'los'],
            suffixes=('_mean', '_median')
        )
        logging.info("Merged chunk with mimic_median_df successfully.")

        # Merge with mimic_min_df
        logging.info("Merging with mimic_min_df...")
        chunk = chunk.merge(
            mimic_min_df,
            on=['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'gender', 'age', 'language', 'marital_status', 'race', 'hospital_expire_flag', 'los'],
            suffixes=('', '_min')
        )
        logging.info("Merged chunk with mimic_min_df successfully.")

        # Merge with mimic_max_df
        logging.info("Merging with mimic_max_df...")
        chunk = chunk.merge(
            mimic_max_df,
            on=['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'gender', 'age', 'language', 'marital_status', 'race', 'hospital_expire_flag', 'los'],
            suffixes=('', '_max')
        )
        logging.info("Merged chunk with mimic_max_df successfully.")

        # Append merged chunk to the list
        merged_chunks.append(chunk)

    # Concatenate all chunks into a single dataframe
    logging.info("Concatenating all merged chunks...")
    merged_mimic_df = pd.concat(merged_chunks, ignore_index=True)
    logging.info("Concatenated all chunks successfully.")

    # Replace suffixes
    logging.info("Replacing suffixes in column names...")
    merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Mean', ' (Mean)', regex=True)
    merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Median', ' (Median)', regex=True)
    merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Min', ' (Min)', regex=True)
    merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Max', ' (Max)', regex=True)
    logging.info("Replaced suffixes successfully.")

    # Move the 'hospital_expire_flag' and 'LOS' columns to the end of the dataframe
    logging.info("Reordering columns: moving 'hospital_expire_flag' and 'los' to the end...")
    hospital_expire_flag_column = merged_mimic_df.pop('hospital_expire_flag')
    los_column = merged_mimic_df.pop('los')
    merged_mimic_df = pd.concat([merged_mimic_df, hospital_expire_flag_column, los_column], axis=1)
    logging.info("Reordered columns successfully.")

    # Rename the last two columns to preserve their original names
    logging.info("Renaming the last two columns to preserve original names...")
    merged_mimic_df.columns = list(merged_mimic_df.columns[:-2]) + ['hospital_expire_flag', 'los']
    logging.info("Renamed the last two columns successfully.")

except Exception as e:
    logging.error(f"An error occurred during the merging process: {e}")

logging.info("Chunk-based merging process for MIMIC-IV dataframes completed successfully.")

2025-03-22 23:01:09,329 - INFO - Starting the chunk-based merging process for MIMIC-IV dataframes.
2025-03-22 23:01:09,330 - INFO - Processing chunk 1...
2025-03-22 23:01:09,332 - INFO - Merging with mimic_median_df...
2025-03-22 23:01:09,593 - INFO - Merged chunk with mimic_median_df successfully.
2025-03-22 23:01:09,594 - INFO - Merging with mimic_min_df...
2025-03-22 23:01:09,846 - INFO - Merged chunk with mimic_min_df successfully.
2025-03-22 23:01:09,847 - INFO - Merging with mimic_max_df...
2025-03-22 23:01:10,129 - INFO - Merged chunk with mimic_max_df successfully.
2025-03-22 23:01:10,130 - INFO - Processing chunk 2...
2025-03-22 23:01:10,131 - INFO - Merging with mimic_median_df...
2025-03-22 23:01:10,382 - INFO - Merged chunk with mimic_median_df successfully.
2025-03-22 23:01:10,383 - INFO - Merging with mimic_min_df...
2025-03-22 23:01:10,684 - INFO - Merged chunk with mimic_min_df successfully.
2025-03-22 23:01:10,685 - INFO - Merging with mimic_max_df...
2025-03-22 23:01:

In [7]:
logging.info("Starting chunk-wise processing of merged_mimic_df.")
# Define GCS columns for processing
gcs_mean_cols = ['GCS - Eye Opening (Mean)', 'GCS - Verbal Response (Mean)', 'GCS - Motor Response (Mean)']
gcs_median_cols = ['GCS - Eye Opening (Median)', 'GCS - Verbal Response (Median)', 'GCS - Motor Response (Median)']
gcs_min_cols = ['GCS - Eye Opening (Min)', 'GCS - Verbal Response (Min)', 'GCS - Motor Response (Min)']
gcs_max_cols = ['GCS - Eye Opening (Max)', 'GCS - Verbal Response (Max)', 'GCS - Motor Response (Max)']
gcs_columns = gcs_mean_cols + gcs_median_cols + gcs_min_cols + gcs_max_cols

# Split dataframe into chunks
num_chunks = 10  
chunk_list = np.array_split(merged_mimic_df, num_chunks)

# Placeholder for processed chunks
processed_chunks = []

for i, chunk in enumerate(chunk_list):
    try:
        logging.info(f"Processing chunk {i + 1}/{num_chunks}...")
        
        # Convert GCS columns to numeric, coercing errors to NaN
        for col in gcs_columns:
            chunk[col] = pd.to_numeric(chunk[col], errors='coerce')
        
        # Aggregate GCS values
        chunk['GCS (Mean)'] = chunk[gcs_mean_cols].sum(axis=1, skipna=True)
        chunk['GCS (Median)'] = chunk[gcs_median_cols].sum(axis=1, skipna=True)
        chunk['GCS (Min)'] = chunk[gcs_min_cols].sum(axis=1, skipna=True)
        chunk['GCS (Max)'] = chunk[gcs_max_cols].sum(axis=1, skipna=True)

        # Replace rows where all original GCS columns were NaN with NaN in the new GCS columns
        mask_mean = chunk[gcs_mean_cols].isna().all(axis=1)
        
        mask_median = chunk[gcs_median_cols].isna().all(axis=1)
        mask_min = chunk[gcs_min_cols].isna().all(axis=1)
        mask_max = chunk[gcs_max_cols].isna().all(axis=1)

        chunk.loc[mask_mean, 'GCS (Mean)'] = np.nan
        chunk.loc[mask_median, 'GCS (Median)'] = np.nan
        chunk.loc[mask_min, 'GCS (Min)'] = np.nan
        chunk.loc[mask_max, 'GCS (Max)'] = np.nan

        # Drop the original GCS component columns
        chunk.drop(columns=gcs_columns, inplace=True)

        # Append processed chunk to the list
        processed_chunks.append(chunk)
        logging.info(f"Chunk {i + 1}/{num_chunks} processed successfully.")

    except Exception as e:
        logging.error(f"Error processing chunk {i + 1}: {e}")

# Concatenate all processed chunks into a single DataFrame
merged_mimic_df = pd.concat(processed_chunks, ignore_index=True)

logging.info("All chunks processed and concatenated successfully.")

2025-03-22 23:02:06,544 - INFO - Starting chunk-wise processing of merged_mimic_df.
  return bound(*args, **kwds)
2025-03-22 23:02:08,987 - INFO - Processing chunk 1/10...
2025-03-22 23:02:09,110 - INFO - Chunk 1/10 processed successfully.
2025-03-22 23:02:09,111 - INFO - Processing chunk 2/10...
2025-03-22 23:02:09,228 - INFO - Chunk 2/10 processed successfully.
2025-03-22 23:02:09,229 - INFO - Processing chunk 3/10...
2025-03-22 23:02:09,340 - INFO - Chunk 3/10 processed successfully.
2025-03-22 23:02:09,341 - INFO - Processing chunk 4/10...
2025-03-22 23:02:09,453 - INFO - Chunk 4/10 processed successfully.
2025-03-22 23:02:09,454 - INFO - Processing chunk 5/10...
2025-03-22 23:02:09,559 - INFO - Chunk 5/10 processed successfully.
2025-03-22 23:02:09,560 - INFO - Processing chunk 6/10...
2025-03-22 23:02:09,668 - INFO - Chunk 6/10 processed successfully.
2025-03-22 23:02:09,669 - INFO - Processing chunk 7/10...
2025-03-22 23:02:09,779 - INFO - Chunk 7/10 processed successfully.
2025

In [8]:
logging.info("Starting chunk-wise processing of Braden components.")

# Define Braden columns for processing
braden_mean_cols = [
    'Braden Sensory Perception (Mean)', 'Braden Moisture (Mean)', 'Braden Activity (Mean)', 
    'Braden Mobility (Mean)', 'Braden Nutrition (Mean)', 'Braden Friction/Shear (Mean)'
]
braden_median_cols = [
    'Braden Sensory Perception (Median)', 'Braden Moisture (Median)', 'Braden Activity (Median)', 
    'Braden Mobility (Median)', 'Braden Nutrition (Median)', 'Braden Friction/Shear (Median)'
]
braden_min_cols = [
    'Braden Sensory Perception (Min)', 'Braden Moisture (Min)', 'Braden Activity (Min)', 
    'Braden Mobility (Min)', 'Braden Nutrition (Min)', 'Braden Friction/Shear (Min)'
]
braden_max_cols = [
    'Braden Sensory Perception (Max)', 'Braden Moisture (Max)', 'Braden Activity (Max)', 
    'Braden Mobility (Max)', 'Braden Nutrition (Max)', 'Braden Friction/Shear (Max)'
]
braden_columns = braden_mean_cols + braden_median_cols + braden_min_cols + braden_max_cols

# Split dataframe into chunks
num_chunks = 10
chunk_list = np.array_split(merged_mimic_df, num_chunks)

# Placeholder for processed chunks
processed_chunks = []

for i, chunk in enumerate(chunk_list):
    try:
        logging.info(f"Processing chunk {i + 1}/{num_chunks}...")
        
        # Convert Braden columns to numeric, coercing errors to NaN
        for col in braden_columns:
            chunk[col] = pd.to_numeric(chunk[col], errors='coerce')
        
        # Aggregate Braden values
        chunk['Braden (Mean)'] = chunk[braden_mean_cols].sum(axis=1, skipna=True)
        chunk['Braden (Median)'] = chunk[braden_median_cols].sum(axis=1, skipna=True)
        chunk['Braden (Min)'] = chunk[braden_min_cols].sum(axis=1, skipna=True)
        chunk['Braden (Max)'] = chunk[braden_max_cols].sum(axis=1, skipna=True)

        # Replace rows where all original Braden columns were NaN with NaN in the new Braden columns
        mask_mean = chunk[braden_mean_cols].isna().all(axis=1)
        mask_median = chunk[braden_median_cols].isna().all(axis=1)
        mask_min = chunk[braden_min_cols].isna().all(axis=1)
        mask_max = chunk[braden_max_cols].isna().all(axis=1)

        chunk.loc[mask_mean, 'Braden (Mean)'] = np.nan
        chunk.loc[mask_median, 'Braden (Median)'] = np.nan
        chunk.loc[mask_min, 'Braden (Min)'] = np.nan
        chunk.loc[mask_max, 'Braden (Max)'] = np.nan

        # Drop the original Braden component columns
        chunk.drop(columns=braden_columns, inplace=True)

        # Append processed chunk to the list
        processed_chunks.append(chunk)
        logging.info(f"Chunk {i + 1}/{num_chunks} processed successfully.")

    except Exception as e:
        logging.error(f"Error processing chunk {i + 1}: {e}")

# Concatenate all processed chunks into a single DataFrame
merged_mimic_df = pd.concat(processed_chunks, ignore_index=True)

logging.info("All chunks processed and concatenated successfully.")

2025-03-22 23:02:38,380 - INFO - Starting chunk-wise processing of Braden components.
  return bound(*args, **kwds)
2025-03-22 23:02:41,051 - INFO - Processing chunk 1/10...
2025-03-22 23:02:41,187 - INFO - Chunk 1/10 processed successfully.
2025-03-22 23:02:41,189 - INFO - Processing chunk 2/10...
2025-03-22 23:02:41,341 - INFO - Chunk 2/10 processed successfully.
2025-03-22 23:02:41,342 - INFO - Processing chunk 3/10...
2025-03-22 23:02:41,484 - INFO - Chunk 3/10 processed successfully.
2025-03-22 23:02:41,485 - INFO - Processing chunk 4/10...
2025-03-22 23:02:41,619 - INFO - Chunk 4/10 processed successfully.
2025-03-22 23:02:41,620 - INFO - Processing chunk 5/10...
2025-03-22 23:02:41,758 - INFO - Chunk 5/10 processed successfully.
2025-03-22 23:02:41,759 - INFO - Processing chunk 6/10...
2025-03-22 23:02:41,887 - INFO - Chunk 6/10 processed successfully.
2025-03-22 23:02:41,888 - INFO - Processing chunk 7/10...
2025-03-22 23:02:42,020 - INFO - Chunk 7/10 processed successfully.
20

In [10]:
# Remove spaces and commas
merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'[ ,]+', '_', regex=True)

# Drop second column from the column_names_df
mimic_columns_to_keep.drop(columns=['Unnamed: 1'], inplace=True)

# Extract column names from columns_to_keep DataFrame
columns_to_keep_names = mimic_columns_to_keep['column'].tolist()

# Select only the desired columns
mimic_temp = merged_mimic_df[columns_to_keep_names]

# Remove Duplicate Columns
df_mimic_unique = mimic_temp.loc[:, ~mimic_temp.columns.duplicated()]

In [12]:
# Multiply values by 4 in 'Ionized Calcium' column, leaving NaN values unchanged for normalization with eicu
df_mimic_unique.loc[:, 'Ionized_Calcium_(Max)'] = mimic_temp['Ionized_Calcium_(Max)'].apply(lambda x: x * 4 if pd.notna(x) else x)
df_mimic_unique.loc[:, 'Ionized_Calcium_(Mean)'] = mimic_temp['Ionized_Calcium_(Mean)'].apply(lambda x: x * 4 if pd.notna(x) else x)
df_mimic_unique.loc[:, 'Ionized_Calcium_(Median)'] = mimic_temp['Ionized_Calcium_(Median)'].apply(lambda x: x * 4 if pd.notna(x) else x)
df_mimic_unique.loc[:, 'Ionized_Calcium_(Min)'] = mimic_temp['Ionized_Calcium_(Min)'].apply(lambda x: x * 4 if pd.notna(x) else x)

# Make a copy df_mimic_unique in order to avoid SettingWithCopyWarning
df_mimic_unique = df_mimic_unique.copy()

# Glucose merge - calculate the mean for each aggregation type and handle NaN values
df_mimic_unique['Glucose (Max)'] = df_mimic_unique[['Glucose_(Max)', 'Glucose_(Max).1', 'Glucose_(Max).2']].mean(axis=1)
df_mimic_unique['Glucose (Mean)'] = df_mimic_unique[['Glucose_(Mean)', 'Glucose_(Mean).1', 'Glucose_(Mean).2']].mean(axis=1)
df_mimic_unique['Glucose (Median)'] = df_mimic_unique[['Glucose_(Median)', 'Glucose_(Median).1', 'Glucose_(Median).2']].mean(axis=1)
df_mimic_unique['Glucose (Min)'] = df_mimic_unique[['Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2']].mean(axis=1)

# Drop original Glucose columns to keep only the summarized columns
df_mimic_unique.drop(columns=[
    'Glucose_(Max)', 'Glucose_(Max).1', 'Glucose_(Max).2',
    'Glucose_(Mean)', 'Glucose_(Mean).1', 'Glucose_(Mean).2',
    'Glucose_(Median)', 'Glucose_(Median).1', 'Glucose_(Median).2',
    'Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2'
], inplace=True)

In [13]:
logging.info("Starting the pH column processing.")

# Make a copy of df_mimic_unique to avoid SettingWithCopyWarning
df_mimic_unique = df_mimic_unique.copy()

# pH merge
logging.info("Calculating 'pH (Max)' column.")
df_mimic_unique.loc[:, 'pH (Max)'] = df_mimic_unique.apply(
    lambda row: row[['pH_(Max)', 'pH_(Max).1', 'pH_(Max).3']].mean()
    if not all(row[['pH_(Max)', 'pH_(Max).1', 'pH_(Max).3']].isna())
    else np.nan, axis=1
)

logging.info("Calculating 'pH (Mean)' column.")
df_mimic_unique.loc[:, 'pH (Mean)'] = df_mimic_unique.apply(
    lambda row: row[['pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3']].mean()
    if not all(row[['pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3']].isna())
    else np.nan, axis=1
)

logging.info("Calculating 'pH (Median)' column.")
df_mimic_unique.loc[:, 'pH (Median)'] = df_mimic_unique.apply(
    lambda row: row[['pH_(Median)', 'pH_(Median).1', 'pH_(Median).3']].mean()
    if not all(row[['pH_(Median)', 'pH_(Median).1', 'pH_(Median).3']].isna())
    else np.nan, axis=1
)

logging.info("Calculating 'pH (Min)' column.")
df_mimic_unique.loc[:, 'pH (Min)'] = df_mimic_unique.apply(
    lambda row: row[['pH_(Min)', 'pH_(Min).1', 'pH_(Min).3']].mean()
    if not all(row[['pH_(Min)', 'pH_(Min).1', 'pH_(Min).3']].isna())
    else np.nan, axis=1
)

# Drop original pH columns to keep only the summarized columns
logging.info("Dropping original pH columns.")
df_mimic_unique.drop(columns=[
    'pH_(Max)', 'pH_(Max).1', 'pH_(Max).3',
    'pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3',
    'pH_(Median)', 'pH_(Median).1', 'pH_(Median).3',
    'pH_(Min)', 'pH_(Min).1', 'pH_(Min).3'
], inplace=True)

logging.info("pH column processing completed successfully.")

2025-03-22 23:04:16,385 - INFO - Starting the pH column processing.
2025-03-22 23:04:16,532 - INFO - Calculating 'pH (Max)' column.
2025-03-22 23:04:59,228 - INFO - Calculating 'pH (Mean)' column.
2025-03-22 23:05:40,445 - INFO - Calculating 'pH (Median)' column.
2025-03-22 23:06:21,719 - INFO - Calculating 'pH (Min)' column.
2025-03-22 23:07:01,789 - INFO - Dropping original pH columns.
2025-03-22 23:07:01,835 - INFO - pH column processing completed successfully.


# eICU Ver. 2.0

In [14]:
logging.info("Starting the merging process for eICU dataframes.")

try:
    # Merge eICU dataframes
    logging.info("Merging eicu_meam_df and eicu_median_df...")
    merged_eicu_df = eicu_mean_df.merge(
        eicu_median_df, 
        on=['row_count', 'uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity', 'unitdischargestatus', 'LOS'], 
        suffixes=('_mean', '_median')
    )
    logging.info("Merged eicu_meam_df and eicu_median_df successfully.")

    logging.info("Merging with eicu_min_df...")
    merged_eicu_df = merged_eicu_df.merge(
        eicu_min_df, 
        on=['row_count', 'uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity', 'unitdischargestatus', 'LOS'], 
        suffixes=('', '_min')
    )
    logging.info("Merged with eicu_min_df successfully.")

    logging.info("Merging with eicu_max_df...")
    merged_eicu_df = merged_eicu_df.merge(
        eicu_max_df, 
        on=['row_count', 'uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity', 'unitdischargestatus', 'LOS'], 
        suffixes=('', '_max')
    )
    logging.info("Merged with eicu_max_df successfully.")

    # Move the 'unitdischargestatus' and 'LOS' columns to the end of the dataframe
    logging.info("Reordering columns: moving 'unitdischargestatus' and 'LOS' to the end...")
    unitdischargestatus_column = merged_eicu_df.pop('unitdischargestatus')
    los_column = merged_eicu_df.pop('LOS')
    merged_eicu_df = pd.concat([merged_eicu_df, unitdischargestatus_column, los_column], axis=1)
    logging.info("Reordered columns successfully.")

    # Rename the last two columns to preserve their original names
    logging.info("Renaming the last two columns to preserve original names...")
    merged_eicu_df.columns = list(merged_eicu_df.columns[:-2]) + ['unitdischargestatus', 'LOS']
    logging.info("Renamed the last two columns successfully.")

except Exception as e:
    logging.error(f"An error occurred during the merging process: {e}")

logging.info("Merging process for eICU dataframes completed successfully.")

2025-03-22 23:08:03,695 - INFO - Starting the merging process for eICU dataframes.
2025-03-22 23:08:03,697 - INFO - Merging eicu_meam_df and eicu_median_df...
2025-03-22 23:08:03,946 - INFO - Merged eicu_meam_df and eicu_median_df successfully.
2025-03-22 23:08:03,947 - INFO - Merging with eicu_min_df...
2025-03-22 23:08:04,393 - INFO - Merged with eicu_min_df successfully.
2025-03-22 23:08:04,394 - INFO - Merging with eicu_max_df...
2025-03-22 23:08:05,004 - INFO - Merged with eicu_max_df successfully.
2025-03-22 23:08:05,006 - INFO - Reordering columns: moving 'unitdischargestatus' and 'LOS' to the end...
2025-03-22 23:08:05,703 - INFO - Reordered columns successfully.
2025-03-22 23:08:05,704 - INFO - Renaming the last two columns to preserve original names...
2025-03-22 23:08:05,705 - INFO - Renamed the last two columns successfully.
2025-03-22 23:08:05,707 - INFO - Merging process for eICU dataframes completed successfully.


In [15]:
# Drop second column from the column_names_df
eicu_columns_to_keep.drop(columns=['Unnamed: 1'], inplace=True)

# Extract column names from columns_to_keep DataFrame
columns_to_keep_names = eicu_columns_to_keep['column'].tolist()

# Select only the desired columns
eicu_temp = merged_eicu_df[columns_to_keep_names]

In [16]:
"""--------Replace Block----------"""
# Make a copy df_mimic_unique in order to avoid SettingWithCopyWarning
eicu_temp = eicu_temp.copy()

# Replace 'Alive' with 0 and 'Expired' with 1 in the 'unitdischargestatus' column
eicu_temp.loc[:, 'unitdischargestatus'] = eicu_temp['unitdischargestatus'].replace({'Alive': 0, 'Expired': 1})

# Replace 'Female' with 'F' and 'Male' with 'M' in the 'gender' column
eicu_temp.loc[:, 'gender'] = eicu_temp['gender'].replace({'Female': 'F', 'Male': 'M'})


# Replace values in the 'ethnicity' column for standardization
eicu_temp.loc[:, 'ethnicity'] = eicu_temp['ethnicity'].replace({
    'African American': 'BLACK/AFRICAN AMERICAN',
    'Caucasian': 'WHITE',
    'Hispanic': 'HISPANIC OR LATINO',
    'Asian': 'ASIAN',
    'Native American': 'AMERICAN INDIAN/ALASKA NATIVE',
    'Other/Unknown': 'UNKNOWN'
})

# Replace age values higher than 89 with 90, and convert age to integer
eicu_temp.loc[:, 'age'] = eicu_temp['age'].replace('> 89', 90)
eicu_temp.loc[:, 'age'] = eicu_temp['age'].astype(np.int64)

  eicu_temp.loc[:, 'unitdischargestatus'] = eicu_temp['unitdischargestatus'].replace({'Alive': 0, 'Expired': 1})


In [17]:
# Remove spaces and commas
df_mimic_unique.columns = df_mimic_unique.columns.str.replace(r'[ ,]+', '_', regex=True)

eicu_temp.columns = eicu_temp.columns.str.replace(r'[ ,]+', '_', regex=True)

In [18]:
logging.info("Starting the processing of bedside glucose columns in eicu_temp.")

try:
    # Make a copy of eicu_temp to avoid SettingWithCopyWarning
    eicu_temp = eicu_temp.copy()

    # Calculate 'bedside_glucose (Max)'
    logging.info("Calculating 'bedside_glucose (Max)' column.")
    eicu_temp.loc[:, 'bedside_glucose (Max)'] = eicu_temp.apply(
        lambda row: row[['bedside_glucose_(Max)', 'Bedside_Glucose_(Max)']].mean()
        if not all(row[['bedside_glucose_(Max)', 'Bedside_Glucose_(Max)']].isna())
        else np.nan, axis=1
    )

    # Calculate 'bedside_glucose (Mean)'
    logging.info("Calculating 'bedside_glucose (Mean)' column.")
    eicu_temp.loc[:, 'bedside_glucose (Mean)'] = eicu_temp.apply(
        lambda row: row[['bedside_glucose_(Mean)', 'Bedside_Glucose_(Mean)']].mean()
        if not all(row[['bedside_glucose_(Mean)', 'Bedside_Glucose_(Mean)']].isna())
        else np.nan, axis=1
    )

    # Calculate 'bedside_glucose (Median)'
    logging.info("Calculating 'bedside_glucose (Median)' column.")
    eicu_temp.loc[:, 'bedside_glucose (Median)'] = eicu_temp.apply(
        lambda row: row[['bedside_glucose_(Median)', 'Bedside_Glucose_(Median)']].mean()
        if not all(row[['bedside_glucose_(Median)', 'Bedside_Glucose_(Median)']].isna())
        else np.nan, axis=1
    )

    # Calculate 'bedside_glucose (Min)'
    logging.info("Calculating 'bedside_glucose (Min)' column.")
    eicu_temp.loc[:, 'bedside_glucose (Min)'] = eicu_temp.apply(
        lambda row: row[['bedside_glucose_(Min)', 'Bedside_Glucose_(Min)']].mean()
        if not all(row[['bedside_glucose_(Min)', 'Bedside_Glucose_(Min)']].isna())
        else np.nan, axis=1
    )

    # Drop original bedside glucose columns
    logging.info("Dropping original bedside glucose columns.")
    eicu_temp.drop(columns=[
        'bedside_glucose_(Max)', 'Bedside_Glucose_(Max)',
        'bedside_glucose_(Mean)', 'Bedside_Glucose_(Mean)',
        'bedside_glucose_(Median)', 'Bedside_Glucose_(Median)',
        'bedside_glucose_(Min)', 'Bedside_Glucose_(Min)'
    ], inplace=True)

    # Replace spaces or commas in column names with underscores
    logging.info("Replacing spaces and commas in column names with underscores.")
    eicu_temp.columns = eicu_temp.columns.str.replace(r'[ ,]+', '_', regex=True)

    logging.info("Processing of bedside glucose columns completed successfully.")

except Exception as e:
    logging.error(f"An error occurred during the processing of bedside glucose columns: {e}")

2025-03-22 23:08:57,589 - INFO - Starting the processing of bedside glucose columns in eicu_temp.
2025-03-22 23:08:57,667 - INFO - Calculating 'bedside_glucose (Max)' column.
2025-03-22 23:10:02,906 - INFO - Calculating 'bedside_glucose (Mean)' column.
2025-03-22 23:11:09,312 - INFO - Calculating 'bedside_glucose (Median)' column.
2025-03-22 23:12:15,566 - INFO - Calculating 'bedside_glucose (Min)' column.
2025-03-22 23:13:22,081 - INFO - Dropping original bedside glucose columns.
2025-03-22 23:13:22,148 - INFO - Replacing spaces and commas in column names with underscores.
2025-03-22 23:13:22,150 - INFO - Processing of bedside glucose columns completed successfully.


In [19]:
# Rename eICU header to align with mimics
column_eicu_mapping = {
    'column': 'column',
    'row_count': 'row_count',
    'uniquepid': 'subject_id',
    'patientunitstayid': 'hadm_id',
    'Time_Zone': 'Time_Zone',
    'gender': 'gender',
    'age': 'age',
    'ethnicity': 'race',
    'Base_Excess_(Max)': 'Base_Excess_(Max)',
    'Base_Excess_(Mean)': 'Base_Excess_(Mean)',
    'Base_Excess_(Median)': 'Base_Excess_(Median)',
    'Base_Excess_(Min)': 'Base_Excess_(Min)',
    'lactate_(Max)': 'Lactate_(Max)',
    'lactate_(Mean)': 'Lactate_(Mean)',
    'lactate_(Median)': 'Lactate_(Median)',
    'lactate_(Min)': 'Lactate_(Min)',
    'paCO2_(Max)': 'pCO2_(Max)',
    'paCO2_(Mean)': 'pCO2_(Mean)',
    'paCO2_(Median)': 'pCO2_(Median)',
    'paCO2_(Min)': 'pCO2_(Min)',
    'Total_CO2_(Max)': 'Calculated_Total_CO2_(Max)',
    'Total_CO2_(Mean)': 'Calculated_Total_CO2_(Mean)',
    'Total_CO2_(Median)': 'Calculated_Total_CO2_(Median)',
    'Total_CO2_(Min)': 'Calculated_Total_CO2_(Min)',
    'BUN_(Max)': 'BUN_(Max)',
    'BUN_(Mean)': 'BUN_(Mean)',
    'BUN_(Median)': 'BUN_(Median)',
    'BUN_(Min)': 'BUN_(Min)',
    'pH_(Max)': 'pH_(Max)',
    'pH_(Mean)': 'pH_(Mean)',
    'pH_(Median)': 'pH_(Median)',
    'pH_(Min)': 'pH_(Min)',
    'paO2_(Max)': 'pO2_(Max)',
    'paO2_(Mean)': 'pO2_(Mean)',
    'paO2_(Median)': 'pO2_(Median)',
    'paO2_(Min)': 'pO2_(Min)',
    'ALT_(SGPT)_(Max)': 'Alanine_Aminotransferase_(ALT)_(Max)',
    'ALT_(SGPT)_(Mean)': 'Alanine_Aminotransferase_(ALT)_(Mean)',
    'ALT_(SGPT)_(Median)': 'Alanine_Aminotransferase_(ALT)_(Median)',
    'ALT_(SGPT)_(Min)': 'Alanine_Aminotransferase_(ALT)_(Min)',
    'alkaline_phos._(Max)': 'Alkaline_Phosphatase_(Max)',
    'alkaline_phos._(Mean)': 'Alkaline_Phosphatase_(Mean)',
    'alkaline_phos._(Median)': 'Alkaline_Phosphatase_(Median)',
    'alkaline_phos._(Min)': 'Alkaline_Phosphatase_(Min)',
    'anion_gap_(Max)': 'Anion_Gap_(Max)',
    'anion_gap_(Mean)': 'Anion_Gap_(Mean)',
    'anion_gap_(Median)': 'Anion_Gap_(Median)',
    'anion_gap_(Min)': 'Anion_Gap_(Min)',
    'AST_(SGOT)_(Max)': 'Asparate_Aminotransferase_(AST)_(Max)',
    'AST_(SGOT)_(Mean)': 'Asparate_Aminotransferase_(AST)_(Mean)',
    'AST_(SGOT)_(Median)': 'Asparate_Aminotransferase_(AST)_(Median)',
    'AST_(SGOT)_(Min)': 'Asparate_Aminotransferase_(AST)_(Min)',
    'bicarbonate_(Max)': 'Bicarbonate_(Max)',
    'bicarbonate_(Mean)': 'Bicarbonate_(Mean)',
    'bicarbonate_(Median)': 'Bicarbonate_(Median)',
    'bicarbonate_(Min)': 'Bicarbonate_(Min)',
    'chloride_(Max)': 'Chloride_(Max)',
    'chloride_(Mean)': 'Chloride_(Mean)',
    'chloride_(Median)': 'Chloride_(Median)',
    'chloride_(Min)': 'Chloride_(Min)',
    'creatinine_(Max)': 'Creatinine_(Max)',
    'creatinine_(Mean)': 'Creatinine_(Mean)',
    'creatinine_(Median)': 'Creatinine_(Median)',
    'creatinine_(Min)': 'Creatinine_(Min)',
    'glucose_(Max)': 'Glucose_(Max)',
    'glucose_(Mean)': 'Glucose_(Mean)',
    'glucose_(Median)': 'Glucose_(Median)',
    'glucose_(Min)': 'Glucose_(Min)',
    'magnesium_(Max)': 'Magnesium_(Max)',
    'magnesium_(Mean)': 'Magnesium_(Mean)',
    'magnesium_(Median)': 'Magnesium_(Median)',
    'magnesium_(Min)': 'Magnesium_(Min)',
    'phosphate_(Max)': 'Phosphate_(Max)',
    'phosphate_(Mean)': 'Phosphate_(Mean)',
    'phosphate_(Median)': 'Phosphate_(Median)',
    'phosphate_(Min)': 'Phosphate_(Min)',
    'potassium_(Max)': 'Potassium_(Max)',
    'potassium_(Mean)': 'Potassium_(Mean)',
    'potassium_(Median)': 'Potassium_(Median)',
    'potassium_(Min)': 'Potassium_(Min)',
    'sodium_(Max)': 'Sodium_(Max)',
    'sodium_(Mean)': 'Sodium_(Mean)',
    'sodium_(Median)': 'Sodium_(Median)',
    'sodium_(Min)': 'Sodium_(Min)',
    'Hct_(Max)': 'Hematocrit_(Max)',
    'Hct_(Mean)': 'Hematocrit_(Mean)',
    'Hct_(Median)': 'Hematocrit_(Median)',
    'Hct_(Min)': 'Hematocrit_(Min)',
    'Hgb_(Max)': 'Hemoglobin_(Max)',
    'Hgb_(Mean)': 'Hemoglobin_(Mean)',
    'Hgb_(Median)': 'Hemoglobin_(Median)',
    'Hgb_(Min)': 'Hemoglobin_(Min)',
    'PT_-_INR_(Max)': 'INR(PT)_(Max)',
    'PT_-_INR_(Mean)': 'INR(PT)_(Mean)',
    'PT_-_INR_(Median)': 'INR(PT)_(Median)',
    'PT_-_INR_(Min)': 'INR(PT)_(Min)',
    'MCH_(Max)': 'MCH_(Max)',
    'MCH_(Mean)': 'MCH_(Mean)',
    'MCH_(Median)': 'MCH_(Median)',
    'MCH_(Min)': 'MCH_(Min)',
    'MCHC_(Max)': 'MCHC_(Max)',
    'MCHC_(Mean)': 'MCHC_(Mean)',
    'MCHC_(Median)': 'MCHC_(Median)',
    'MCHC_(Min)': 'MCHC_(Min)',
    'MCV_(Max)': 'MCV_(Max)',
    'MCV_(Mean)': 'MCV_(Mean)',
    'MCV_(Median)': 'MCV_(Median)',
    'MCV_(Min)': 'MCV_(Min)',
    'platelets_x_1000_(Max)': 'Platelet_Count_(Max)',
    'platelets_x_1000_(Mean)': 'Platelet_Count_(Mean)',
    'platelets_x_1000_(Median)': 'Platelet_Count_(Median)',
    'platelets_x_1000_(Min)': 'Platelet_Count_(Min)',
    'PT_(Max)': 'PT_(Max)',
    'PT_(Mean)': 'PT_(Mean)',
    'PT_(Median)': 'PT_(Median)',
    'PT_(Min)': 'PT_(Min)',
    'PTT_(Max)': 'PTT_(Max)',
    'PTT_(Mean)': 'PTT_(Mean)',
    'PTT_(Median)': 'PTT_(Median)',
    'PTT_(Min)': 'PTT_(Min)',
    'RDW_(Max)': 'RDW_(Max)',
    'RDW_(Mean)': 'RDW_(Mean)',
    'RDW_(Median)': 'RDW_(Median)',
    'RDW_(Min)': 'RDW_(Min)',
    'RBC_(Max)': 'Red_Blood_Cells_(Max)',
    'RBC_(Mean)': 'Red_Blood_Cells_(Mean)',
    'RBC_(Median)': 'Red_Blood_Cells_(Median)',
    'RBC_(Min)': 'Red_Blood_Cells_(Min)',
    'WBC_x_1000_(Max)': 'White_Blood_Cells_(Max)',
    'WBC_x_1000_(Mean)': 'White_Blood_Cells_(Mean)',
    'WBC_x_1000_(Median)': 'White_Blood_Cells_(Median)',
    'WBC_x_1000_(Min)': 'White_Blood_Cells_(Min)',
    'Heart_Rate_(Max)': 'Heart_Rate_(bpm)_(Max)',
    'Heart_Rate_(Mean)': 'Heart_Rate_(bpm)_(Mean)',
    'Heart_Rate_(Median)': 'Heart_Rate_(bpm)_(Median)',
    'Heart_Rate_(Min)': 'Heart_Rate_(bpm)_(Min)',
    'Non-Invasive_BP_Diastolic_(Max)': 'Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Max)',
    'Non-Invasive_BP_Diastolic_(Mean)': 'Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Mean)',
    'Non-Invasive_BP_Diastolic_(Median)': 'Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Median)',
    'Non-Invasive_BP_Diastolic_(Min)': 'Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Min)',
    'Non-Invasive_BP_Systolic_(Max)': 'Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Max)',
    'Non-Invasive_BP_Systolic_(Mean)': 'Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Mean)',
    'Non-Invasive_BP_Systolic_(Median)': 'Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Median)',
    'Non-Invasive_BP_Systolic_(Min)': 'Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Min)',
    'Non-Invasive_BP_Mean_(Max)': 'Non_Invasive_Blood_Pressure_mean_(mmHg)_(Max)',
    'Non-Invasive_BP_Mean_(Mean)': 'Non_Invasive_Blood_Pressure_mean_(mmHg)_(Mean)',
    'Non-Invasive_BP_Mean_(Median)': 'Non_Invasive_Blood_Pressure_mean_(mmHg)_(Median)',
    'Non-Invasive_BP_Mean_(Min)': 'Non_Invasive_Blood_Pressure_mean_(mmHg)_(Min)',
    'Respiratory_Rate_(Max)': 'Respiratory_Rate_(insp/min)_(Max)',
    'Respiratory_Rate_(Mean)': 'Respiratory_Rate_(insp/min)_(Mean)',
    'Respiratory_Rate_(Median)': 'Respiratory_Rate_(insp/min)_(Median)',
    'Respiratory_Rate_(Min)': 'Respiratory_Rate_(insp/min)_(Min)',
    'O2_Saturation_(Max)': 'O2_saturation_pulseoxymetry_(%)_(Max)',
    'O2_Saturation_(Mean)': 'O2_saturation_pulseoxymetry_(%)_(Mean)',
    'O2_Saturation_(Median)': 'O2_saturation_pulseoxymetry_(%)_(Median)',
    'O2_Saturation_(Min)': 'O2_saturation_pulseoxymetry_(%)_(Min)',
    'CI_(Max)': 'Chloride_(serum)_(Max)',
    'CI_(Mean)': 'Chloride_(serum)_(Mean)',
    'CI_(Median)': 'Chloride_(serum)_(Median)',
    'CI_(Min)': 'Chloride_(serum)_(Min)',
    'calcium_(Max)': 'Calcium_non-ionized_(Max)',
    'calcium_(Mean)': 'Calcium_non-ionized_(Mean)',
    'calcium_(Median)': 'Calcium_non-ionized_(Median)',
    'calcium_(Min)': 'Calcium_non-ionized_(Min)',
    'CPK_(Max)': 'CK_(CPK)_(Max)',
    'CPK_(Mean)': 'CK_(CPK)_(Mean)',
    'CPK_(Median)': 'CK_(CPK)_(Median)',
    'CPK_(Min)': 'CK_(CPK)_(Min)',
    'Temperature_(F)_(Max)': 'Temperature_Fahrenheit_(F)_(Max)',
    'Temperature_(F)_(Mean)': 'Temperature_Fahrenheit_(F)_(Mean)',
    'Temperature_(F)_(Median)': 'Temperature_Fahrenheit_(F)_(Median)',
    'Temperature_(F)_(Min)': 'Temperature_Fahrenheit_(F)_(Min)',
    'Pain_Score_(Max)': 'Pain_Level_(Max)',
    'Pain_Score_(Mean)': 'Pain_Level_(Mean)',
    'Pain_Score_(Median)': 'Pain_Level_(Median)',
    'Pain_Score_(Min)': 'Pain_Level_(Min)',
    'LPM_O2_(Max)': 'O2_Flow_(L/min)_(Max)',
    'LPM_O2_(Mean)': 'O2_Flow_(L/min)_(Mean)',
    'LPM_O2_(Median)': 'O2_Flow_(L/min)_(Median)',
    'LPM_O2_(Min)': 'O2_Flow_(L/min)_(Min)',
    'O2_L/%_(Max)': 'Inspired_O2_Fraction_(Max)',
    'O2_L/%_(Mean)': 'Inspired_O2_Fraction_(Mean)',
    'O2_L/%_(Median)': 'Inspired_O2_Fraction_(Median)',
    'O2_L/%_(Min)': 'Inspired_O2_Fraction_(Min)',
    'ionized_calcium_(Max)': 'Ionized_Calcium_(Max)',
    'ionized_calcium_(Mean)': 'Ionized_Calcium_(Mean)',
    'ionized_calcium_(Median)': 'Ionized_Calcium_(Median)',
    'ionized_calcium_(Min)': 'Ionized_Calcium_(Min)',
    'albumin_(Max)': 'Albumin_(Max)',
    'albumin_(Mean)': 'Albumin_(Mean)',
    'albumin_(Median)': 'Albumin_(Median)',
    'albumin_(Min)': 'Albumin_(Min)',
    'GCS_Total_(Max)': 'GCS_(Max)',
    'GCS_Total_(Mean)': 'GCS_(Mean)',
    'GCS_Total_(Median)': 'GCS_(Median)',
    'GCS_Total_(Min)': 'GCS_(Min)',
    'LDH_(Max)': 'LDH_(Max)',
    'LDH_(Mean)': 'LDH_(Mean)',
    'LDH_(Median)': 'LDH_(Median)',
    'LDH_(Min)': 'LDH_(Min)',
    'ethanol_(Max)': 'ETOH_(Max)',
    'ethanol_(Mean)': 'ETOH_(Mean)',
    'ethanol_(Median)': 'ETOH_(Median)',
    'ethanol_(Min)': 'ETOH_(Min)',
    'Invasive_BP_Systolic_(Max)': 'Arterial_Blood_Pressure_systolic_(mmHg)_(Max)',
    'Invasive_BP_Systolic_(Mean)': 'Arterial_Blood_Pressure_systolic_(mmHg)_(Mean)',
    'Invasive_BP_Systolic_(Median)': 'Arterial_Blood_Pressure_systolic_(mmHg)_(Median)',
    'Invasive_BP_Systolic_(Min)': 'Arterial_Blood_Pressure_systolic_(mmHg)_(Min)',
    'Invasive_BP_Mean_(Max)': 'Arterial_Blood_Pressure_mean_(mmHg)_(Max)',
    'Invasive_BP_Mean_(Mean)': 'Arterial_Blood_Pressure_mean_(mmHg)_(Mean)',
    'Invasive_BP_Mean_(Median)': 'Arterial_Blood_Pressure_mean_(mmHg)_(Median)',
    'Invasive_BP_Mean_(Min)': 'Arterial_Blood_Pressure_mean_(mmHg)_(Min)',
    'serum_osmolality_(Max)': 'Serum_Osmolality_(Max)',
    'serum_osmolality_(Mean)': 'Serum_Osmolality_(Mean)',
    'serum_osmolality_(Median)': 'Serum_Osmolality_(Median)',
    'serum_osmolality_(Min)': 'Serum_Osmolality_(Min)',
    'troponin_-_I_(Max)': 'Troponin-T_(Max)',
    'troponin_-_I_(Mean)': 'Troponin-T_(Mean)',
    'troponin_-_I_(Median)': 'Troponin-T_(Median)',
    'troponin_-_I_(Min)': 'Troponin-T_(Min)',
    'uric_acid_(Max)': 'Uric_Acid_(Max)',
    'uric_acid_(Mean)': 'Uric_Acid_(Mean)',
    'uric_acid_(Median)': 'Uric_Acid_(Median)',
    'uric_acid_(Min)': 'Uric_Acid_(Min)',
    'ammonia_(Max)': 'Ammonia_(Max)',
    'ammonia_(Mean)': 'Ammonia_(Mean)',
    'ammonia_(Median)': 'Ammonia_(Median)',
    'ammonia_(Min)': 'Ammonia_(Min)',
    'CRP_(Max)': 'C_Reactive_Protein_(CRP)_(Max)',
    'CRP_(Mean)': 'C_Reactive_Protein_(CRP)_(Mean)',
    'CRP_(Median)': 'C_Reactive_Protein_(CRP)_(Min)',
    'CRP_(Min)': 'C_Reactive_Protein_(CRP)_(Median)',
    'fibrinogen_(Max)': 'Fibrinogen_(Max)',
    'fibrinogen_(Mean)': 'Fibrinogen_(Mean)',
    'fibrinogen_(Median)': 'Fibrinogen_(Median)',
    'fibrinogen_(Min)': 'Fibrinogen_(Min)',
    'PA_Systolic_(Max)': 'Pulmonary_Artery_Pressure_systolic_(mmHg)_(Max)',
    'PA_Systolic_(Mean)': 'Pulmonary_Artery_Pressure_systolic_(mmHg)_(Mean)',
    'PA_Systolic_(Median)': 'Pulmonary_Artery_Pressure_systolic_(mmHg)_(Median)',
    'PA_Systolic_(Min)': 'Pulmonary_Artery_Pressure_systolic_(mmHg)_(Min)',	
    'PA_Diastolic_(Max)': 'Pulmonary_Artery_Pressure_diastolic_(mmHg)_(Max)',
    'PA_Diastolic_(Mean)': 'Pulmonary_Artery_Pressure_diastolic_(mmHg)_(Mean)',
    'PA_Diastolic_(Median)': 'Pulmonary_Artery_Pressure_diastolic_(mmHg)_(Median)',
    'PA_Diastolic_(Min)': 'Pulmonary_Artery_Pressure_diastolic_(mmHg)_(Min)',
    'PA_Mean_(Max)': 'Pulmonary_Artery_Pressure_mean_(mmHg)_(Max)',
    'PA_Mean_(Mean)': 'Pulmonary_Artery_Pressure_mean_(mmHg)_(Mean)',
    'PA_Mean_(Median)': 'Pulmonary_Artery_Pressure_mean_(mmHg)_(Median)',
    'PA_Mean_(Min)': 'Pulmonary_Artery_Pressure_mean_(mmHg)_(Min)',
    'bedside_glucose_(Max)': 'Glucose_finger_stick_(range_70-100)_(Max)',
    'bedside_glucose_(Mean)': 'Glucose_finger_stick_(range_70-100)_(Mean)',
    'bedside_glucose_(Median)': 'Glucose_finger_stick_(range_70-100)_(Median)',
    'bedside_glucose_(Min)': 'Glucose_finger_stick_(range_70-100)_(Min)',
    'reticulocyte_count_(Max)': 'Reticulocyte_Count_Automated_(Mean)',
    'reticulocyte_count_(Mean)': 'Reticulocyte_Count_Automated_(Median)',
    'reticulocyte_count_(Median)': 'Reticulocyte_Count_Automated_(Min)',
    'reticulocyte_count_(Min)': 'Reticulocyte_Count_Automated_(Max)',
    '-basos_(Max)': 'Differential-Basos_(Max)',
    '-basos_(Mean)': 'Differential-Basos_(Mean)',
    '-basos_(Median)': 'Differential-Basos_(Median)',
    '-basos_(Min)': 'Differential-Basos_(Min)',
    '-eos_(Max)': 'Differential-Eos_(Max)',
    '-eos_(Mean)': 'Differential-Eos_(Mean)',
    '-eos_(Median)': 'Differential-Eos_(Median)',
    '-eos_(Min)': 'Differential-Eos_(Min)',
    '-lymphs_(Max)': 'Differential-Lymphs_(Max)',
    '-lymphs_(Mean)': 'Differential-Lymphs_(Mean)',
    '-lymphs_(Median)': 'Differential-Lymphs_(Median)',
    '-lymphs_(Min)': 'Differential-Lymphs_(Min)',
    '-monos_(Max)': 'Differential-Monos_(Max)',
    '-monos_(Mean)': 'Differential-Monos_(Mean)',
    '-monos_(Median)': 'Differential-Monos_(Median)',
    '-monos_(Min)': 'Differential-Monos_(Min)',
    '-polys_(Max)': 'Differential-Neuts_(Max)',
    '-polys_(Mean)': 'Differential-Neuts_(Mean)',
    '-polys_(Median)': 'Differential-Neuts_(Median)',
    '-polys_(Min)': 'Differential-Neuts_(Min)',
    'haptoglobin_(Max)': 'Haptoglobin_(Max)',
    'haptoglobin_(Mean)': 'Haptoglobin_(Mean)',
    'haptoglobin_(Median)': 'Haptoglobin_(Median)',
    'haptoglobin_(Min)': 'Haptoglobin_(Min)',
    'direct_bilirubin_(Max)': 'Bilirubin_Direct_(Max)',
    'direct_bilirubin_(Mean)': 'Bilirubin_Direct_(Mean)',
    'direct_bilirubin_(Median)': 'Bilirubin_Direct_(Median)',
    'direct_bilirubin_(Min)': 'Bilirubin_Direct_(Min)',
    'free_T4_(Max)': 'Thyroxine_(T4)_Free_(Max)',
    'free_T4_(Mean)': 'Thyroxine_(T4)_Free_(Mean)',
    'free_T4_(Median)': 'Thyroxine_(T4)_Free_(Median)',
    'free_T4_(Min)': 'Thyroxine_(T4)_Free_(Min)',
    'ESR_(Max)': 'Sedimentation_Rate_(Max)',
    'ESR_(Mean)': 'Sedimentation_Rate_(Mean)',
    'ESR_(Median)': 'Sedimentation_Rate_(Median)',
    'ESR_(Min)': 'Sedimentation_Rate_(Min)',
    'CPK-MB_INDEX_(Max)': 'CK-MB_(Max)',
    'CPK-MB_INDEX_(Mean)': 'CK-MB_(Mean)',
    'CPK-MB_INDEX_(Median)': 'CK-MB_(Median)',
    'CPK-MB_INDEX_(Min)': 'CK-MB_(Min)',
    'amylase_(Max)': 'Amylase_(Max)',
    'amylase_(Mean)': 'Amylase_(Mean)',
    'amylase_(Median)': 'Amylase_(Median)',
    'amylase_(Min)': 'Amylase_(Min)',
    'PEEP_(Max)': 'PEEP_set_(cmH2O)_(Max)',
    'PEEP_(Mean)': 'PEEP_set_(cmH2O)_(Mean)',
    'PEEP_(Median)': 'PEEP_set_(cmH2O)_(Median)',
    'PEEP_(Min)': 'PEEP_set_(cmH2O)_(Min)',
    'CVP_(Max)': 'Central_Venous_Pressure_(mmHg)_(Max)',
    'CVP_(Mean)': 'Central_Venous_Pressure_(mmHg)_(Mean)',
    'CVP_(Median)': 'Central_Venous_Pressure_(mmHg)_(Median)',
    'CVP_(Min)': 'Central_Venous_Pressure_(mmHg)_(Min)',
    'total_bilirubin_(Max)': 'Total_Bilirubin_(Max)',
    'total_bilirubin_(Mean)': 'Total_Bilirubin_(Mean)',
    'total_bilirubin_(Median)': 'Total_Bilirubin_(Median)',
    'total_bilirubin_(Min)': 'Total_Bilirubin_(Min)',
    'Invasive_BP_Diastolic_(Max)': 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Max)',
    'Invasive_BP_Diastolic_(Mean)': 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Mean)',
    'Invasive_BP_Diastolic_(Median)': 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Median)',
    'Invasive_BP_Diastolic_(Min)': 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Min)',
    'unitdischargestatus': 'hospital_expire_flag',
    'LOS': 'los'
}

# Replace the DataFrame and column names mapping
eicu_temp.rename(columns=column_eicu_mapping, inplace=True)

temperature_rename_mapping = {
    'Temperature_Fahrenheit_(°F)_(Max)': 'Temperature_Fahrenheit_(F)_(Max)',
    'Temperature_Fahrenheit_(°F)_(Mean)': 'Temperature_Fahrenheit_(F)_(Mean)',
    'Temperature_Fahrenheit_(°F)_(Median)': 'Temperature_Fahrenheit_(F)_(Median)',
    'Temperature_Fahrenheit_(°F)_(Min)': 'Temperature_Fahrenheit_(F)_(Min)'
}

# Rename the columns using the dictionary and reassign the DataFrame
df_mimic_unique = df_mimic_unique.rename(columns=temperature_rename_mapping)

# Remove "-" from the 'subject_id' column in eicu
eicu_temp['subject_id'] = eicu_temp['subject_id'].str.replace('-', '')

# Convert 'subject_id' in eicu to int64
eicu_temp['subject_id'] = eicu_temp['subject_id'].astype(np.int64)

In [20]:
# Replace 'Survive' with 0 and 'Death' with 1 in the 'hospital_expire_flag' column
df_mimic_unique['hospital_expire_flag'] = df_mimic_unique['hospital_expire_flag'].replace({'Survive': 0, 'Death': 1})

  df_mimic_unique['hospital_expire_flag'] = df_mimic_unique['hospital_expire_flag'].replace({'Survive': 0, 'Death': 1})


In [26]:
# Check if mimic and eicu datasets have the same dtype and header names
# Get the column names from each DataFrame
mimic_columns = set(df_mimic_unique.columns)
eicu_columns = set(eicu_temp.columns)

# Get the column names and dtypes of mimic_df
mimic_info = df_mimic_unique.dtypes

# Get the column names and dtypes of eicu_df
eicu_info = eicu_temp.dtypes

# Find the columns that are in mimic_df but not in eicu_df
mimic_not_in_eicu = mimic_columns - eicu_columns

# Find the columns that are in eicu_df but not in mimic_df
eicu_not_in_mimic = eicu_columns - mimic_columns

# Display columns that are different
if mimic_not_in_eicu:
    print("Columns in mimic_df but not in eicu_df:")
    print(mimic_not_in_eicu)

if eicu_not_in_mimic:
    print("\nColumns in eicu_df but not in mimic_df:")
    print(eicu_not_in_mimic)

if not mimic_not_in_eicu and not eicu_not_in_mimic:
    print("The column names are identical between mimic_df and eicu_df.")

# Check if the number of columns is the same
if len(mimic_info) != len(eicu_info):
    print("Number of columns is different between mimic_df and eicu_df.")
else:
    # Iterate over the columns and compare the data type.
    for column_name in mimic_info.index:
        mimic_dtype = mimic_info[column_name]
        eicu_dtype = eicu_info[column_name]
        if mimic_dtype != eicu_dtype:
            print(f"Column '{column_name}' has different data types: mimic_df has '{mimic_dtype}' and eicu_df has '{eicu_dtype}'.")

The column names are identical between mimic_df and eicu_df.


In [22]:
# Convert 'age' in eicu_temp to numeric, handling any non-numeric values by coercing to NaN, then convert to Int64 (nullable integer type)
eicu_temp['age'] = pd.to_numeric(eicu_temp['age'], errors='coerce').astype('Int64')

# Convert 'hospital_expire_flag' in eicu_temp to numeric, handling non-numeric values, and convert to Int64
eicu_temp['hospital_expire_flag'] = pd.to_numeric(eicu_temp['hospital_expire_flag'], errors='coerce').astype('Int64')

# Ensure 'age' and 'hospital_expire_flag' in df_mimic_unique are also Int64 to handle any potential missing values consistently
df_mimic_unique['age'] = df_mimic_unique['age'].astype('Int64')
df_mimic_unique['hospital_expire_flag'] = df_mimic_unique['hospital_expire_flag'].astype('Int64')

In [23]:
# Put 'hospital_expire_flag' and 'los' to the end of df_mimic_unique
hospital_expire_flag_mimic = df_mimic_unique.pop('hospital_expire_flag')
los_mimic = df_mimic_unique.pop('los')
df_mimic_unique = pd.concat([df_mimic_unique, hospital_expire_flag_mimic, los_mimic], axis=1)

# Move 'hospital_expire_flag' and 'los' to the end of eicu_temp
hospital_expire_flag_eicu = eicu_temp.pop('hospital_expire_flag')
los_eicu = eicu_temp.pop('los')
eicu_temp = pd.concat([eicu_temp, hospital_expire_flag_eicu, los_eicu], axis=1)

In [24]:
try:
    logging.info("Starting the export of Mimic DataFrame to a CSV file.")
    
    # Define the output path
    output_path = (f"CSV/exports/whole_set/{subfolder}_mimic.csv")
    output_dir = os.path.dirname(output_path)
    
    # Ensure the directory exists
    if not os.path.exists(output_dir):
        logging.info(f"Directory {output_dir} does not exist. Creating it.")
        os.makedirs(output_dir, exist_ok=True)
    
    # Export the merged DataFrame to a CSV file
    df_mimic_unique.to_csv(output_path, index=False)
    logging.info(f"DataFrame successfully exported to {output_path}.")
except Exception as e:
    logging.error(f"An error occurred during the export of the DataFrame: {e}")

2025-03-22 23:15:01,021 - INFO - Starting the export of Mimic DataFrame to a CSV file.
2025-03-22 23:15:01,023 - INFO - Directory CSV/exports/whole_set does not exist. Creating it.
2025-03-22 23:15:13,629 - INFO - DataFrame successfully exported to CSV/exports/whole_set/o3_hour_overlap_window_mimic.csv.


In [25]:
try:
    logging.info("Starting the export of eICU DataFrame to a CSV file.")
    
    # Define the output path
    output_path = (f"CSV/exports/whole_set/{subfolder}_eicu.csv")
    output_dir = os.path.dirname(output_path)
    
    # Ensure the directory exists
    if not os.path.exists(output_dir):
        logging.info(f"Directory {output_dir} does not exist. Creating it.")
        os.makedirs(output_dir, exist_ok=True)
    
    # Export the merged DataFrame to a CSV file
    eicu_temp.to_csv(output_path, index=False)
    logging.info(f"DataFrame successfully exported to {output_path}.")
except Exception as e:
    logging.error(f"An error occurred during the export of the DataFrame: {e}")

2025-03-22 23:15:32,763 - INFO - Starting the export of eICU DataFrame to a CSV file.
2025-03-22 23:15:50,251 - INFO - DataFrame successfully exported to CSV/exports/whole_set/o3_hour_overlap_window_eicu.csv.


In [29]:
eicu_temp.shape

(86672, 317)

In [28]:
df_mimic_unique.shape

(58144, 317)