In [2]:
import csv
import pandas as pd
import numpy as np

In [3]:
# Read MIMICs CSV file
mimic_mean_df = pd.read_csv('..\\01_MimicIV\\CSV\\Exports\\datasets\\whole_set\\o01_final_mean_with_los.csv', low_memory=False)
mimic_median_df = pd.read_csv('..\\01_MimicIV\\CSV\\Exports\\datasets\\whole_set\\o02_final_median_with_los.csv', low_memory=False)
mimic_min_df = pd.read_csv('..\\01_MimicIV\\CSV\\Exports\\datasets\\whole_set\\o03_final_min_with_los.csv', low_memory=False)
mimic_max_df = pd.read_csv('..\\01_MimicIV\\CSV\\Exports\\datasets\\whole_set\\o04_final_max_with_los.csv', low_memory=False)

# Read eICUs CSV file
eicu_meam_df = pd.read_csv('..\\02_eICU\\CSV\\Exports\\datasets\\whole_set\\o01_final_mean_table.csv', low_memory=False)
eicu_median_df = pd.read_csv('..\\02_eICU\\CSV\\Exports\\datasets\\whole_set\\o02_final_median_table.csv', low_memory=False)
eicu_min_df = pd.read_csv('..\\02_eICU\\CSV\\Exports\\datasets\\whole_set\\o03_final_min_table.csv', low_memory=False)
eicu_max_df = pd.read_csv('..\\02_eICU\\CSV\\Exports\\datasets\\whole_set\\o04_final_max_table.csv', low_memory=False)

In [82]:
# columns to keep
mimic_columns_to_keep = pd.read_csv('CSV\\imports\\mimic_features.csv')
eicu_columns_to_keep = pd.read_csv("CSV\\imports\\eicu_features.csv")

# MIMIC

In [4]:
# Merge mimics dataframes
merged_mimic_df = mimic_mean_df.merge(mimic_median_df, on=['row_count','subject_id', 'hadm_id', 'Time_Zone', 'gender', 'age', 'language', 'marital_status', 'race', 'hospital_expire_flag', 'los'], suffixes=('_mean', '_median'))
merged_mimic_df = merged_mimic_df.merge(mimic_min_df, on=['row_count','subject_id', 'hadm_id', 'Time_Zone', 'gender', 'age', 'language', 'marital_status', 'race', 'hospital_expire_flag', 'los'], suffixes=('', '_min'))
merged_mimic_df = merged_mimic_df.merge(mimic_max_df, on=['row_count','subject_id', 'hadm_id', 'Time_Zone', 'gender', 'age', 'language', 'marital_status', 'race', 'hospital_expire_flag', 'los'], suffixes=('', '_max'))

# Rename columns to replace suffixes
merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Mean', ' (Mean)', regex=True)
merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Median', ' (Median)', regex=True)
merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Min', ' (Min)', regex=True)
merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'\s*-\s*Max', ' (Max)', regex=True)

# Move the 'hospital_expire_flag' and 'LOS' columns to the end of the dataframe
hospital_expire_flag_column = merged_mimic_df.pop('hospital_expire_flag')
los_column = merged_mimic_df.pop('los')
merged_mimic_df = pd.concat([merged_mimic_df, hospital_expire_flag_column, los_column], axis=1)

# Rename the last two columns to preserve their original names
merged_mimic_df.columns = list(merged_mimic_df.columns[:-2]) + ['hospital_expire_flag', 'los']

In [6]:
# Summarize GCS components for Mean, Median, Min, and Max while handling NaNs
merged_mimic_df['GCS (Mean)'] = merged_mimic_df.apply(lambda row: row[['GCS - Eye Opening (Mean)', 'GCS - Verbal Response (Mean)', 'GCS - Motor Response (Mean)']].sum() if not all(row[['GCS - Eye Opening (Mean)', 'GCS - Verbal Response (Mean)', 'GCS - Motor Response (Mean)']].isna()) else np.nan, axis=1)
merged_mimic_df['GCS (Median)'] = merged_mimic_df.apply(lambda row: row[['GCS - Eye Opening (Median)', 'GCS - Verbal Response (Median)', 'GCS - Motor Response (Median)']].sum() if not all(row[['GCS - Eye Opening (Median)', 'GCS - Verbal Response (Median)', 'GCS - Motor Response (Median)']].isna()) else np.nan, axis=1)
merged_mimic_df['GCS (Min)'] = merged_mimic_df.apply(lambda row: row[['GCS - Eye Opening (Min)', 'GCS - Verbal Response (Min)', 'GCS - Motor Response (Min)']].sum() if not all(row[['GCS - Eye Opening (Min)', 'GCS - Verbal Response (Min)', 'GCS - Motor Response (Min)']].isna()) else np.nan, axis=1)
merged_mimic_df['GCS (Max)'] = merged_mimic_df.apply(lambda row: row[['GCS - Eye Opening (Max)', 'GCS - Verbal Response (Max)', 'GCS - Motor Response (Max)']].sum() if not all(row[['GCS - Eye Opening (Max)', 'GCS - Verbal Response (Max)', 'GCS - Motor Response (Max)']].isna()) else np.nan, axis=1)

# Drop the original GCS component columns
merged_mimic_df.drop(columns=[
    'GCS - Eye Opening (Mean)', 'GCS - Verbal Response (Mean)', 'GCS - Motor Response (Mean)',
    'GCS - Eye Opening (Median)', 'GCS - Verbal Response (Median)', 'GCS - Motor Response (Median)',
    'GCS - Eye Opening (Min)', 'GCS - Verbal Response (Min)', 'GCS - Motor Response (Min)',
    'GCS - Eye Opening (Max)', 'GCS - Verbal Response (Max)', 'GCS - Motor Response (Max)'
], inplace=True)

In [7]:
# Summarize Braden components for Mean, Median, Min, and Max while handling NaNs
merged_mimic_df['Braden (Mean)'] = merged_mimic_df.apply(lambda row: row[['Braden Sensory Perception (Mean)', 'Braden Moisture (Mean)', 'Braden Activity (Mean)', 'Braden Mobility (Mean)', 'Braden Nutrition (Mean)', 'Braden Friction/Shear (Mean)']].sum() if not all(row[['Braden Sensory Perception (Mean)', 'Braden Moisture (Mean)', 'Braden Activity (Mean)', 'Braden Mobility (Mean)', 'Braden Nutrition (Mean)', 'Braden Friction/Shear (Mean)']].isna()) else np.nan, axis=1)
merged_mimic_df['Braden (Median)'] = merged_mimic_df.apply(lambda row: row[['Braden Sensory Perception (Median)', 'Braden Moisture (Median)', 'Braden Activity (Median)', 'Braden Mobility (Median)', 'Braden Nutrition (Median)', 'Braden Friction/Shear (Median)']].sum() if not all(row[['Braden Sensory Perception (Median)', 'Braden Moisture (Median)', 'Braden Activity (Median)', 'Braden Mobility (Median)', 'Braden Nutrition (Median)', 'Braden Friction/Shear (Median)']].isna()) else np.nan, axis=1)
merged_mimic_df['Braden (Min)'] = merged_mimic_df.apply(lambda row: row[['Braden Sensory Perception (Min)', 'Braden Moisture (Min)', 'Braden Activity (Min)', 'Braden Mobility (Min)', 'Braden Nutrition (Min)', 'Braden Friction/Shear (Min)']].sum() if not all(row[['Braden Sensory Perception (Min)', 'Braden Moisture (Min)', 'Braden Activity (Min)', 'Braden Mobility (Min)', 'Braden Nutrition (Min)', 'Braden Friction/Shear (Min)']].isna()) else np.nan, axis=1)
merged_mimic_df['Braden (Max)'] = merged_mimic_df.apply(lambda row: row[['Braden Sensory Perception (Max)', 'Braden Moisture (Max)', 'Braden Activity (Max)', 'Braden Mobility (Max)', 'Braden Nutrition (Max)', 'Braden Friction/Shear (Max)']].sum() if not all(row[['Braden Sensory Perception (Max)', 'Braden Moisture (Max)', 'Braden Activity (Max)', 'Braden Mobility (Max)', 'Braden Nutrition (Max)', 'Braden Friction/Shear (Max)']].isna()) else np.nan, axis=1)

# Drop the original Braden component columns
merged_mimic_df.drop(columns=[
    'Braden Sensory Perception (Mean)', 'Braden Moisture (Mean)', 'Braden Activity (Mean)', 'Braden Mobility (Mean)', 'Braden Nutrition (Mean)', 'Braden Friction/Shear (Mean)',
    'Braden Sensory Perception (Median)', 'Braden Moisture (Median)', 'Braden Activity (Median)', 'Braden Mobility (Median)', 'Braden Nutrition (Median)', 'Braden Friction/Shear (Median)',
    'Braden Sensory Perception (Min)', 'Braden Moisture (Min)', 'Braden Activity (Min)', 'Braden Mobility (Min)', 'Braden Nutrition (Min)', 'Braden Friction/Shear (Min)',
    'Braden Sensory Perception (Max)', 'Braden Moisture (Max)', 'Braden Activity (Max)', 'Braden Mobility (Max)', 'Braden Nutrition (Max)', 'Braden Friction/Shear (Max)'
], inplace=True)

In [9]:
# Remove spaces and commas
merged_mimic_df.columns = merged_mimic_df.columns.str.replace(r'[ ,]+', '_', regex=True)

In [11]:
# Drop second column from the column_names_df
mimic_columns_to_keep.drop(columns=['Unnamed: 1'], inplace=True)

# Extract column names from columns_to_keep DataFrame
columns_to_keep_names = mimic_columns_to_keep['column'].tolist()

# Select only the desired columns
mimic_temp = merged_mimic_df[columns_to_keep_names]

In [12]:
# Remove Duplicate Columns
df_mimic_unique = mimic_temp.loc[:, ~mimic_temp.columns.duplicated()]

In [28]:
# Glucose merge
df_mimic_unique.loc[:, 'Glucose (Max)'] = df_mimic_unique.apply(lambda row: row[['Glucose_(Max)', 'Glucose_(Max).1', 'Glucose_(Max).2']].mean() if not all(row[['Glucose_(Max)', 'Glucose_(Max).1', 'Glucose_(Max).2']].isna()) else np.nan, axis=1)
df_mimic_unique.loc[:, 'Glucose (Mean)'] = df_mimic_unique.apply(lambda row: row[['Glucose_(Mean)', 'Glucose_(Mean).1', 'Glucose_(Mean).2']].mean() if not all(row[['Glucose_(Mean)', 'Glucose_(Mean).1', 'Glucose_(Mean).2']].isna()) else np.nan, axis=1)
df_mimic_unique.loc[:, 'Glucose (Median)'] = df_mimic_unique.apply(lambda row: row[['Glucose_(Median)', 'Glucose_(Median).1', 'Glucose_(Median).2']].mean() if not all(row[['Glucose_(Median)', 'Glucose_(Median).1', 'Glucose_(Median).2']].isna()) else np.nan, axis=1)
df_mimic_unique.loc[:, 'Glucose (Min)'] = df_mimic_unique.apply(lambda row: row[['Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2']].mean() if not all(row[['Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2']].isna()) else np.nan, axis=1)

# Drop original Glucose columns to keep only the summarized columns
df_mimic_unique.drop(columns=[
    'Glucose_(Max)', 'Glucose_(Max).1', 'Glucose_(Max).2',
    'Glucose_(Mean)', 'Glucose_(Mean).1', 'Glucose_(Mean).2',
    'Glucose_(Median)', 'Glucose_(Median).1', 'Glucose_(Median).2',
    'Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2'
], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique.loc[:, 'Glucose (Min)'] = df_unique.apply(lambda row: row[['Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2']].mean() if not all(row[['Glucose_(Min)', 'Glucose_(Min).1', 'Glucose_(Min).2']].isna()) else np.nan, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique.drop(columns=[


In [32]:
# pH merge
df_mimic_unique.loc[:, 'pH (Max)'] = df_mimic_unique.apply(lambda row: row[['pH_(Max)', 'pH_(Max).1', 'pH_(Max).3']].mean() if not all(row[['pH_(Max)', 'pH_(Max).1', 'pH_(Max).3']].isna()) else np.nan, axis=1)
df_mimic_unique.loc[:, 'pH (Mean)'] = df_mimic_unique.apply(lambda row: row[['pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3']].mean() if not all(row[['pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3']].isna()) else np.nan, axis=1)
df_mimic_unique.loc[:, 'pH (Median)'] = df_mimic_unique.apply(lambda row: row[['pH_(Median)', 'pH_(Median).1', 'pH_(Median).3']].mean() if not all(row[['pH_(Median)', 'pH_(Median).1', 'pH_(Median).3']].isna()) else np.nan, axis=1)
df_mimic_unique.loc[:, 'pH (Min)'] = df_mimic_unique.apply(lambda row: row[['pH_(Min)', 'pH_(Min).1', 'pH_(Min).3']].mean() if not all(row[['pH_(Min)', 'pH_(Min).1', 'pH_(Min).3']].isna()) else np.nan, axis=1)

# Drop original pH columns to keep only the summarized columns
df_mimic_unique.drop(columns=[
    'pH_(Max)', 'pH_(Max).1', 'pH_(Max).3',
    'pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3',
    'pH_(Median)', 'pH_(Median).1', 'pH_(Median).3',
    'pH_(Min)', 'pH_(Min).1', 'pH_(Min).3'
], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique.loc[:, 'pH (Max)'] = df_unique.apply(lambda row: row[['pH_(Max)', 'pH_(Max).1', 'pH_(Max).3']].mean() if not all(row[['pH_(Max)', 'pH_(Max).1', 'pH_(Max).3']].isna()) else np.nan, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique.loc[:, 'pH (Mean)'] = df_unique.apply(lambda row: row[['pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3']].mean() if not all(row[['pH_(Mean)', 'pH_(Mean).1', 'pH_(Mean).2', 'pH_(Mean).3']].isna()) else np.nan, axis=1)
A value is trying to be set on a copy

# eICU

In [None]:
# Merge eICU dataframes
merged_eicu_df = eicu_meam_df.merge(eicu_median_df, on=['row_count', 'uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity', 'unitdischargestatus', 'LOS'], suffixes=('_mean', '_median'))
merged_eicu_df = merged_eicu_df.merge(eicu_min_df, on=['row_count', 'uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity', 'unitdischargestatus', 'LOS'], suffixes=('', '_min'))
merged_eicu_df = merged_eicu_df.merge(eicu_max_df, on=['row_count', 'uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity', 'unitdischargestatus', 'LOS'], suffixes=('', '_max'))

# Move the 'unitdischargestatus' and 'LOS' columns to the end of the dataframe
unitdischargestatus_column = merged_eicu_df.pop('unitdischargestatus')
los_column = merged_eicu_df.pop('LOS')
merged_eicu_df = pd.concat([merged_eicu_df, unitdischargestatus_column, los_column], axis=1)

# Rename the last two columns to preserve their original names
merged_eicu_df.columns = list(merged_eicu_df.columns[:-2]) + ['unitdischargestatus', 'LOS']

In [84]:
# Drop second column from the column_names_df
#eicu_columns_to_keep.drop(columns=['Unnamed: 1'], inplace=True)

# Extract column names from columns_to_keep DataFrame
columns_to_keep_names = eicu_columns_to_keep['column'].tolist()

# Select only the desired columns
eicu_temp = merged_eicu_df[columns_to_keep_names]

In [93]:
display(df_unique)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,race,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),...,hospital_expire_flag,los,Glucose (Max),Glucose (Mean),Glucose (Median),Glucose (Min),pH (Max),pH (Mean),pH (Median),pH (Min)
0,1,10004733,27411876,1,M,51,UNKNOWN,46.0,46.0,46.0,...,Survive,8.357373,86.0,86.0,86.0,86.0,6.715,6.715,6.715,6.715
1,2,10004733,27411876,2,M,51,UNKNOWN,46.0,46.0,46.0,...,Survive,8.357373,86.0,86.0,86.0,86.0,6.715,6.715,6.715,6.715
2,3,10004733,27411876,3,M,51,UNKNOWN,46.0,46.0,46.0,...,Survive,8.357373,86.0,86.0,86.0,86.0,6.715,6.715,6.715,6.715
3,4,10004733,27411876,4,M,51,UNKNOWN,46.0,46.0,46.0,...,Survive,8.357373,94.0,90.0,90.0,86.0,6.715,6.715,6.715,6.715
4,5,10004733,27411876,5,M,51,UNKNOWN,46.0,46.0,46.0,...,Survive,8.357373,94.0,90.0,90.0,86.0,6.715,6.715,6.715,6.715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58139,58140,19999987,23865745,12,F,57,UNKNOWN,63.0,63.0,63.0,...,Survive,1.937847,113.0,113.0,113.0,113.0,6.445,6.445,6.445,6.445
58140,58141,19999987,23865745,13,F,57,UNKNOWN,63.0,63.0,63.0,...,Survive,1.937847,113.0,113.0,113.0,113.0,6.445,6.445,6.445,6.445
58141,58142,19999987,23865745,14,F,57,UNKNOWN,63.0,63.0,63.0,...,Survive,1.937847,113.0,113.0,113.0,113.0,6.445,6.445,6.445,6.445
58142,58143,19999987,23865745,15,F,57,UNKNOWN,63.0,63.0,63.0,...,Survive,1.937847,113.0,113.0,113.0,113.0,6.445,6.445,6.445,6.445


In [88]:

column_names = merged_eicu_df.columns

# Create a dataframe from the column names
columns_df = pd.DataFrame(column_names, columns=['Column_Name'])

# Export to CSV
columns_df.to_csv('column_names.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'column_names.csv'

In [69]:
braden_columns = [col for col in merged_eicu_df.columns if col.startswith('free T4')]
braden_df = merged_eicu_df[braden_columns]

# Count the missing values for each GCS column
missing_values_count = braden_df.isnull().sum()

In [71]:
display(braden_df)

Unnamed: 0,free T4 (Mean),free T4 (Median),free T4 (Min),free T4 (Max)
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
86267,,,,
86268,,,,
86269,,,,
86270,,,,


# I stop here

In [None]:
# Mimic -> columns to keep
# Read the CSV file to get the columns to keep
columns_to_keep = pd.read_csv('CSV\\imports\\mimic_features.csv')

# Drop second column from the column_names_df
columns_to_keep.drop(columns=['Unnamed: 1'], inplace=True)

# Extract column names from columns_to_keep DataFrame
columns_to_keep_names = columns_to_keep['column'].tolist()

# Select only the desired columns
mimic_temp = mimic_df[columns_to_keep_names]

"""------------------------------------------------------------------------------"""
# eICU -> columns to keep
# Read the CSV file to get the columns to keep
columns_to_keep = pd.read_csv("CSV\\imports\\eicu_features.csv")

# Drop second column from the column_names_df
columns_to_keep.drop(columns=['Unnamed: 1'], inplace=True)

# Extract column names from columns_to_keep DataFrame
columns_to_keep_names = columns_to_keep['column'].tolist()

# Select only the desired columns
eicu_temp = eicu_df[columns_to_keep_names]

In [None]:
"""--------Replace Block----------"""

# Replace 'Alive' with 0 and 'Expired' with 1 in the 'unitdischargestatus' column
eicu_temp.loc[:, 'unitdischargestatus'] = eicu_temp['unitdischargestatus'].replace({'Alive': 0, 'Expired': 1})

# Replace 'Female' with 'F' and 'Male' with 'M' in the 'gender' column
eicu_temp.loc[:, 'gender'] = eicu_temp['gender'].replace({'Female': 'F', 'Male': 'M'})

# Multiply values by 4 in 'Ionized Calcium' column, leaving NaN values unchanged
mimic_temp.loc[:, 'Ionized Calcium'] = mimic_temp['Ionized Calcium'].apply(lambda x: x * 4 if pd.notna(x) else x)

# Replace values in the 'ethnicity' column for standardization
eicu_temp.loc[:, 'ethnicity'] = eicu_temp['ethnicity'].replace({
    'African American': 'BLACK/AFRICAN AMERICAN',
    'Caucasian': 'WHITE',
    'Hispanic': 'HISPANIC OR LATINO',
    'Asian': 'ASIAN',
    'Native American': 'AMERICAN INDIAN/ALASKA NATIVE',
    'Other/Unknown': 'UNKNOWN'
})

# Replace age values higher than 89 with 90, and convert age to integer
eicu_temp.loc[:, 'age'] = eicu_temp['age'].replace('> 89', 90)
eicu_temp.loc[:, 'age'] = eicu_temp['age'].astype(np.int64)


In [None]:
column_eicu_mapping = {
    'column': 'column',
    'row_count': 'row_count',
    'uniquepid': 'subject_id',
    'patientunitstayid': 'hadm_id',
    'Time_Zone': 'Time_Zone',
    'gender': 'gender',
    'age': 'age',
    'ethnicity': 'race',
    'Base Excess': 'Base Excess',
    'lactate': 'Lactate',
    'paCO2': 'pCO2',
    'Total CO2': 'Calculated Total CO2',
    'BUN': 'BUN',
    'pH': 'pH',
    'paO2': 'pO2',
    'ALT (SGPT)': 'Alanine Aminotransferase (ALT)',
    'alkaline phos.': 'Alkaline Phosphatase',
    'anion gap': 'Anion Gap',
    'AST (SGOT)': 'Asparate Aminotransferase (AST)',
    'bicarbonate': 'Bicarbonate',
    'chloride': 'Chloride',
    'creatinine': 'Creatinine',
    'glucose': 'Glucose',
    'magnesium': 'Magnesium',
    'phosphate': 'Phosphate',
    'potassium': 'Potassium',
    'sodium': 'Sodium',
    'Hct': 'Hematocrit',
    'Hgb': 'Hemoglobin',
    'PT - INR': 'INR(PT)',
    'MCH': 'MCH',
    'MCHC': 'MCHC',
    'MCV': 'MCV',
    'platelets x 1000': 'Platelet Count',
    'PT': 'PT',
    'PTT': 'PTT',
    'RDW': 'RDW',
    'RBC': 'Red Blood Cells',
    'WBC x 1000': 'White Blood Cells',
    'Heart Rate': 'Heart Rate (bpm)',
    'Non-Invasive BP Diastolic': 'Non Invasive Blood Pressure systolic (mmHg)',
    'Non-Invasive BP Systolic': 'Non Invasive Blood Pressure diastolic (mmHg)',
    'Non-Invasive BP Mean': 'Non Invasive Blood Pressure mean (mmHg)',
    'Respiratory Rate': 'Respiratory Rate (insp/min)',
    'O2 Saturation': 'O2 saturation pulseoxymetry (%)',
    'CI': 'Chloride (serum)',
    'calcium': 'Calcium non-ionized',
    'CPK': 'CK (CPK)',
    'Temperature (F)': 'Temperature Fahrenheit (F)',
    'Pain Score': 'Pain Level',
    'LPM O2': 'O2 Flow (L/min)',
    'O2 L/%': 'Inspired O2 Fraction',
    'ionized calcium': 'Ionized Calcium',
    'albumin': 'Albumin',
    'GCS Total': 'GCS',
    'total bilirubin': 'Total Bilirubin',
    'LDH': 'LDH',
    'ethanol': 'ETOH',
    'Invasive BP Systolic': 'Arterial Blood Pressure systolic (mmHg)',
    'Invasive BP Diastolic': 'Arterial Blood Pressure diastolic (mmHg)',
    'Invasive BP Mean': 'Arterial Blood Pressure mean (mmHg)',
    'serum osmolality':	'Serum Osmolality',
    'troponin - I':	'Troponin-T',
    'uric acid': 'Uric Acid',
    'ammonia': 'Ammonia',
    'CRP': 'C Reactive Protein (CRP)',
    'fibrinogen': 'Fibrinogen',
    'PA Systolic': 'Pulmonary Artery Pressure systolic (mmHg)',
    'PA Diastolic': 'Pulmonary Artery Pressure diastolic (mmHg)',
    'PA Mean': 'Pulmonary Artery Pressure mean (mmHg)',
    'Bedside Glucose': 'Glucose finger stick (range 70-100)',
    'reticulocyte count': 'Reticulocyte Count Automated',
    '-basos': 'Differential-Basos',
    '-eos': 'Differential-Eos',
    '-lymphs': 'Differential-Lymphs',
    '-monos': 'Differential-Monos',
    '-polys': 'Differential-Neuts',
    'haptoglobin': 'Haptoglobin',
    'direct bilirubin': 'Bilirubin Direct',
    'free T4': 'Thyroxine (T4) Free',
    'ESR': 'Sedimentation Rate',
    'CPK-MB INDEX': 'CK-MB',
    'amylase': 'Amylase',
    'PEEP': 'PEEP set (cmH2O)',
    'CVP': 'Central Venous Pressure (mmHg)',
    'unitdischargestatus': 'hospital_expire_flag',
    'LOS': 'los'
}

# Replace the DataFrame and column names mapping
eicu_temp.rename(columns=column_eicu_mapping, inplace=True)



column_mimic_mapping = {
    'Temperature Fahrenheit (°F)': 'Temperature Fahrenheit (F)',
}

# Replace the DataFrame and column names mapping
mimic_temp.rename(columns=column_mimic_mapping, inplace=True)

# Remove "-" from the 'subject_id' column in eicu
eicu_temp['subject_id'] = eicu_temp['subject_id'].str.replace('-', '')

# Convert 'subject_id' in eicu to int64
eicu_temp['subject_id'] = eicu_temp['subject_id'].astype(np.int64)

In [None]:
# Remove duplicate columns
mimic_temp = mimic_temp.loc[:, ~mimic_temp.columns.duplicated()]

# Remove duplicate columns
eicu_temp = eicu_temp.loc[:, ~eicu_temp.columns.duplicated()]

In [None]:
display(mimic_temp)

display(eicu_temp)

In [None]:
# Replace 'Survive' with 0 and 'Death' with 1 in the 'hospital_expire_flag' column
mimic_temp['hospital_expire_flag'] = mimic_temp['hospital_expire_flag'].replace({'Survive': 0, 'Death': 1})

In [None]:
# Check if mimic and eicu datasets have the same dtype and header names

# Get the column names and dtypes of mimic_df
mimic_info = mimic_temp.dtypes

# Get the column names and dtypes of eicu_df
eicu_info = eicu_temp.dtypes

# Check if the column names are the same
if all(mimic_info.index == eicu_info.index):
    print("The column names are the same.")
else:
    print("The column names are different.")

print("\n")

# Check if the number of columns is the same
if len(mimic_info) != len(eicu_info):
    print("Number of columns is different between mimic_df and eicu_df.")
else:
    # Iterate over the columns and compare the data type.
    for column_name in mimic_info.index:
        mimic_dtype = mimic_info[column_name]
        eicu_dtype = eicu_info[column_name]
        if mimic_dtype != eicu_dtype:
            print(f"Column '{column_name}' has different data types: mimic_df has '{mimic_dtype}' and eicu_df has '{eicu_dtype}'.")

In [None]:
# Export the merged DataFrame to a CSV file
mimic_temp.to_csv('CSV\\exports\\final\\mimic_mean_final.csv', index=False)

In [None]:
# Export the merged DataFrame to a CSV file
eicu_temp.to_csv('CSV\\exports\\final\\eicu_mean_final.csv', index=False)