## Adding Demographics

In [1]:
import gzip
import pandas as pd
import gc

In [2]:
# Load the exported unique-admissions CSV; the demographic columns are
# taken from this table
admissions_df = pd.read_csv(r'CSV\Exports\o02_eicu_unique_admissions.csv')

# Load the four temporary aggregate tables (mean, median, min, max),
# in that order
aggregate_paths = (
    r'CSV\Exports\datasets\Temp\o01_mean_table.csv',
    r'CSV\Exports\datasets\Temp\o02_median_table.csv',
    r'CSV\Exports\datasets\Temp\o03_min_table.csv',
    r'CSV\Exports\datasets\Temp\o04_max_table.csv',
)
mean_df, median_df, min_df, max_df = [
    pd.read_csv(csv_path) for csv_path in aggregate_paths
]

In [5]:
# Columns carried over from the admissions table, grouped by role:
# the two keys used for merging, the demographic attributes, and the
# unit-discharge fields
key_columns = ['uniquepid', 'patientunitstayid']
patient_columns = ['gender', 'age', 'ethnicity']
discharge_columns = ['unitdischargestatus', 'unitdischargeoffset']
columns_to_keep = key_columns + patient_columns + discharge_columns

# Narrow the admissions table down to just those columns
demographics_df = admissions_df[columns_to_keep]

# Mean & Demographics

In [6]:
# Attach the demographic columns to the mean table; rows are matched on
# both 'patientunitstayid' and 'uniquepid'
temp_df = mean_df.merge(demographics_df, on=['patientunitstayid', 'uniquepid'])

# Identifier, 'Time_Zone' and demographic columns lead; every remaining
# column follows in the order the merge produced it
leading_cols = ['uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity']
new_order = leading_cols + [col for col in temp_df.columns if col not in leading_cols]

# Apply the column order, sort the rows by 'patientunitstayid' and
# 'Time_Zone', then renumber the index so it follows the sorted order
temp_df = (
    temp_df[new_order]
    .sort_values(by=['patientunitstayid', 'Time_Zone'])
    .reset_index(drop=True)
)

# 1-based running row number as the first column; it is used as a split
# point by the filter operator when the dataset is loaded in RapidMiner
temp_df.insert(0, 'row_count', range(1, len(temp_df) + 1))

# Write the final mean dataset to CSV without the index column
temp_df.to_csv(r'CSV\Exports\datasets\whole_set\o01_final_mean_table.csv', index=False)

# Free RAM: drop the reference to the merged frame and run a collection
temp_df = None
gc.collect()

0

# Median & Demographics

In [7]:
# Attach the demographic columns to the median table; rows are matched on
# both 'patientunitstayid' and 'uniquepid'
temp_df = median_df.merge(demographics_df, on=['patientunitstayid', 'uniquepid'])

# Identifier, 'Time_Zone' and demographic columns lead; every remaining
# column follows in the order the merge produced it
leading_cols = ['uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity']
new_order = leading_cols + [col for col in temp_df.columns if col not in leading_cols]

# Apply the column order, sort the rows by 'patientunitstayid' and
# 'Time_Zone', then renumber the index so it follows the sorted order
temp_df = (
    temp_df[new_order]
    .sort_values(by=['patientunitstayid', 'Time_Zone'])
    .reset_index(drop=True)
)

# 1-based running row number as the first column; it is used as a split
# point by the filter operator when the dataset is loaded in RapidMiner
temp_df.insert(0, 'row_count', range(1, len(temp_df) + 1))

# Write the final median dataset to CSV without the index column
temp_df.to_csv(r'CSV\Exports\datasets\whole_set\o02_final_median_table.csv', index=False)

# Free RAM: drop the reference to the merged frame and run a collection
temp_df = None
gc.collect()

0

# Min & Demographics

In [8]:
# Attach the demographic columns to the min table; rows are matched on
# both 'patientunitstayid' and 'uniquepid'
temp_df = min_df.merge(demographics_df, on=['patientunitstayid', 'uniquepid'])

# Identifier, 'Time_Zone' and demographic columns lead; every remaining
# column follows in the order the merge produced it
leading_cols = ['uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity']
new_order = leading_cols + [col for col in temp_df.columns if col not in leading_cols]

# Apply the column order, sort the rows by 'patientunitstayid' and
# 'Time_Zone', then renumber the index so it follows the sorted order
temp_df = (
    temp_df[new_order]
    .sort_values(by=['patientunitstayid', 'Time_Zone'])
    .reset_index(drop=True)
)

# 1-based running row number as the first column; it is used as a split
# point by the filter operator when the dataset is loaded in RapidMiner
temp_df.insert(0, 'row_count', range(1, len(temp_df) + 1))

# Write the final min dataset to CSV without the index column
temp_df.to_csv(r'CSV\Exports\datasets\whole_set\o03_final_min_table.csv', index=False)

# Free RAM: drop the reference to the merged frame and run a collection
temp_df = None
gc.collect()

0

# Max & Demographics

In [9]:
# Attach the demographic columns to the max table; rows are matched on
# both 'patientunitstayid' and 'uniquepid'
temp_df = max_df.merge(demographics_df, on=['patientunitstayid', 'uniquepid'])

# Identifier, 'Time_Zone' and demographic columns lead; every remaining
# column follows in the order the merge produced it
leading_cols = ['uniquepid', 'patientunitstayid', 'Time_Zone', 'gender', 'age', 'ethnicity']
new_order = leading_cols + [col for col in temp_df.columns if col not in leading_cols]

# Apply the column order, sort the rows by 'patientunitstayid' and
# 'Time_Zone', then renumber the index so it follows the sorted order
temp_df = (
    temp_df[new_order]
    .sort_values(by=['patientunitstayid', 'Time_Zone'])
    .reset_index(drop=True)
)

# 1-based running row number as the first column; it is used as a split
# point by the filter operator when the dataset is loaded in RapidMiner
temp_df.insert(0, 'row_count', range(1, len(temp_df) + 1))

# Write the final max dataset to CSV without the index column
temp_df.to_csv(r'CSV\Exports\datasets\whole_set\o04_final_max_table.csv', index=False)

# Free RAM: drop the reference to the merged frame and run a collection
temp_df = None
gc.collect()

0