# Merge the diagnosis and patient tables and keep one row per unique patient

In [1]:
import gc
import gzip
import pandas as pd

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Diagnosis file.
# Raw string: the backslashes are Windows path separators, and sequences such
# as "\E" and "\o" are invalid escapes in a normal string literal (they warn
# today and are scheduled to become errors).
diagnosis_path = r'CSV\Exports\o01_eicu_diagnosis.csv'

# Gzip-compressed patient table
compressed_file_path = r"..\Datasets\eicu-2_0\patient.csv.gz"

# Load the patient table, decompressing on the fly
df = pd.read_csv(compressed_file_path, compression='gzip')

# Load the diagnosis CSV (the specific ICD codes, per the original note)
diagnosis_df = pd.read_csv(diagnosis_path)

In [2]:
# Patient-table columns that describe the ICU stay, in the order they should
# appear in the reduced frame
icu_columns = [
    'uniquepid',                  # unique patient identifier
    'unitvisitnumber',            # visit number of the patient
    'patienthealthsystemstayid',  # surrogate key of the hospital stay
    'patientunitstayid',          # surrogate key of the ICU stay
    'gender',                     # patient gender
    'age',                        # patient age in full years
    'ethnicity',                  # patient ethnicity
    'wardid',                     # surrogate key of the ward tied to the unit stay
    'apacheadmissiondx',          # full path string of the admission diagnosis for the unit stay
    'admissionheight',            # height at admission, in cm
    'unittype',                   # picklist unit type of the unit
    'unitadmittime24',            # unit admit time, 24-hour format
    'unitadmitsource',            # picklist location the patient was admitted from
    'admissionweight',            # weight at admission, in kilograms
    'dischargeweight',            # weight at unit discharge, in kilograms
    'unitdischargetime24',        # unit discharge time, 24-hour format
    'unitdischargeoffset',        # minutes from unit admit time to unit discharge
    'unitdischargelocation',      # structured list of locations the patient was discharged to
    'unitdischargestatus',        # patient's condition upon leaving the unit
]

# Reduced view of the patient table holding only the ICU-related columns.
# NOTE(review): the merge cell that follows joins on `df`, not on `temp_df`.
temp_df = df.loc[:, icu_columns]

In [3]:
# Inner-join the diagnosis rows with the patient table on the ICU stay key,
# so only stays present in both frames survive
merged_df = diagnosis_df.merge(df, how='inner', on='patientunitstayid')

# Pull 'uniquepid' out of its current position and re-insert it as column 0
merged_df.insert(0, 'uniquepid', merged_df.pop('uniquepid'))

# Order the rows by patient, then by hospitaladmitoffset (ascending)
sort_keys = ['uniquepid', 'hospitaladmitoffset']
sorted_table = merged_df.sort_values(by=sort_keys)

In [4]:
# Number each patient's rows 0, 1, 2, ... in their current (sorted) order and
# store that position in a new column
rows_by_patient = sorted_table.groupby('uniquepid')
sorted_table['order_of_appearance'] = rows_by_patient.cumcount()

In [5]:
# For every patient, find the index label of the row whose
# 'order_of_appearance' is largest, i.e. the last row in the sorted order
order_by_patient = sorted_table.groupby('uniquepid')['order_of_appearance']
idx = order_by_patient.idxmax()

# Keep just those rows, then renumber them with a fresh 0..n-1 index
last_rows = sorted_table.loc[idx]
result = last_rows.reset_index(drop=True)

In [6]:
# Export the filtered DataFrame to a CSV file (header row, no index column).
# Raw string: the backslashes are Windows path separators, and "\E" / "\o" are
# invalid escape sequences in a normal string literal.
result.to_csv(r'CSV\Exports\o02_eicu_unique_admissions.csv', index=False, header=True)

# Free RAM
gc.collect()

0