# Filter admissions with specific ICD codes

In [1]:
import gc
import gzip
import pandas as pd

# Show every column when a DataFrame is rendered
pd.set_option('display.max_columns', None)

# CAUTION: path to the CSV file holding the ICD codes to keep (read without a
# header row). A raw string keeps the Windows backslashes literal instead of
# relying on unrecognised escape sequences such as '\I'.
icd = r'CSV\Imports\icd_stroke.csv'

compressed_file_path = r"..\Datasets\eicu-2_0\diagnosis.csv.gz"

# ICD codes are identifiers, not numbers: force text so that values such as
# '414.00' can never be parsed as floats and lose their trailing zeros.
df = pd.read_csv(compressed_file_path, compression='gzip', dtype={'icd9code': str})

# Read the ICD codes of interest as text for the same reason
specific_icd_numbers = pd.read_csv(icd, header=None, dtype=str)

In [2]:
# One row per ICD code: break the comma-separated icd9code string into a list,
# expand each list element onto its own row, then discard rows without a code.
df = (
    df.assign(icd9code=df['icd9code'].str.split(', '))
    .explode('icd9code')
    .dropna(subset=['icd9code'])
)

In [3]:
# Render the exploded frame; the output below is abbreviated to its first and last rows
display(df)

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,4222318,141168,False,72,cardiovascular|chest pain / ASHD|coronary arte...,414.00,Other
0,4222318,141168,False,72,cardiovascular|chest pain / ASHD|coronary arte...,I25.10,Other
2,4160941,141168,False,72,pulmonary|disorders of the airways|COPD,491.20,Other
2,4160941,141168,False,72,pulmonary|disorders of the airways|COPD,J44.9,Other
3,4103261,141168,True,118,pulmonary|disorders of the airways|COPD,491.20,Other
...,...,...,...,...,...,...,...
2710669,46259796,3353254,True,41,renal|disorder of kidney|acute renal failure|d...,N17.9,Major
2710670,46204273,3353254,True,41,gastrointestinal|GI bleeding / PUD|lower GI bl...,578.9,Primary
2710670,46204273,3353254,True,41,gastrointestinal|GI bleeding / PUD|lower GI bl...,K92.2,Primary
2710671,46335124,3353263,True,100,pulmonary|disorders of vasculature|pulmonary e...,415.19,Other


In [4]:
# Turn the code table into a plain list of clean string codes taken from its
# first column: missing entries are dropped, values are coerced to str and
# surrounding whitespace is removed so the later membership test compares
# like with like.
# The isinstance guard keeps this cell safe to re-run: after the first run the
# name is already bound to a list, which has no column 0 to select.
if isinstance(specific_icd_numbers, pd.DataFrame):
    specific_icd_numbers = (
        specific_icd_numbers[0]
        .dropna()
        .astype(str)
        .str.strip()
        .tolist()
    )

In [5]:
# Keep only diagnoses whose ICD code is on the requested list, then retain the
# first matching row for each patientunitstayid.
icd_match = df['icd9code'].isin(specific_icd_numbers)
filtered_df = df.loc[icd_match].drop_duplicates(subset='patientunitstayid', keep='first')

# Show the resulting selection
display(filtered_df)

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
307,4262229,141677,True,21,neurologic|disorders of vasculature|stroke|hem...,430,Other
947,4237805,142974,False,804,neurologic|disorders of vasculature|stroke|isc...,434.91,Primary
1036,4124998,143056,True,900,neurologic|disorders of vasculature|stroke|isc...,434.91,Major
1056,4138034,143057,False,57,neurologic|disorders of vasculature|stroke|isc...,434.91,Primary
1306,3368454,143448,True,25,neurologic|disorders of vasculature|stroke|isc...,434.91,Primary
...,...,...,...,...,...,...,...
2709683,46347417,3352817,False,104,neurologic|disorders of vasculature|stroke|isc...,434.91,Other
2709748,46215232,3352870,True,128,neurologic|disorders of vasculature|stroke|hem...,430,Primary
2710142,46119636,3353094,False,4338,neurologic|disorders of vasculature|stroke|isc...,434.91,Other
2710287,46347752,3353144,False,6847,neurologic|disorders of vasculature|stroke|isc...,434.91,Other


In [6]:
# Export the filtered DataFrame to a CSV file.
# The raw string keeps the backslashes of the Windows-style path literal;
# without it '\E' and '\o' are unrecognised escape sequences, which newer
# Python versions flag with a warning.
filtered_df.to_csv(r'CSV\Exports\o01_eicu_diagnosis.csv', index=False, header=True)

# Free RAM
gc.collect()

27