In [1]:
import pandas as pd
import json
import os

In [2]:
def detailed_infection_analysis(subject_id, df_microbiologyevents):
    micro_data = df_microbiologyevents[df_microbiologyevents['subject_id'] == subject_id]
    micro_data['chartdate'] = pd.to_datetime(micro_data['chartdate'])
    return micro_data.to_dict(orient='records')

def medication_administration_details(subject_id, df_emar, df_emar_detail):
    emar_data = df_emar[df_emar['subject_id'] == subject_id]
    emar_detail_data = df_emar_detail[df_emar_detail['subject_id'] == subject_id]
    emar_data['charttime'] = pd.to_datetime(emar_data['charttime'])
    emar_detail_data['dose_given'] = pd.to_numeric(emar_detail_data['dose_given'], errors='coerce')
    emar_detail_data = emar_detail_data.dropna(subset=['dose_given'])
    combined_data = pd.merge(emar_data, emar_detail_data, on=['subject_id', 'emar_id', 'emar_seq'])
    return combined_data.to_dict(orient='records')

def icu_input_output_events(subject_id, df_inputevents, df_outputevents, df_d_items):
    input_data = df_inputevents[df_inputevents['subject_id'] == subject_id]
    output_data = df_outputevents[df_outputevents['subject_id'] == subject_id]
    input_data['starttime'] = pd.to_datetime(input_data['starttime'])
    output_data['charttime'] = pd.to_datetime(output_data['charttime'])
    input_data['event'] = 'input'
    output_data['event'] = 'output'
    input_data = input_data.merge(df_d_items[['itemid', 'label', 'category']], on='itemid', how='left')
    output_data = output_data.merge(df_d_items[['itemid', 'label', 'category']], on='itemid', how='left')
    input_data = input_data.rename(columns={'starttime': 'charttime'})
    combined_data = pd.concat([input_data[['charttime', 'itemid', 'label', 'category', 'amount', 'event']],
                               output_data[['charttime', 'itemid', 'label', 'category', 'value', 'event']].rename(columns={'value': 'amount'})])
    return combined_data.to_dict(orient='records')

In [3]:
def split_read(path, chunksize=100000, condition="") :
    #chunksize = 100000  # 원하는 chunk 사이즈를 설정합니다.
    #condition = '특정열 == "조건"'  # 필터링할 조건을 설정합니다.
    filtered_df = pd.DataFrame() 
    for chunk in pd.read_csv(path, chunksize=chunksize):
        # 조건에 맞는 행들만 선택하여 필터링합니다.
        if condition == "":
            filtered_df = pd.concat([filtered_df, chunk], ignore_index=True)
        else :
            filtered_chunk = chunk.query(condition)
            filtered_df = pd.concat([filtered_df, filtered_chunk], ignore_index=True)
        # 필터링된 chunk를 처리합니다.
        # 예를 들어, 이 부분에 필터링된 chunk를 다른 데이터프레임에 추가하거나
        # 원하는 작업을 수행할 수 있습니다.
    return filtered_df

# 현재 노트북 파일의 디렉토리를 가져옵니다.
current_directory = os.getcwd()
print(current_directory)
relative_folder_path = "../physionet.org/files/mimiciv/2.2/"

base_df = split_read(os.path.join(current_directory, relative_folder_path, "icu/icustays.csv"))

condition = 'subject_id in ['
for i in list(base_df['subject_id'].unique()[:1000]) :
    condition += str(i)
    condition += ', '
condition += ']'
print(condition)

# 데이터 파일 로드
prescriptions_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/prescriptions.csv"))
microbiologyevents_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/microbiologyevents.csv"))
emar_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/emar.csv"))
emar_detail_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/emar_detail.csv"))
inputevents_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "icu/inputevents.csv"))
outputevents_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "icu/outputevents.csv"))
d_items_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "icu/d_items.csv"))
admissions_df = split_read(os.path.join(current_directory, relative_folder_path, "hosp/admissions.csv"), condition = condition)
patients_df = split_read(os.path.join(current_directory, relative_folder_path, "hosp/patients.csv"), condition = condition)

/home/bckim97/SNU/MedVis/emr_dashboard_jj
subject_id in [10000032, 10000980, 10001217, 10001725, 10001884, 10002013, 10002155, 10002348, 10002428, 10002430, 10002443, 10002495, 10002760, 10002930, 10003019, 10003046, 10003400, 10003502, 10004113, 10004235, 10004401, 10004422, 10004457, 10004606, 10004720, 10004733, 10004764, 10005123, 10005348, 10005606, 10005817, 10005866, 10005909, 10006053, 10006131, 10006277, 10006580, 10006821, 10007058, 10007795, 10007818, 10007920, 10007928, 10008077, 10008100, 10008287, 10008454, 10008924, 10009035, 10009049, 10009628, 10009686, 10010058, 10010471, 10010867, 10011189, 10011365, 10011398, 10011427, 10011668, 10011938, 10012055, 10012206, 10012292, 10012438, 10012476, 10012552, 10012853, 10013015, 10013049, 10013310, 10013419, 10013569, 10013643, 10014078, 10014136, 10014179, 10014354, 10014610, 10014729, 10015272, 10015834, 10015860, 10015931, 10016150, 10016742, 10016810, 10016859, 10017285, 10017308, 10017437, 10017492, 10017531, 10017679, 100

  prescriptions_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/prescriptions.csv"))
  microbiologyevents_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/microbiologyevents.csv"))
  emar_detail_df = pd.read_csv(os.path.join(current_directory, relative_folder_path, "hosp/emar_detail.csv"))


In [4]:
# 'starttime'을 datetime 형식으로 변환
prescriptions_df['datetime'] = pd.to_datetime(prescriptions_df['starttime'])
prescriptions_df['date'] = prescriptions_df['datetime'].dt.date
prescriptions_df['hour'] = prescriptions_df['datetime'].dt.hour

In [5]:
# JSON 객체 초기화
accum_json = {}

# 각 subject_id에 대해 정보를 추가
for subject_id in [int(i) for i in patients_df['subject_id'].unique()]:
    accum_json[subject_id] = {}

    # 환자 정보 추가
    patient_info = patients_df[patients_df['subject_id'] == subject_id].to_dict(orient='records')[0]
    accum_json[subject_id]['patient_info'] = patient_info

    # 입원 정보 추가
    admission_info = admissions_df[admissions_df['subject_id'] == subject_id].to_dict(orient='records')
    accum_json[subject_id]['admission_info'] = admission_info

    # 처방 정보 추가
    patient_data = prescriptions_df[prescriptions_df['subject_id'] == subject_id]
    grouped_data = patient_data.groupby(['date', 'hour']).apply(
        lambda x: pd.Series({
            'date': x['date'].iloc[0],  # 수정된 부분: date 열 추가
            'hour': x['hour'].iloc[0],  # 수정된 부분: hour 열 추가
            'formulary_drug_cd': x['formulary_drug_cd'].count(),
            'drug_info': ', '.join((x['drug'].fillna('') + " via " + x['route'].fillna('')).astype(str))
        })
    ).reset_index(drop=True)
    accum_json[subject_id]['prescriptions'] = grouped_data.to_dict(orient='records')

    # 감염 분석 추가
    accum_json[subject_id]['detailed_infection_analysis'] = detailed_infection_analysis(subject_id, microbiologyevents_df)

    # 약물 투여 기록 추가
    accum_json[subject_id]['medication_administration_details'] = medication_administration_details(subject_id, emar_df, emar_detail_df)

    # ICU 투입 및 배출 이벤트 추가
    accum_json[subject_id]['icu_input_output_events'] = icu_input_output_events(subject_id, inputevents_df, outputevents_df, d_items_df)

  grouped_data = patient_data.groupby(['date', 'hour']).apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  micro_data['chartdate'] = pd.to_datetime(micro_data['chartdate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emar_data['charttime'] = pd.to_datetime(emar_data['charttime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emar_detail_

In [6]:
# JSON 파일을 불러와서 사전으로 변환
with open('./data/accumulated_data.json', 'w') as json_file:
    json.dump(accum_json, json_file, indent=4, default=str)
