In [1]:
import pandas as pd
import numpy as np

  from pandas.core import (


In [2]:
import gzip
import shutil

In [3]:
# QUIQ 테이블의 컬럼 정의
QUIQ_cols = [
    "Primary_key", "Variable_ID", "Original_table_name", "Variable_name", "Event_date", 
    "Value", "Unit", "Variable_type","Is_categorical", "Recorder", "Recorder_position", "Recorder_affiliation",
    "Patient_id", "Admission_id", "Ground_truth", "Mapping_info_1", "Mapping_info_2"
]

## logical accuracy 테스트용 2000

In [4]:
patients = pd.read_csv('PATIENTS.csv.gz', compression='gzip')
patients

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0
...,...,...,...,...,...,...,...,...
46515,31840,44089,M,2026-05-25 00:00:00,,,,0
46516,31841,44115,F,2124-07-27 00:00:00,,,,0
46517,31842,44123,F,2049-11-26 00:00:00,2135-01-12 00:00:00,2135-01-12 00:00:00,,1
46518,31843,44126,F,2076-07-25 00:00:00,,,,0


In [5]:
# 1. 중복 제거 후 환자 ID 샘플링
la = patients["SUBJECT_ID"].drop_duplicates().sample(n=2000, random_state=42)

# 2. 샘플링된 환자 ID에 해당하는 모든 행 추출
sampled_la = patients[patients["SUBJECT_ID"].isin(la)]

# 3. 인덱스 초기화
sampled_la = sampled_la.reset_index(drop=True)
sampled_la

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,238,253,F,2089-11-26 00:00:00,,,,0
1,655,695,F,2093-05-14 00:00:00,2178-09-16 00:00:00,,2178-09-16 00:00:00,1
2,750,796,M,2062-02-13 00:00:00,,,,0
3,753,799,F,2134-08-28 00:00:00,,,,0
4,779,825,M,2106-07-20 00:00:00,,,,0
...,...,...,...,...,...,...,...,...
1995,31768,43787,M,1811-09-12 00:00:00,,,,0
1996,31783,43866,M,2072-06-20 00:00:00,,,,0
1997,31805,43946,F,2061-07-31 00:00:00,,,,0
1998,31815,43990,F,2101-06-17 00:00:00,2155-09-10 00:00:00,2155-09-10 00:00:00,2155-09-10 00:00:00,1


In [6]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID"}

for index, row in sampled_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    #admission_id = row["HADM_ID"]
    
    for col in sampled_la.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 1,
                "Variable_ID": np.nan,
                "Original_table_name": "PATIENTS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": np.nan,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
patient_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
patient_quiq["Variable_type"] = patient_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = patient_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
patient_quiq["Is_categorical"] = patient_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
patient_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1,,PATIENTS,GENDER,,F,,string,1,,,,253,,,,
1,1,,PATIENTS,DOB,,2089-11-26 00:00:00,,timestamp,0,,,,253,,,,
2,1,,PATIENTS,DOD,,,,,0,,,,253,,,,
3,1,,PATIENTS,DOD_HOSP,,,,,0,,,,253,,,,
4,1,,PATIENTS,DOD_SSN,,,,,0,,,,253,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,2000,,PATIENTS,DOB,,2086-03-29 00:00:00,,timestamp,0,,,,44073,,,,
11996,2000,,PATIENTS,DOD,,,,,0,,,,44073,,,,
11997,2000,,PATIENTS,DOD_HOSP,,,,,0,,,,44073,,,,
11998,2000,,PATIENTS,DOD_SSN,,,,,0,,,,44073,,,,


In [7]:
# 1. 매핑 룰 정의
mapping_rules = {
    "DOB": ("date", np.nan),
    "DOD": ("date", np.nan),
    "DOD_SSN": ("date", np.nan),
    "DOD_HOSP": ("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
patient_quiq[["Mapping_info_1", "Mapping_info_2"]] = patient_quiq.apply(map_mapping_info, axis=1)
patient_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1,,PATIENTS,GENDER,,F,,string,1,,,,253,,,,
1,1,,PATIENTS,DOB,,2089-11-26 00:00:00,,timestamp,0,,,,253,,,date,
2,1,,PATIENTS,DOD,,,,,0,,,,253,,,,
3,1,,PATIENTS,DOD_HOSP,,,,,0,,,,253,,,,
4,1,,PATIENTS,DOD_SSN,,,,,0,,,,253,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,2000,,PATIENTS,DOB,,2086-03-29 00:00:00,,timestamp,0,,,,44073,,,date,
11996,2000,,PATIENTS,DOD,,,,,0,,,,44073,,,,
11997,2000,,PATIENTS,DOD_HOSP,,,,,0,,,,44073,,,,
11998,2000,,PATIENTS,DOD_SSN,,,,,0,,,,44073,,,,


In [103]:
admission = pd.read_csv('ADMISSIONS.csv.gz', compression='gzip')
admission

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58594,98800,191113,2131-03-30 21:13:00,2131-04-02 15:02:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,ENGL,NOT SPECIFIED,SINGLE,WHITE,2131-03-30 19:44:00,2131-03-30 22:41:00,TRAUMA,0,1
58972,58595,98802,101071,2151-03-05 20:00:00,2151-03-06 09:10:00,2151-03-06 09:10:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2151-03-05 17:23:00,2151-03-05 21:06:00,SAH,1,1
58973,58596,98805,122631,2200-09-12 07:15:00,2200-09-20 12:08:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,RENAL CANCER/SDA,0,1
58974,58597,98813,170407,2128-11-11 02:29:00,2128-12-22 13:11:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,ENGL,CATHOLIC,MARRIED,WHITE,2128-11-10 23:48:00,2128-11-11 03:16:00,S/P FALL,0,0


In [104]:
admission_la = admission[admission["SUBJECT_ID"].isin(la)].reset_index(drop=True)
admission_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,47,46,144073,2133-02-20 18:21:00,2133-02-24 14:42:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Medicare,,CATHOLIC,SINGLE,WHITE,,,GASTROINTESTINAL BLEED,0,1
1,80,79,181542,2175-09-25 23:05:00,2175-09-29 14:10:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,UNOBTAINABLE,,UNKNOWN/NOT SPECIFIED,,,ANTERIOR MI\CATH,0,1
2,85,84,120969,2196-02-02 07:15:00,2196-02-04 17:48:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,,OTHER,MARRIED,WHITE,,,MEDIAL PARIETAL TUMOR/SDA,0,0
3,86,84,166401,2196-04-14 04:02:00,2196-04-17 13:42:00,2196-04-17 13:42:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,,OTHER,MARRIED,WHITE,2196-04-13 22:23:00,2196-04-14 04:31:00,"GLIOBLASTOMA,NAUSEA",1,1
4,105,102,195700,2196-02-27 11:15:00,2196-03-31 20:27:00,,NEWBORN,CLINIC REFERRAL/PREMATURE,HOME,Private,,NOT SPECIFIED,,WHITE,,,NEWBORN,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2543,56989,93829,197106,2142-08-13 00:43:00,2142-08-23 13:05:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Private,ENGL,CHRISTIAN SCIENTIST,MARRIED,WHITE,,,AORTIC STENOSIS\BENTAL PROCEDURE,0,1
2544,56990,93831,107720,2116-11-17 18:05:00,2116-12-01 12:27:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Government,ENGL,CATHOLIC,SINGLE,WHITE,2116-11-17 12:28:00,2116-11-17 19:46:00,BACK PAIN,0,1
2545,57759,96232,153969,2162-03-17 08:00:00,2162-04-29 13:15:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,REHAB/DISTINCT PART HOSP,Private,ENGL,CATHOLIC,SINGLE,WHITE,,,AORTIC STENOSIS\AORTIC VALVE REPLACEMENT /SDA,0,1
2546,58556,98697,166893,2195-09-08 22:31:00,2195-09-11 17:00:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,2195-09-08 16:58:00,2195-09-09 00:24:00,PNEUMONIA,0,1


In [105]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID", "HADM_ID"}

# admission 반복 처리
for index, row in admission_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in admission_la.columns:
        if col not in exclude_cols:
            
            event_date_val = row["ADMITTIME"] if col == "DIAGNOSIS" and "ADMITTIME" in admission_la.columns else np.nan
            value = row[col]
            rows.append({
                "Primary_key": index + 2001,
                "Variable_ID": np.nan,
                "Original_table_name": "ADMISSIONS",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
admission_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
admission_quiq["Variable_type"] = admission_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = admission_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
admission_quiq["Is_categorical"] = admission_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
admission_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,2001,,ADMISSIONS,ADMITTIME,,2133-02-20 18:21:00,,timestamp,0,,,,46,144073,,,
1,2001,,ADMISSIONS,DISCHTIME,,2133-02-24 14:42:00,,timestamp,0,,,,46,144073,,,
2,2001,,ADMISSIONS,DEATHTIME,,,,,0,,,,46,144073,,,
3,2001,,ADMISSIONS,ADMISSION_TYPE,,EMERGENCY,,string,1,,,,46,144073,,,
4,2001,,ADMISSIONS,ADMISSION_LOCATION,,CLINIC REFERRAL/PREMATURE,,string,1,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40763,4548,,ADMISSIONS,EDREGTIME,,,,,0,,,,98748,122488,,,
40764,4548,,ADMISSIONS,EDOUTTIME,,,,,0,,,,98748,122488,,,
40765,4548,,ADMISSIONS,DIAGNOSIS,2166-12-31 08:00:00,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,,string,0,,,,98748,122488,,,
40766,4548,,ADMISSIONS,HOSPITAL_EXPIRE_FLAG,,0,,numeric,1,,,,98748,122488,,,


In [106]:
# 1. 매핑 룰 정의
mapping_rules = {
    "ADMITTIME": ("date", np.nan),
    "DISCHTIME": ("date", np.nan),
    "DEATHTIME": ("date", np.nan),
    "EDREGTIME": ("date", np.nan),
    "EDOUTTIME": ("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
admission_quiq[["Mapping_info_1", "Mapping_info_2"]] = admission_quiq.apply(map_mapping_info, axis=1)
admission_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,2001,,ADMISSIONS,ADMITTIME,,2133-02-20 18:21:00,,timestamp,0,,,,46,144073,,date,
1,2001,,ADMISSIONS,DISCHTIME,,2133-02-24 14:42:00,,timestamp,0,,,,46,144073,,date,
2,2001,,ADMISSIONS,DEATHTIME,,,,,0,,,,46,144073,,,
3,2001,,ADMISSIONS,ADMISSION_TYPE,,EMERGENCY,,string,1,,,,46,144073,,,
4,2001,,ADMISSIONS,ADMISSION_LOCATION,,CLINIC REFERRAL/PREMATURE,,string,1,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40763,4548,,ADMISSIONS,EDREGTIME,,,,,0,,,,98748,122488,,,
40764,4548,,ADMISSIONS,EDOUTTIME,,,,,0,,,,98748,122488,,,
40765,4548,,ADMISSIONS,DIAGNOSIS,2166-12-31 08:00:00,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,,string,0,,,,98748,122488,,diagnosis,
40766,4548,,ADMISSIONS,HOSPITAL_EXPIRE_FLAG,,0,,numeric,1,,,,98748,122488,,,


In [107]:
procedureevents_mv = pd.read_csv('PROCEDUREEVENTS_MV.csv.gz', compression='gzip')
procedureevents_mv

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYNAME,SECONDARYORDERCATEGORYNAME,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE
0,379,29070,115071,232563.0,2145-03-12 23:04:00,2145-03-12 23:05:00,225401,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
1,380,29070,115071,232563.0,2145-03-12 23:04:00,2145-03-12 23:05:00,225454,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
2,381,29070,115071,232563.0,2145-03-12 23:05:00,2145-03-18 20:01:00,225792,8456.0,hour,,...,Ventilation,,Task,1,0,0,FinishedRunning,,,
3,382,29070,115071,232563.0,2145-03-12 23:36:00,2145-03-12 23:37:00,225402,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
4,383,29070,115071,232563.0,2145-03-13 01:27:00,2145-03-16 16:00:00,224560,5193.0,min,Right IJ,...,Invasive Lines,,Task,1,0,0,FinishedRunning,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258061,257337,41958,182711,246007.0,2155-08-09 01:51:00,2155-08-10 11:20:00,225204,2009.0,min,Right Antecube,...,Invasive Lines,,Task,1,0,0,FinishedRunning,,,
258062,257338,41958,182711,246007.0,2155-08-09 01:57:00,2155-08-09 01:58:00,225402,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
258063,257339,41958,182711,246007.0,2155-08-09 01:57:00,2155-08-09 01:58:00,225459,1.0,,,...,Imaging,,Electrolytes,0,0,0,FinishedRunning,,,
258064,257340,41958,182711,246007.0,2155-08-09 01:57:00,2155-08-09 01:58:00,225966,1.0,,,...,Procedures,,Electrolytes,0,0,1,Rewritten,,RN,2155-08-09 02:20:00


In [108]:
procedure_la = procedureevents_mv[procedureevents_mv["SUBJECT_ID"].isin(la)].reset_index(drop=True)
procedure_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYNAME,SECONDARYORDERCATEGORYNAME,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE
0,987,27366,174736,248077.0,2143-03-01 04:23:00,2143-03-02 07:25:00,224277,1622.0,min,R Antecube,...,Peripheral Lines,,Task,1,0,0,FinishedRunning,,,
1,988,27366,174736,248077.0,2143-03-01 04:24:00,2143-03-01 07:00:00,224275,156.0,min,R Hand,...,Peripheral Lines,,Task,1,0,0,FinishedRunning,,,
2,989,27366,174736,248077.0,2143-03-01 07:45:00,2143-03-01 07:46:00,224385,1.0,,,...,Intubation/Extubation,,Electrolytes,0,0,0,FinishedRunning,,,
3,990,27366,174736,248077.0,2143-03-01 08:00:00,2143-03-01 09:24:00,224277,84.0,min,R Hand,...,Peripheral Lines,,Task,1,0,0,FinishedRunning,,,
4,991,27366,174736,248077.0,2143-03-01 08:31:00,2143-03-01 08:32:00,224385,1.0,,,...,Intubation/Extubation,,Electrolytes,0,0,2,Rewritten,RN,,2143-03-01 08:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10296,257762,45910,153970,249864.0,2197-10-25 20:47:00,2197-10-27 14:47:00,224275,2520.0,min,,...,Peripheral Lines,,Task,1,0,0,FinishedRunning,,,
10297,257763,45910,153970,249864.0,2197-10-25 21:18:00,2197-10-26 07:52:00,224268,634.0,min,Right IJ,...,Invasive Lines,,Task,1,0,0,FinishedRunning,,,
10298,257764,45910,153970,249864.0,2197-10-26 08:11:00,2197-10-27 12:38:00,224268,1707.0,min,Right IJ,...,Invasive Lines,,Task,1,0,0,FinishedRunning,,,
10299,257765,45910,153970,249864.0,2197-10-26 17:05:00,2197-10-26 17:06:00,227194,1.0,,,...,Intubation/Extubation,,Electrolytes,0,0,0,FinishedRunning,,,


In [109]:
procedure_la = pd.merge(
    procedure_la,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched_p = procedure_la[procedure_la['LABEL'].isna()]

# 결과 확인
unmatched_p

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,LABEL,CATEGORY


In [110]:
procedure_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,LABEL,CATEGORY
0,987,27366,174736,248077.0,2143-03-01 04:23:00,2143-03-02 07:25:00,224277,1622.0,min,R Antecube,...,Task,1,0,0,FinishedRunning,,,,18 Gauge,Access Lines - Peripheral
1,988,27366,174736,248077.0,2143-03-01 04:24:00,2143-03-01 07:00:00,224275,156.0,min,R Hand,...,Task,1,0,0,FinishedRunning,,,,20 Gauge,Access Lines - Peripheral
2,989,27366,174736,248077.0,2143-03-01 07:45:00,2143-03-01 07:46:00,224385,1.0,,,...,Electrolytes,0,0,0,FinishedRunning,,,,Intubation,1-Intubation/Extubation
3,990,27366,174736,248077.0,2143-03-01 08:00:00,2143-03-01 09:24:00,224277,84.0,min,R Hand,...,Task,1,0,0,FinishedRunning,,,,18 Gauge,Access Lines - Peripheral
4,991,27366,174736,248077.0,2143-03-01 08:31:00,2143-03-01 08:32:00,224385,1.0,,,...,Electrolytes,0,0,2,Rewritten,RN,,2143-03-01 08:32:00,Intubation,1-Intubation/Extubation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10296,257762,45910,153970,249864.0,2197-10-25 20:47:00,2197-10-27 14:47:00,224275,2520.0,min,,...,Task,1,0,0,FinishedRunning,,,,20 Gauge,Access Lines - Peripheral
10297,257763,45910,153970,249864.0,2197-10-25 21:18:00,2197-10-26 07:52:00,224268,634.0,min,Right IJ,...,Task,1,0,0,FinishedRunning,,,,Trauma line,Access Lines - Invasive
10298,257764,45910,153970,249864.0,2197-10-26 08:11:00,2197-10-27 12:38:00,224268,1707.0,min,Right IJ,...,Task,1,0,0,FinishedRunning,,,,Trauma line,Access Lines - Invasive
10299,257765,45910,153970,249864.0,2197-10-26 17:05:00,2197-10-26 17:06:00,227194,1.0,,,...,Electrolytes,0,0,0,FinishedRunning,,,,Extubation,1-Intubation/Extubation


In [111]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'VALUEUOM', 'CHARTTIME', 'ITEMID', 'CATEGORY', 'LABEL'}

for index, row in procedure_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    # ✅ 1. LABEL 기반 행 생성
    label_val = row["CATEGORY"]
    value_val = row["LABEL"]
    event_date_val = row["STARTTIME"] if "STARTTIME" in procedure_la.columns else np.nan
    variable_id_val = row["ITEMID"] if "ITEMID" in procedure_la.columns else np.nan

    rows.append({
        "Primary_key": index + 4548,
        "Variable_ID": variable_id_val,
        "Original_table_name": "PROCEDUREEVENTS",
        "Variable_name": label_val,
        "Event_date": event_date_val,
        "Value": value_val,
        "Unit": np.nan,
        "Variable_type": np.nan,
        "Is_categorical": np.nan,
        "Recorder": np.nan,
        "Recorder_position": np.nan,
        "Recorder_affiliation": np.nan,
        "Patient_id": patient_id,
        "Admission_id": admission_id,
        "Ground_truth": np.nan,
        "Mapping_info_1": "procedure",
        "Mapping_info_2": np.nan
    })

    # ✅ 2. 나머지 열들에 대해 반복 (컬럼명을 Variable_name으로)
    for col in procedure_la.columns:
        if col not in exclude_cols:
            value = row[col]
            unit_val = row["VALUEUOM"] if col == "VALUE" and "VALUEUOM" in procedure_la.columns else np.nan
            rows.append({
                "Primary_key": index + 4548,  # 고유성 확보용 소수 해시
                "Variable_ID": np.nan,
                "Original_table_name": "PROCEDUREEVENTS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })


# QUIQ 테이블로 변환
procedure_la_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
procedure_la_quiq["Variable_type"] = procedure_la_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단
CATEGORICAL_THRESHOLD = 10
value_counts = procedure_la_quiq.groupby("Variable_name")["Value"].nunique()
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index
procedure_la_quiq["Is_categorical"] = procedure_la_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
procedure_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,4548,224277.0,PROCEDUREEVENTS,Access Lines - Peripheral,2143-03-01 04:23:00,18 Gauge,,string,1,,,,27366,174736,,procedure,
1,4548,,PROCEDUREEVENTS,STARTTIME,,2143-03-01 04:23:00,,timestamp,0,,,,27366,174736,,,
2,4548,,PROCEDUREEVENTS,ENDTIME,,2143-03-02 07:25:00,,timestamp,0,,,,27366,174736,,,
3,4548,,PROCEDUREEVENTS,VALUE,,1622.0,min,numeric,0,,,,27366,174736,,,
4,4548,,PROCEDUREEVENTS,LOCATION,,R Antecube,,string,0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206015,14848,,PROCEDUREEVENTS,CANCELREASON,,0,,numeric,1,,,,45910,153970,,,
206016,14848,,PROCEDUREEVENTS,STATUSDESCRIPTION,,FinishedRunning,,string,1,,,,45910,153970,,,
206017,14848,,PROCEDUREEVENTS,COMMENTS_EDITEDBY,,,,,1,,,,45910,153970,,,
206018,14848,,PROCEDUREEVENTS,COMMENTS_CANCELEDBY,,,,,1,,,,45910,153970,,,


In [112]:
# 1. 매핑 함수 (값이 없고 기존 매핑도 없을 때만 매핑 시도)
mapping_rules = {
    'STARTTIME':("date", np.nan),
    'ENDTIME':("date", np.nan), 
    'COMMENTS_DATE': ("date", np.nan),
    'STORETIME': ("date", np.nan)
}

def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    # 기존 매핑 유지
    if pd.notna(row["Mapping_info_1"]):
        return pd.Series([row["Mapping_info_1"], row["Mapping_info_2"]])
    
    # 새 매핑 적용 (단, value가 NaN이면 매핑 안함)
    if pd.isna(value):
        return pd.Series([np.nan, np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 2. 기존 Mapping_info 컬럼이 없으면 생성 (예외 방지)
if "Mapping_info_1" not in procedure_la_quiq.columns:
    procedure_la_quiq["Mapping_info_1"] = np.nan
    procedure_la_quiq["Mapping_info_2"] = np.nan

# 3. 적용
procedure_la_quiq[["Mapping_info_1", "Mapping_info_2"]] = procedure_la_quiq.apply(map_mapping_info, axis=1)
procedure_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,4548,224277.0,PROCEDUREEVENTS,Access Lines - Peripheral,2143-03-01 04:23:00,18 Gauge,,string,1,,,,27366,174736,,procedure,
1,4548,,PROCEDUREEVENTS,STARTTIME,,2143-03-01 04:23:00,,timestamp,0,,,,27366,174736,,date,
2,4548,,PROCEDUREEVENTS,ENDTIME,,2143-03-02 07:25:00,,timestamp,0,,,,27366,174736,,date,
3,4548,,PROCEDUREEVENTS,VALUE,,1622.0,min,numeric,0,,,,27366,174736,,,
4,4548,,PROCEDUREEVENTS,LOCATION,,R Antecube,,string,0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206015,14848,,PROCEDUREEVENTS,CANCELREASON,,0,,numeric,1,,,,45910,153970,,,
206016,14848,,PROCEDUREEVENTS,STATUSDESCRIPTION,,FinishedRunning,,string,1,,,,45910,153970,,,
206017,14848,,PROCEDUREEVENTS,COMMENTS_EDITEDBY,,,,,1,,,,45910,153970,,,
206018,14848,,PROCEDUREEVENTS,COMMENTS_CANCELEDBY,,,,,1,,,,45910,153970,,,


In [113]:
prescriptions = pd.read_csv('PRESCRIPTIONS.csv.gz', compression='gzip')
prescriptions

  prescriptions = pd.read_csv('PRESCRIPTIONS.csv.gz', compression='gzip')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,2214776,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Tacrolimus,Tacrolimus,Tacrolimus,TACR1,021796,4.690617e+08,1mg Capsule,2,mg,2,CAP,PO
1,2214775,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Warfarin,Warfarin,Warfarin,WARF5,006562,5.601728e+07,5mg Tablet,5,mg,1,TAB,PO
2,2215524,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Heparin Sodium,,,HEPAPREMIX,006522,3.380550e+08,"25,000 unit Premix Bag",25000,UNIT,1,BAG,IV
3,2216265,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,BASE,D5W,,,HEPBASE,,0.000000e+00,HEPARIN BASE,250,ml,250,ml,IV
4,2214773,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Furosemide,Furosemide,Furosemide,FURO20,008208,5.482972e+07,20mg Tablet,20,mg,1,TAB,PO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4156445,3996662,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,PredniSONE,PredniSONE,PredniSONE,PRED20,006751,5.400182e+07,20 mg Tablet,40,mg,2,TAB,PO/NG
4156446,3996070,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,Ipratropium Bromide Neb,Ipratropium Bromide Neb,Ipratropium Bromide Neb,IPRA2H,021700,4.879801e+08,2.5mL Vial,1,NEB,1,VIAL,IH
4156447,3996063,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,HYDROmorphone (Dilaudid),HYDROmorphone (Dilaudid),HYDROmorphone,HYDR20/100NS,048078,6.155302e+10,20 mg / 100 mL Premix Bag,0.12,mg,0.01,BAG,IVPCA
4156448,3996062,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,Docusate Sodium,Docusate Sodium,Docusate Sodium,DOCU100,003009,9.042245e+08,100mg Capsule,100,mg,1,CAP,PO


In [114]:
prescription_la = prescriptions[prescriptions["SUBJECT_ID"].isin(la)].reset_index(drop=True)
prescription_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,1943685,46,144073,,2133-02-20 00:00:00,2133-02-20 00:00:00,MAIN,Docusate Sodium,Docusate Sodium,Docusate Sodium,DOCU100,003009,5.107900e+10,100MG CAP,100,mg,1,CAP,PO
1,1943686,46,144073,,2133-02-20 00:00:00,2133-02-20 00:00:00,MAIN,Zolpidem Tartrate,Zolpidem Tartrate,Zolpidem Tartrate,AMBI5,019187,2.554013e+07,5MG TAB,5,mg,1,TAB,PO
2,1943700,46,144073,,2133-02-20 00:00:00,2133-02-21 00:00:00,MAIN,Levofloxacin,,,LEVO500PM,029929,4.500680e+07,500MG PM BAG,500,mg,1,BAG,IV
3,1943703,46,144073,,2133-02-20 00:00:00,2133-02-21 00:00:00,BASE,NS,,,NS100,001210,3.380049e+08,100ML BAG,100,ml,100,ml,IV
4,1943704,46,144073,,2133-02-20 00:00:00,2133-02-21 00:00:00,BASE,NS,,,NS50,001210,3.380049e+08,50ML BAG,50,ml,50,ml,IV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179980,3907361,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-26 00:00:00,MAIN,Ampicillin-Sulbactam,,,UNAS3I,008921,6.332304e+10,3g Vial,3,g,1,VIAL,IV
179981,3907332,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-26 00:00:00,MAIN,Pneumococcal Vac Polyvalent,Pneumococcal Vac Polyvalent,PNEUMOcoccal Vac Polyvalent,PNEU25I,048548,6.473900e+06,25mcg/0.5mL Vial,0.5,mL,1,VIAL,IM
179982,3907350,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-27 00:00:00,BASE,Vial,,,VIAL,,0.000000e+00,Send Vial,1,VIAL,1,VIAL,IV
179983,3907363,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-27 00:00:00,MAIN,Pantoprazole,,,PANT40I,047635,8.092355e+06,40 mg Vial,40,mg,1,VIAL,IV


In [115]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',  'DOSE_UNIT_RX', 'FORM_UNIT_DISP',}


# admission 반복 처리
for index, row in prescription_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in prescription_la.columns:
        if col not in exclude_cols:
            value = row[col]

            # Unit 처리 조건
            if col == "DOSE_VAL_RX":
                unit_val = row["DOSE_UNIT_RX"] if "DOSE_UNIT_RX" in prescription_la.columns else np.nan
            elif col == "FORM_VAL_DISP":
                unit_val = row["FORM_UNIT_DISP"] if "FORM_UNIT_DISP" in prescription_la.columns else np.nan
            else:
                unit_val = np.nan
                
            # Event_date 설정
            if col == "DRUG":
                event_date_val = row["STARTDATE"] if "STARTDATE" in prescription_la.columns else np.nan
            else:
                event_date_val = np.nan

            rows.append({
                "Primary_key": index + 14849,
                "Variable_ID": np.nan,
                "Original_table_name": "PRESCRIPTIONS",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
prescription_la_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
prescription_la_quiq["Variable_type"] = prescription_la_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = prescription_la_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
prescription_la_quiq["Is_categorical"] = prescription_la_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
prescription_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,14849,,PRESCRIPTIONS,STARTDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
1,14849,,PRESCRIPTIONS,ENDDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
2,14849,,PRESCRIPTIONS,DRUG_TYPE,,MAIN,,string,1,,,,46,144073,,,
3,14849,,PRESCRIPTIONS,DRUG,2133-02-20 00:00:00,Docusate Sodium,,string,0,,,,46,144073,,,
4,14849,,PRESCRIPTIONS,DRUG_NAME_POE,,Docusate Sodium,,string,0,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339800,194833,,PRESCRIPTIONS,NDC,,781305714.0,,numeric,0,,,,99068,164100,,,
2339801,194833,,PRESCRIPTIONS,PROD_STRENGTH,,2mg/mL-2mL,,string,0,,,,99068,164100,,,
2339802,194833,,PRESCRIPTIONS,DOSE_VAL_RX,,4,mg,string,0,,,,99068,164100,,,
2339803,194833,,PRESCRIPTIONS,FORM_VAL_DISP,,1,VIAL,string,0,,,,99068,164100,,,


In [116]:
# 1. 매핑 룰 정의
mapping_rules = {'STARTDATE': ("date", np.nan), 
                 'ENDDATE': ("date", np.nan),   
                 'DRUG': ("prescription", "drug"),
                 'GSN': ("medical_code", np.nan), 
                 'NDC': ("medical_code", np.nan),   
                 'DOSE_VAL_RX': ("prescription", "prescription_info"), 
                 'FORM_VAL_DISP': ("prescription", "prescription_info")
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
prescription_la_quiq[["Mapping_info_1", "Mapping_info_2"]] = prescription_la_quiq.apply(map_mapping_info, axis=1)
prescription_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,14849,,PRESCRIPTIONS,STARTDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,date,
1,14849,,PRESCRIPTIONS,ENDDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,date,
2,14849,,PRESCRIPTIONS,DRUG_TYPE,,MAIN,,string,1,,,,46,144073,,,
3,14849,,PRESCRIPTIONS,DRUG,2133-02-20 00:00:00,Docusate Sodium,,string,0,,,,46,144073,,prescription,drug
4,14849,,PRESCRIPTIONS,DRUG_NAME_POE,,Docusate Sodium,,string,0,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339800,194833,,PRESCRIPTIONS,NDC,,781305714.0,,numeric,0,,,,99068,164100,,medical_code,
2339801,194833,,PRESCRIPTIONS,PROD_STRENGTH,,2mg/mL-2mL,,string,0,,,,99068,164100,,,
2339802,194833,,PRESCRIPTIONS,DOSE_VAL_RX,,4,mg,string,0,,,,99068,164100,,prescription,prescription_info
2339803,194833,,PRESCRIPTIONS,FORM_VAL_DISP,,1,VIAL,string,0,,,,99068,164100,,prescription,prescription_info


------

In [16]:
diagnosisICD = pd.read_csv('DIAGNOSES_ICD.csv.gz', compression='gzip')
diagnosisICD

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254
...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280
651043,639799,97503,188195,3.0,V5869
651044,639800,97503,188195,4.0,V1279
651045,639801,97503,188195,5.0,5275


In [17]:
diagnosisICD_la = diagnosisICD[diagnosisICD["SUBJECT_ID"].isin(la)].reset_index(drop=True)
diagnosisICD_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1554,118,147035,1.0,V3000
1,1555,118,147035,2.0,V053
2,634,79,181542,1.0,41011
3,635,79,181542,2.0,4271
4,636,79,181542,3.0,41401
...,...,...,...,...,...
27978,638368,97143,122472,22.0,412
27979,638369,97143,122472,23.0,28529
27980,638370,97143,122472,24.0,27650
27981,638371,97143,122472,25.0,V1302


In [18]:
diagnosisICD_la = pd.merge(
    diagnosisICD_la,
    d_diagnosis[['ICD9_CODE', 'SHORT_TITLE']],
    how='left',
    on='ICD9_CODE'
)
diagnosisICD_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE
0,1554,118,147035,1.0,V3000,Single lb in-hosp w/o cs
1,1555,118,147035,2.0,V053,Need prphyl vc vrl hepat
2,634,79,181542,1.0,41011,"AMI anterior wall, init"
3,635,79,181542,2.0,4271,Parox ventric tachycard
4,636,79,181542,3.0,41401,Crnry athrscl natve vssl
...,...,...,...,...,...,...
27978,638368,97143,122472,22.0,412,Old myocardial infarct
27979,638369,97143,122472,23.0,28529,Anemia-other chronic dis
27980,638370,97143,122472,24.0,27650,Volume depletion NOS
27981,638371,97143,122472,25.0,V1302,Personal history UTI


In [82]:
# 열 이름 변경
diagnosisICD_la = diagnosisICD_la.rename(columns={'SHORT_TITLE': 'DIAGNOSIS'})

In [120]:
diagnosisICD_la = pd.merge(
    diagnosisICD_la,
    admission_la[['HADM_ID', 'ADMITTIME']],
    how='left',
    on='HADM_ID'
)
diagnosisICD_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,ADMITTIME
0,1554,118,147035,1.0,V3000,Single lb in-hosp w/o cs,2103-08-11 14:11:00
1,1555,118,147035,2.0,V053,Need prphyl vc vrl hepat,2103-08-11 14:11:00
2,634,79,181542,1.0,41011,"AMI anterior wall, init",2175-09-25 23:05:00
3,635,79,181542,2.0,4271,Parox ventric tachycard,2175-09-25 23:05:00
4,636,79,181542,3.0,41401,Crnry athrscl natve vssl,2175-09-25 23:05:00
...,...,...,...,...,...,...,...
27978,638368,97143,122472,22.0,412,Old myocardial infarct,2189-03-07 22:14:00
27979,638369,97143,122472,23.0,28529,Anemia-other chronic dis,2189-03-07 22:14:00
27980,638370,97143,122472,24.0,27650,Volume depletion NOS,2189-03-07 22:14:00
27981,638371,97143,122472,25.0,V1302,Personal history UTI,2189-03-07 22:14:00


In [121]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',  'DOSE_UNIT_RX', 'FORM_UNIT_DISP',}


# admission 반복 처리
for index, row in diagnosisICD_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in diagnosisICD_la.columns:
        if col not in exclude_cols:
            value = row[col]
                
            # Event_date 설정
            if col == "DIAGNOSIS":
                event_date_val = row["ADMITTIME"] if "ADMITTIME" in diagnosisICD_la.columns else np.nan
            else:
                event_date_val = np.nan

            rows.append({
                "Primary_key": index + 194834,
                "Variable_ID": np.nan,
                "Original_table_name": "DIAGNOSIS",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
diagnosisICD_la_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
diagnosisICD_la_quiq["Variable_type"] = diagnosisICD_la_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = diagnosisICD_la_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
diagnosisICD_la_quiq["Is_categorical"] = diagnosisICD_la_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
diagnosisICD_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,194834,,DIAGNOSIS,SEQ_NUM,,1.0,,numeric,0,,,,118,147035,,,
1,194834,,DIAGNOSIS,ICD9_CODE,,V3000,,string,0,,,,118,147035,,,
2,194834,,DIAGNOSIS,SHORT_TITLE,,Single lb in-hosp w/o cs,,string,0,,,,118,147035,,,
3,194834,,DIAGNOSIS,ADMITTIME,,2103-08-11 14:11:00,,timestamp,0,,,,118,147035,,,
4,194835,,DIAGNOSIS,SEQ_NUM,,2.0,,numeric,0,,,,118,147035,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111927,222815,,DIAGNOSIS,ADMITTIME,,2189-03-07 22:14:00,,timestamp,0,,,,97143,122472,,,
111928,222816,,DIAGNOSIS,SEQ_NUM,,26.0,,numeric,0,,,,97143,122472,,,
111929,222816,,DIAGNOSIS,ICD9_CODE,,V441,,string,0,,,,97143,122472,,,
111930,222816,,DIAGNOSIS,SHORT_TITLE,,Gastrostomy status,,string,0,,,,97143,122472,,,


In [122]:
# 1. 매핑 룰 정의
mapping_rules = {
    "ADMITTIME": ("date", np.nan),
    "ICD9_CODE": ("medical_code", np.nan)

}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
diagnosisICD_la_quiq[["Mapping_info_1", "Mapping_info_2"]] = diagnosisICD_la_quiq.apply(map_mapping_info, axis=1)
diagnosisICD_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,194834,,DIAGNOSIS,SEQ_NUM,,1.0,,numeric,0,,,,118,147035,,,
1,194834,,DIAGNOSIS,ICD9_CODE,,V3000,,string,0,,,,118,147035,,medical_code,
2,194834,,DIAGNOSIS,SHORT_TITLE,,Single lb in-hosp w/o cs,,string,0,,,,118,147035,,,
3,194834,,DIAGNOSIS,ADMITTIME,,2103-08-11 14:11:00,,timestamp,0,,,,118,147035,,date,
4,194835,,DIAGNOSIS,SEQ_NUM,,2.0,,numeric,0,,,,118,147035,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111927,222815,,DIAGNOSIS,ADMITTIME,,2189-03-07 22:14:00,,timestamp,0,,,,97143,122472,,date,
111928,222816,,DIAGNOSIS,SEQ_NUM,,26.0,,numeric,0,,,,97143,122472,,,
111929,222816,,DIAGNOSIS,ICD9_CODE,,V441,,string,0,,,,97143,122472,,medical_code,
111930,222816,,DIAGNOSIS,SHORT_TITLE,,Gastrostomy status,,string,0,,,,97143,122472,,,


In [124]:
labevents = pd.read_csv('LABEVENTS.csv.gz', compression='gzip')
labevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,281,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,
1,282,3,,50800,2101-10-12 18:17:00,ART,,,
2,283,3,,50802,2101-10-12 18:17:00,-1,-1.00,mEq/L,
3,284,3,,50804,2101-10-12 18:17:00,22,22.00,mEq/L,
4,285,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal
...,...,...,...,...,...,...,...,...,...
27854050,27428435,96443,103219.0,50882,2109-12-30 01:40:00,26,26.00,mEq/L,
27854051,27428436,96443,103219.0,50885,2109-12-30 01:40:00,2.1,2.10,mg/dL,abnormal
27854052,27428437,96443,103219.0,50902,2109-12-30 01:40:00,97,97.00,mEq/L,
27854053,27428438,96443,103219.0,50911,2109-12-30 01:40:00,2,2.00,ng/mL,


In [125]:
labevents_la = labevents[labevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
labevents_la

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,21194,46,144073.0,50868,2133-02-20 17:52:00,8,8.0,mEq/L,
1,21195,46,144073.0,50878,2133-02-20 17:52:00,180,180.0,IU/L,abnormal
2,21196,46,144073.0,50882,2133-02-20 17:52:00,17,17.0,mEq/L,abnormal
3,21197,46,144073.0,50883,2133-02-20 17:52:00,0.7,0.7,mg/dL,abnormal
4,21198,46,144073.0,50884,2133-02-20 17:52:00,0.3,0.3,mg/dL,
...,...,...,...,...,...,...,...,...,...
1207432,27806593,99288,179982.0,51256,2194-05-08 04:26:00,70.3,70.3,%,abnormal
1207433,27806594,99288,179982.0,51265,2194-05-08 04:26:00,391,391.0,K/uL,
1207434,27806595,99288,179982.0,51277,2194-05-08 04:26:00,15.7,15.7,%,abnormal
1207435,27806596,99288,179982.0,51279,2194-05-08 04:26:00,4.30,4.3,m/uL,abnormal


In [126]:
merged_df = pd.merge(
    labevents_la,
    d_lab[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched = merged_df[merged_df['LABEL'].isna()]

# 결과 확인
unmatched

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL,CATEGORY


In [127]:
labevents_la = merged_df

In [131]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {
    "ROW_ID", "SUBJECT_ID", "HADM_ID", "ITEMID", "LABEL",
    "CHARTTIME", "VALUE", "VALUEUOM", "CATEGORY", "FLAG"
}

for index, row in labevents_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    item_id = row["ITEMID"]
    variable_name = row["LABEL"]
    event_date = row["CHARTTIME"]
    unit = row["VALUEUOM"]
    
    # 기존 컬럼 처리
    for col in labevents_la.columns:
        if col not in exclude_cols:
            value = row[col]
            
            # 숫자인 경우에만 단위 유지
            if isinstance(value, (int, float)) or (isinstance(value, str) and value.replace('.', '', 1).isdigit()):
                final_unit = unit
            else:
                final_unit = np.nan

            rows.append({
                "Primary_key": index + 222817,
                "Variable_ID": item_id,
                "Original_table_name": "LABEVENTS",
                "Variable_name": variable_name,
                "Event_date": event_date,
                "Value": value,
                "Unit": final_unit,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": "event",
                "Mapping_info_2": "lab_event"
            })

    # ⬇️ Flag 컬럼 무조건 추가 (NaN 포함)
    flag_value = row["FLAG"]  # NaN일 수도 있음

    rows.append({
        "Primary_key": index + 222817,  # 기존과 충돌 방지용 오프셋
        "Variable_ID": np.nan,
        "Original_table_name": "LABEVENTS",
        "Variable_name": "FLAG",  # 고정 문자열
        "Event_date": event_date,
        "Value": flag_value,
        "Unit": np.nan,
        "Variable_type": np.nan,
        "Is_categorical": np.nan,
        "Recorder": np.nan,
        "Recorder_position": np.nan,
        "Recorder_affiliation": np.nan,
        "Patient_id": patient_id,
        "Admission_id": admission_id,
        "Ground_truth": np.nan,
        "Mapping_info_1": "event",
        "Mapping_info_2": "lab_event"
    })
    
labevents_la_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 먼저 계산
labevents_la_quiq["Variable_type"] = labevents_la_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산 (NaN 제외)
value_counts = labevents_la_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기
labevents_la_quiq["Is_categorical"] = labevents_la_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
labevents_la_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,222817,50868.0,LABEVENTS,Anion Gap,2133-02-20 17:52:00,8.0,mEq/L,numeric,0.0,,,,46,144073.0,,event,lab_event
1,222817,,LABEVENTS,FLAG,2133-02-20 17:52:00,,,,,,,,46,144073.0,,event,lab_event
2,222818,50878.0,LABEVENTS,Asparate Aminotransferase (AST),2133-02-20 17:52:00,180.0,IU/L,numeric,0.0,,,,46,144073.0,,event,lab_event
3,222818,,LABEVENTS,FLAG,2133-02-20 17:52:00,abnormal,,string,1.0,,,,46,144073.0,,event,lab_event
4,222819,50882.0,LABEVENTS,Bicarbonate,2133-02-20 17:52:00,17.0,mEq/L,numeric,0.0,,,,46,144073.0,,event,lab_event
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414869,1430251,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
2414870,1430252,51279.0,LABEVENTS,Red Blood Cells,2194-05-08 04:26:00,4.3,m/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event
2414871,1430252,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
2414872,1430253,51301.0,LABEVENTS,White Blood Cells,2194-05-08 04:26:00,12.9,K/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event


In [132]:
QUIQ_table = pd.concat(
    [patient_quiq, admission_quiq, procedure_la_quiq, prescription_la_quiq, diagnosisICD_la_quiq, labevents_la_quiq],
    ignore_index=True
)
QUIQ_table

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1,,PATIENTS,GENDER,,F,,string,1.0,,,,253,,,,
1,1,,PATIENTS,DOB,,2089-11-26 00:00:00,,timestamp,0.0,,,,253,,,date,
2,1,,PATIENTS,DOD,,,,,0.0,,,,253,,,,
3,1,,PATIENTS,DOD_HOSP,,,,,0.0,,,,253,,,,
4,1,,PATIENTS,DOD_SSN,,,,,0.0,,,,253,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5125394,1430251,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
5125395,1430252,51279.0,LABEVENTS,Red Blood Cells,2194-05-08 04:26:00,4.3,m/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event
5125396,1430252,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
5125397,1430253,51301.0,LABEVENTS,White Blood Cells,2194-05-08 04:26:00,12.9,K/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event


In [133]:
QUIQ_table.to_csv("QUIQ_table.csv", index=False)

## Table Description

- D_CPT: High-level dictionary of CPT codes
- D_ICD_DIAGNOSIS: Dictionary of Internationl Statistical Classification of Disease and Related Health Problems codes relating to diagnosis
- D_ICD_PROCEDURES: Dictionary of International Statistical Classification of Disease and Related Health Problems codes relating to procedures
- D_ITEMS: Dictinoary of ITEMIDs appearing in the MIMIC database, except those that relate to laboratory tests
- D_LABITEMS: Dictionary of ITEMIDs in the aboratory database that relate to labroatory tests

In [8]:
d_lab = pd.read_csv('D_LABITEMS.csv.gz', compression='gzip')

In [10]:
d_lab

Unnamed: 0,ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,546,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,547,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,548,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,549,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6
4,550,51350,Immunophenotyping,Cerebrospinal Fluid (CSF),Hematology,
...,...,...,...,...,...,...
748,749,51551,VOIDED SPECIMEN,OTHER BODY FLUID,HEMATOLOGY,
749,750,51552,VOIDED SPECIMEN,STOOL,CHEMISTRY,
750,751,51553,VOIDED SPECIMEN,URINE,CHEMISTRY,
751,752,51554,VOIDED SPECIMEN,JOINT FLUID,HEMATOLOGY,


In [11]:
d_item = pd.read_csv('D_ITEMS.csv.gz', compression= 'gzip')

In [12]:
d_item

Unnamed: 0,ROW_ID,ITEMID,LABEL,ABBREVIATION,DBSOURCE,LINKSTO,CATEGORY,UNITNAME,PARAM_TYPE,CONCEPTID
0,457,497,Patient controlled analgesia (PCA) [Inject],,carevue,chartevents,,,,
1,458,498,PCA Lockout (Min),,carevue,chartevents,,,,
2,459,499,PCA Medication,,carevue,chartevents,,,,
3,460,500,PCA Total Dose,,carevue,chartevents,,,,
4,461,501,PCV Exh Vt (Obser),,carevue,chartevents,,,,
...,...,...,...,...,...,...,...,...,...,...
12482,14518,226757,GCSMotorApacheIIValue,GCSMotorApacheIIValue,metavision,chartevents,Scores - APACHE II,,Text,
12483,14519,226758,GCSVerbalApacheIIValue,GCSVerbalApacheIIValue,metavision,chartevents,Scores - APACHE II,,Text,
12484,14520,226759,HCO3ApacheIIValue,HCO3ApacheIIValue,metavision,chartevents,Scores - APACHE II,,Numeric,
12485,14521,226760,HCO3Score,HCO3Score,metavision,chartevents,Scores - APACHE II,,Numeric,


In [13]:
d_diagnosis = pd.read_csv('D_ICD_DIAGNOSES.csv.gz', compression='gzip')
d_diagnosis

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...,...,...,...
14562,14432,V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
14563,14433,V740,Screening for cholera,Screening examination for cholera
14564,14434,V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
14565,14435,V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...


In [14]:
d_procedure = pd.read_csv('D_ICD_PROCEDURES.csv.gz', compression='gzip')
d_procedure

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,264,851,Canthotomy,Canthotomy
1,265,852,Blepharorrhaphy,Blepharorrhaphy
2,266,859,Adjust lid position NEC,Other adjustment of lid position
3,267,861,Lid reconst w skin graft,Reconstruction of eyelid with skin flap or graft
4,268,862,Lid reconst w muc graft,Reconstruction of eyelid with mucous membrane ...
...,...,...,...,...
3877,3344,9959,Vaccination/innocula NEC,Other vaccination and inoculation
3878,3345,9960,Cardiopulm resuscita NOS,"Cardiopulmonary resuscitation, not otherwise s..."
3879,3346,9961,Atrial cardioversion,Atrial cardioversion
3880,3347,9962,Heart countershock NEC,Other electric countershock of heart


In [15]:
d_cpt = pd.read_csv('D_CPT.csv.gz', compression='gzip')
d_cpt

Unnamed: 0,ROW_ID,CATEGORY,SECTIONRANGE,SECTIONHEADER,SUBSECTIONRANGE,SUBSECTIONHEADER,CODESUFFIX,MINCODEINSUBSECTION,MAXCODEINSUBSECTION
0,1,1,99201-99499,Evaluation and management,99201-99216,Office/other outpatient services,,99201,99216
1,2,1,99201-99499,Evaluation and management,99217-99220,Hospital observation services,,99217,99220
2,3,1,99201-99499,Evaluation and management,99221-99239,Hospital inpatient services,,99221,99239
3,4,1,99201-99499,Evaluation and management,99241-99255,Consultations,,99241,99255
4,5,1,99201-99499,Evaluation and management,99261-99263,Follow-up inpatient consultations (deleted codes),,99261,99263
...,...,...,...,...,...,...,...,...,...
129,130,2,0001F-7025F,Performance measurement,5005F-5100F,Follow-up or other outcomes,F,5005,5100
130,131,2,0001F-7025F,Performance measurement,6005F-6045F,Patient safety,F,6005,6045
131,132,2,0001F-7025F,Performance measurement,7010F-7025F,Structural Measures,F,7010,7025
132,133,3,0016T-0207T,Emerging technology,0016T-0207T,Temporary codes,T,16,207


In [12]:
# QUIQ 테이블의 컬럼 정의
QUIQ_cols = [
    "Primary_key", "Variable_ID", "Original_table_name", "Variable_name", "Event_date", 
    "Value", "Unit", "Variable_type","Is_categorical", "Recorder", "Recorder_position", "Recorder_affiliation",
    "Patient_id", "Admission_id", "Ground_truth", "Mapping_info_1", "Mapping_info_2"
]

# Caregivers 

## links to CHARTEVENTS on CGID

In [11]:
caregivers = pd.read_csv('CAREGIVERS.csv.gz', compression='gzip')
caregivers

Unnamed: 0,ROW_ID,CGID,LABEL,DESCRIPTION
0,2228,16174,RO,Read Only
1,2229,16175,RO,Read Only
2,2230,16176,Res,Resident/Fellow/PA/NP
3,2231,16177,RO,Read Only
4,2232,16178,RT,Respiratory
...,...,...,...,...
7562,6300,20303,MD,
7563,6301,20304,RN,RN
7564,6302,20305,MDs,
7565,6303,20306,RPH,Pharmacist


-----

# Patients

In [19]:
patients = pd.read_csv('PATIENTS.csv.gz', compression='gzip')
patients

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0
...,...,...,...,...,...,...,...,...
46515,31840,44089,M,2026-05-25 00:00:00,,,,0
46516,31841,44115,F,2124-07-27 00:00:00,,,,0
46517,31842,44123,F,2049-11-26 00:00:00,2135-01-12 00:00:00,2135-01-12 00:00:00,,1
46518,31843,44126,F,2076-07-25 00:00:00,,,,0


In [20]:
# 1. 중복 제거 후 환자 ID 샘플링
sampled_subjects = patients["SUBJECT_ID"].drop_duplicates().sample(n=10000, random_state=42)

# 2. 샘플링된 환자 ID에 해당하는 모든 행 추출
sampled_df = patients[patients["SUBJECT_ID"].isin(sampled_subjects)]

# 3. 인덱스 초기화
sampled_df = sampled_df.reset_index(drop=True)

In [21]:
sampled_df

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
1,238,253,F,2089-11-26 00:00:00,,,,0
2,241,257,F,2031-04-03 00:00:00,2121-07-08 00:00:00,2121-07-08 00:00:00,2121-07-08 00:00:00,1
3,245,262,M,2090-01-05 00:00:00,,,,0
4,247,264,F,2162-11-30 00:00:00,,,,0
...,...,...,...,...,...,...,...,...
9995,31822,44019,F,1901-01-14 00:00:00,2201-01-14 00:00:00,2201-01-14 00:00:00,2201-01-14 00:00:00,1
9996,31828,44052,M,2114-04-03 00:00:00,,,,0
9997,31834,44069,M,2064-04-08 00:00:00,,,,0
9998,31836,44073,F,2086-03-29 00:00:00,,,,0


- DOB: date of birth
- DOD: date of death
- DOD_HOSP: date of death as recorded in the hospital database
- DOD_SSN: date of death from the social security database
- EXPIRE_FLAG: binary flag which indicates whether the patient died

In [22]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID"}

for index, row in sampled_la.iterrows():
    patient_id = row["SUBJECT_ID"]
    #admission_id = row["HADM_ID"]
    
    for col in sampled_la.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 1,
                "Variable_ID": np.nan,
                "Original_table_name": "PATIENTS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": np.nan,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
patient_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
patient_quiq["Variable_type"] = patient_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = patient_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
patient_quiq["Is_categorical"] = patient_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
patient_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1,,PATIENTS,GENDER,,F,,string,1,,,,253,,,,
1,1,,PATIENTS,DOB,,2089-11-26 00:00:00,,timestamp,0,,,,253,,,,
2,1,,PATIENTS,DOD,,,,,0,,,,253,,,,
3,1,,PATIENTS,DOD_HOSP,,,,,0,,,,253,,,,
4,1,,PATIENTS,DOD_SSN,,,,,0,,,,253,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,2000,,PATIENTS,DOB,,2086-03-29 00:00:00,,timestamp,0,,,,44073,,,,
11996,2000,,PATIENTS,DOD,,,,,0,,,,44073,,,,
11997,2000,,PATIENTS,DOD_HOSP,,,,,0,,,,44073,,,,
11998,2000,,PATIENTS,DOD_SSN,,,,,0,,,,44073,,,,


In [23]:
# 1. 매핑 룰 정의
mapping_rules = {
    "DOB": ("date", np.nan),
    "DOD": ("date", np.nan),
    "DOD_HOSP": ("date", np.nan),
    "DOD_SSN": ("date", np.nan),
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]
    
    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ Value가 비어있으면 매핑하지 않음
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))  # ✅ 기본 매핑 처리

# 3. 적용
patient_quiq[["Mapping_info_1", "Mapping_info_2"]] = patient_quiq.apply(map_mapping_info, axis=1)

In [24]:
patient_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1,,PATIENTS,GENDER,,F,,string,1,,,,253,,,,
1,1,,PATIENTS,DOB,,2089-11-26 00:00:00,,timestamp,0,,,,253,,,date,
2,1,,PATIENTS,DOD,,,,,0,,,,253,,,,
3,1,,PATIENTS,DOD_HOSP,,,,,0,,,,253,,,,
4,1,,PATIENTS,DOD_SSN,,,,,0,,,,253,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,2000,,PATIENTS,DOB,,2086-03-29 00:00:00,,timestamp,0,,,,44073,,,date,
11996,2000,,PATIENTS,DOD,,,,,0,,,,44073,,,,
11997,2000,,PATIENTS,DOD_HOSP,,,,,0,,,,44073,,,,
11998,2000,,PATIENTS,DOD_SSN,,,,,0,,,,44073,,,,


In [18]:
patient_quiq.to_csv("G:/2000/MIMIC_patient_QUIQ.csv", index=False)

In [20]:
sampled_la.columns

Index(['ROW_ID', 'SUBJECT_ID', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN',
       'EXPIRE_FLAG'],
      dtype='object')

In [21]:
# VIA 테이블 생성
via_variable_names = ['GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN',
       'EXPIRE_FLAG']
via_descriptions = ["Patient's gender ('M' = male, 'F' = female)",
    "Date of birth (YYYY-MM-DD format; date shifted for de-identification)",
    "Date of death (regardless of in- or out-of-hospital)",
    "Date of in-hospital death",
    "Date of death based on Social Security records",
    "In-hospital death indicator (1 = deceased, 0 = alive)"
]
via_patients = pd.DataFrame({
    'Original_table_name': 'PATIENT',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})

via_patients

Unnamed: 0,Original_table_name,Variable_name,Description
0,PATIENT,GENDER,"Patient's gender ('M' = male, 'F' = female)"
1,PATIENT,DOB,Date of birth (YYYY-MM-DD format; date shifted...
2,PATIENT,DOD,Date of death (regardless of in- or out-of-hos...
3,PATIENT,DOD_HOSP,Date of in-hospital death
4,PATIENT,DOD_SSN,Date of death based on Social Security records
5,PATIENT,EXPIRE_FLAG,"In-hospital death indicator (1 = deceased, 0 =..."


In [22]:
via_patients.to_csv("G:/2000/MIMIC_patient_VIA.csv", index=False)

------

# Admission

In [29]:
admission = pd.read_csv('ADMISSIONS.csv.gz', compression='gzip')
admission

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58594,98800,191113,2131-03-30 21:13:00,2131-04-02 15:02:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,ENGL,NOT SPECIFIED,SINGLE,WHITE,2131-03-30 19:44:00,2131-03-30 22:41:00,TRAUMA,0,1
58972,58595,98802,101071,2151-03-05 20:00:00,2151-03-06 09:10:00,2151-03-06 09:10:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2151-03-05 17:23:00,2151-03-05 21:06:00,SAH,1,1
58973,58596,98805,122631,2200-09-12 07:15:00,2200-09-20 12:08:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,RENAL CANCER/SDA,0,1
58974,58597,98813,170407,2128-11-11 02:29:00,2128-12-22 13:11:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,ENGL,CATHOLIC,MARRIED,WHITE,2128-11-10 23:48:00,2128-11-11 03:16:00,S/P FALL,0,0


In [30]:
admission_df = admission[admission["SUBJECT_ID"].isin(la)].reset_index(drop=True)

In [31]:
admission_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,47,46,144073,2133-02-20 18:21:00,2133-02-24 14:42:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Medicare,,CATHOLIC,SINGLE,WHITE,,,GASTROINTESTINAL BLEED,0,1
1,80,79,181542,2175-09-25 23:05:00,2175-09-29 14:10:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,UNOBTAINABLE,,UNKNOWN/NOT SPECIFIED,,,ANTERIOR MI\CATH,0,1
2,85,84,120969,2196-02-02 07:15:00,2196-02-04 17:48:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,,OTHER,MARRIED,WHITE,,,MEDIAL PARIETAL TUMOR/SDA,0,0
3,86,84,166401,2196-04-14 04:02:00,2196-04-17 13:42:00,2196-04-17 13:42:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,,OTHER,MARRIED,WHITE,2196-04-13 22:23:00,2196-04-14 04:31:00,"GLIOBLASTOMA,NAUSEA",1,1
4,105,102,195700,2196-02-27 11:15:00,2196-03-31 20:27:00,,NEWBORN,CLINIC REFERRAL/PREMATURE,HOME,Private,,NOT SPECIFIED,,WHITE,,,NEWBORN,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2543,56989,93829,197106,2142-08-13 00:43:00,2142-08-23 13:05:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Private,ENGL,CHRISTIAN SCIENTIST,MARRIED,WHITE,,,AORTIC STENOSIS\BENTAL PROCEDURE,0,1
2544,56990,93831,107720,2116-11-17 18:05:00,2116-12-01 12:27:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Government,ENGL,CATHOLIC,SINGLE,WHITE,2116-11-17 12:28:00,2116-11-17 19:46:00,BACK PAIN,0,1
2545,57759,96232,153969,2162-03-17 08:00:00,2162-04-29 13:15:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,REHAB/DISTINCT PART HOSP,Private,ENGL,CATHOLIC,SINGLE,WHITE,,,AORTIC STENOSIS\AORTIC VALVE REPLACEMENT /SDA,0,1
2546,58556,98697,166893,2195-09-08 22:31:00,2195-09-11 17:00:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,2195-09-08 16:58:00,2195-09-09 00:24:00,PNEUMONIA,0,1


In [28]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID", "HADM_ID"}

# admission 반복 처리
for index, row in admission_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in admission_df.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 2001,
                "Variable_ID": np.nan,
                "Original_table_name": "ADMISSIONS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
admission_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
admission_quiq["Variable_type"] = admission_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = admission_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
admission_quiq["Is_categorical"] = admission_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
admission_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,2001,,ADMISSIONS,ADMITTIME,,2133-02-20 18:21:00,,timestamp,0,,,,46,144073,,,
1,2001,,ADMISSIONS,DISCHTIME,,2133-02-24 14:42:00,,timestamp,0,,,,46,144073,,,
2,2001,,ADMISSIONS,DEATHTIME,,,,,0,,,,46,144073,,,
3,2001,,ADMISSIONS,ADMISSION_TYPE,,EMERGENCY,,string,1,,,,46,144073,,,
4,2001,,ADMISSIONS,ADMISSION_LOCATION,,CLINIC REFERRAL/PREMATURE,,string,1,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40763,4548,,ADMISSIONS,EDREGTIME,,,,,0,,,,98748,122488,,,
40764,4548,,ADMISSIONS,EDOUTTIME,,,,,0,,,,98748,122488,,,
40765,4548,,ADMISSIONS,DIAGNOSIS,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,,string,0,,,,98748,122488,,,
40766,4548,,ADMISSIONS,HOSPITAL_EXPIRE_FLAG,,0,,numeric,1,,,,98748,122488,,,


In [35]:
condition_ad = (
    (admission_quiq["Variable_type"] == "string") &
    (admission_quiq["Variable_name"] == "DIAGNOSIS") &
    (admission_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
admission_quiq.loc[condition_ad, "Is_categorical"] = 1
admission_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,2001,,ADMISSIONS,ADMITTIME,,2133-02-20 18:21:00,,timestamp,0,,,,46,144073,,,
1,2001,,ADMISSIONS,DISCHTIME,,2133-02-24 14:42:00,,timestamp,0,,,,46,144073,,,
2,2001,,ADMISSIONS,DEATHTIME,,,,,0,,,,46,144073,,,
3,2001,,ADMISSIONS,ADMISSION_TYPE,,EMERGENCY,,string,1,,,,46,144073,,,
4,2001,,ADMISSIONS,ADMISSION_LOCATION,,CLINIC REFERRAL/PREMATURE,,string,1,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40763,4548,,ADMISSIONS,EDREGTIME,,,,,0,,,,98748,122488,,,
40764,4548,,ADMISSIONS,EDOUTTIME,,,,,0,,,,98748,122488,,,
40765,4548,,ADMISSIONS,DIAGNOSIS,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,,string,1,,,,98748,122488,,,
40766,4548,,ADMISSIONS,HOSPITAL_EXPIRE_FLAG,,0,,numeric,1,,,,98748,122488,,,


In [36]:
# 1. 매핑 룰 정의
mapping_rules = {
    "ADMITTIME": ("date", np.nan),
    "DISCHTIME": ("date", np.nan),
    "DEATHTIME": ("date", np.nan),
    "EDREGTIME": ("date", np.nan),
    "EDOUTTIME": ("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
admission_quiq[["Mapping_info_1", "Mapping_info_2"]] = admission_quiq.apply(map_mapping_info, axis=1)
admission_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,2001,,ADMISSIONS,ADMITTIME,,2133-02-20 18:21:00,,timestamp,0,,,,46,144073,,date,
1,2001,,ADMISSIONS,DISCHTIME,,2133-02-24 14:42:00,,timestamp,0,,,,46,144073,,date,
2,2001,,ADMISSIONS,DEATHTIME,,,,,0,,,,46,144073,,,
3,2001,,ADMISSIONS,ADMISSION_TYPE,,EMERGENCY,,string,1,,,,46,144073,,,
4,2001,,ADMISSIONS,ADMISSION_LOCATION,,CLINIC REFERRAL/PREMATURE,,string,1,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40763,4548,,ADMISSIONS,EDREGTIME,,,,,0,,,,98748,122488,,,
40764,4548,,ADMISSIONS,EDOUTTIME,,,,,0,,,,98748,122488,,,
40765,4548,,ADMISSIONS,DIAGNOSIS,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,,string,1,,,,98748,122488,,diagnosis,
40766,4548,,ADMISSIONS,HOSPITAL_EXPIRE_FLAG,,0,,numeric,1,,,,98748,122488,,,


In [37]:
admission_quiq.to_csv("G:/2000/MIMIC_admission_QUIQ.csv", index=False)

In [32]:
# VIA 테이블 생성
via_variable_names = ['ADMITTIME', 'DISCHTIME','DEATHTIME','ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION', 
                      'INSURANCE', 'LANGUAGE', 'RELIGION','MARITAL_STATUS', 'ETHNICITY','EDREGTIME', 'EDOUTTIME','DIAGNOSIS',
                      'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA']
via_descriptions = [
    'Timestamp when the patient was admitted to the hospital.',
'Timestamp when the patient was discharged from the hospital.',
'Timestamp of death (if the patient died during admission); otherwise null.',
'Type of admission, such as emergency, urgent, elective, or newborn.',
'Location from which the patient was admitted (e.g., emergency room, transfer from another hospital).',
'Location to which the patient was discharged (e.g., home, nursing facility, another hospital).',
'Type of insurance coverage (e.g., Medicare, Medicaid, Private).',
'Preferred language of the patient.',
'Stated religion of the patient (if provided).',
'Marital status of the patient (e.g., single, married, divorced).',
'Self-reported ethnicity of the patient.',
'Timestamp when the patient registered in the Emergency Department.',
'Timestamp when the patient left the Emergency Department.',
'Free-text description of the admitting diagnosis, typically noted by clinicians.',
'Indicates whether the patient died during the hospital stay (1 = died, 0 = survived).',
'Indicates whether the patient has any data recorded in the CHARTEVENTS table (1 = yes, 0 = no).'

]
via_admission = pd.DataFrame({
    'Original_table_name': 'ADMISSIONS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_admission

Unnamed: 0,Original_table_name,Variable_name,Description
0,ADMISSIONS,ADMITTIME,Timestamp when the patient was admitted to the...
1,ADMISSIONS,DISCHTIME,Timestamp when the patient was discharged from...
2,ADMISSIONS,DEATHTIME,Timestamp of death (if the patient died during...
3,ADMISSIONS,ADMISSION_TYPE,"Type of admission, such as emergency, urgent, ..."
4,ADMISSIONS,ADMISSION_LOCATION,Location from which the patient was admitted (...
5,ADMISSIONS,DISCHARGE_LOCATION,Location to which the patient was discharged (...
6,ADMISSIONS,INSURANCE,"Type of insurance coverage (e.g., Medicare, Me..."
7,ADMISSIONS,LANGUAGE,Preferred language of the patient.
8,ADMISSIONS,RELIGION,Stated religion of the patient (if provided).
9,ADMISSIONS,MARITAL_STATUS,"Marital status of the patient (e.g., single, m..."


In [34]:
via_admission.to_csv("G:/2000/MIMIC_admission_VIA.csv", index=False)

- 'ADMITTIME', 'DISCHTIME','DEATHTIME', 
- 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION','MARITAL_STATUS', 'ETHNICITY',
- 'EDREGTIME', 'EDOUTTIME',
- 'DIAGNOSIS',
- 'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA'

# ICU stay

In [35]:
icustay = pd.read_csv('ICUSTAYS.csv.gz', compression='gzip')
icustay

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,365,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.2490
1,366,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
2,367,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
3,368,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,2120-08-10 00:39:04,2.0600
4,369,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202
...,...,...,...,...,...,...,...,...,...,...,...,...
61527,59806,94944,143774,201233,metavision,CSRU,CSRU,15,15,2104-04-15 10:18:16,2104-04-17 14:51:00,2.1894
61528,59807,94950,123750,283653,metavision,CCU,CCU,7,7,2155-12-08 05:33:16,2155-12-10 17:24:58,2.4942
61529,59808,94953,196881,241585,metavision,SICU,SICU,57,57,2160-03-03 16:09:11,2160-03-04 14:22:33,0.9259
61530,59809,94954,118475,202802,metavision,CSRU,CSRU,15,15,2183-03-25 09:53:10,2183-03-27 17:55:03,2.3346


In [36]:
icustay_df = icustay[icustay["SUBJECT_ID"].isin(la)].reset_index(drop=True)
icustay_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,428,322,177634,217128,carevue,CSRU,CSRU,12,12,2135-05-02 14:46:48,2135-05-04 20:23:20,2.2337
1,83,79,181542,290076,carevue,CSRU,CCU,14,57,2175-09-26 01:00:46,2175-09-28 03:00:09,2.0829
2,88,84,166401,239661,carevue,SICU,SICU,14,14,2196-04-16 23:43:00,2196-04-17 13:42:00,0.5826
3,108,102,195700,245390,carevue,NICU,NICU,56,56,2196-02-27 11:29:12,2196-03-31 21:53:54,33.4338
4,157,110,154943,282073,carevue,NICU,NICU,56,56,2110-06-02 02:49:26,2110-06-05 19:47:04,3.7067
...,...,...,...,...,...,...,...,...,...,...,...,...
2658,58813,91881,184208,236973,metavision,SICU,SICU,33,33,2132-10-02 03:08:33,2132-10-05 16:50:41,3.5709
2659,58818,91904,178866,258717,metavision,SICU,SICU,57,57,2170-12-01 15:43:11,2170-12-03 17:44:25,2.0842
2660,58852,92003,193800,278536,metavision,TSICU,TSICU,14,14,2147-12-18 11:46:18,2147-12-19 13:10:45,1.0586
2661,59792,94911,199992,294585,metavision,CCU,CCU,7,7,2155-02-11 17:13:23,2155-02-12 15:29:40,0.9280


In [37]:
icustay_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'DBSOURCE',
       'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID',
       'INTIME', 'OUTTIME', 'LOS'],
      dtype='object')

In [41]:
# 결과 저장을 위한 리스트
rows = []

for index, row in icustay_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    for col in icustay_df.columns:
        if col not in ["SUBJECT_ID", "HADM_ID", "ROW_ID", "ICUSTAY_ID", "FIRST_WARDID", "LAST_WARDID"]:  # 특정 열 제외
            
            rows.append({
                "Primary_key": index + 4549,  # 원본 테이블의 각 row 마다 부여
                "Variable_ID": np.nan,
                "Original_table_name": "ICUSTAY",  # 테이블 이름
                "Variable_name": col,  # 컬럼명이 변수명
                "Record_datetime": np.nan,
                "Value": row[col],  # 해당 변수 값
                "Unit": np.nan,
                "Variable_type": np.nan,  # 추후 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id, 
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan, 
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블 컬럼 순서 맞추기
icustay_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, bool):
        return "boolean"
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        # 문자열인데 datetime처럼 보이는 경우 처리
        try:
            parsed = pd.to_datetime(val, errors='raise')
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

icustay_quiq["Variable_type"] = icustay_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산 (NaN 제외)
value_counts = icustay_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기
icustay_quiq["Is_categorical"] = icustay_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)

In [42]:
# 1. 매핑 룰 정의
mapping_rules = {
    "INTIME": ("date", np.nan),
    "OUTTIME": ("date", np.nan),
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
icustay_quiq[["Mapping_info_1", "Mapping_info_2"]] = icustay_quiq.apply(map_mapping_info, axis=1)

In [43]:
icustay_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,4549,,ICUSTAY,DBSOURCE,,carevue,,string,1,,,,322,177634,,,
1,4549,,ICUSTAY,FIRST_CAREUNIT,,CSRU,,string,1,,,,322,177634,,,
2,4549,,ICUSTAY,LAST_CAREUNIT,,CSRU,,string,1,,,,322,177634,,,
3,4549,,ICUSTAY,INTIME,,2135-05-02 14:46:48,,timestamp,0,,,,322,177634,,date,
4,4549,,ICUSTAY,OUTTIME,,2135-05-04 20:23:20,,timestamp,0,,,,322,177634,,date,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15973,7211,,ICUSTAY,FIRST_CAREUNIT,,SICU,,string,1,,,,94953,196881,,,
15974,7211,,ICUSTAY,LAST_CAREUNIT,,SICU,,string,1,,,,94953,196881,,,
15975,7211,,ICUSTAY,INTIME,,2160-03-03 16:09:11,,timestamp,0,,,,94953,196881,,date,
15976,7211,,ICUSTAY,OUTTIME,,2160-03-04 14:22:33,,timestamp,0,,,,94953,196881,,date,


In [44]:
icustay_quiq.to_csv('G:/2000/MIMIC_icustay_QUIQ.csv', index=False)

In [45]:
# VIA 테이블 생성
via_variable_names = ['DBSOURCE',
       'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID',
       'INTIME', 'OUTTIME', 'LOS']
via_descriptions = ["Source database of the record (e.g., 'carevue' or 'metavision')", 
                    "Care unit the patient was first admitted to during the ICU stay", 
                    "Care unit the patient was last in during the ICU stay", 
                    "Identifier for the first hospital ward during admission", 
                    "Identifier for the last hospital ward during admission", 
                    "ICU admission time (start of ICU stay)", 
                    "ICU discharge time (end of ICU stay)", 
                    "Length of ICU stay in days (OUTTIME - INTIME)"


]

via_icustay = pd.DataFrame({
    'Original_table_name': 'ICUSTAY',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_icustay

Unnamed: 0,Original_table_name,Variable_name,Description
0,ICUSTAY,DBSOURCE,"Source database of the record (e.g., 'carevue'..."
1,ICUSTAY,FIRST_CAREUNIT,Care unit the patient was first admitted to du...
2,ICUSTAY,LAST_CAREUNIT,Care unit the patient was last in during the I...
3,ICUSTAY,FIRST_WARDID,Identifier for the first hospital ward during ...
4,ICUSTAY,LAST_WARDID,Identifier for the last hospital ward during a...
5,ICUSTAY,INTIME,ICU admission time (start of ICU stay)
6,ICUSTAY,OUTTIME,ICU discharge time (end of ICU stay)
7,ICUSTAY,LOS,Length of ICU stay in days (OUTTIME - INTIME)


In [46]:
via_icustay.to_csv('G:/2000/MIMIC_icustay_VIA.csv', index=False)

-----

# Labevents

In [38]:
labevents = pd.read_csv('LABEVENTS.csv.gz', compression='gzip')
labevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,281,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,
1,282,3,,50800,2101-10-12 18:17:00,ART,,,
2,283,3,,50802,2101-10-12 18:17:00,-1,-1.00,mEq/L,
3,284,3,,50804,2101-10-12 18:17:00,22,22.00,mEq/L,
4,285,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal
...,...,...,...,...,...,...,...,...,...
27854050,27428435,96443,103219.0,50882,2109-12-30 01:40:00,26,26.00,mEq/L,
27854051,27428436,96443,103219.0,50885,2109-12-30 01:40:00,2.1,2.10,mg/dL,abnormal
27854052,27428437,96443,103219.0,50902,2109-12-30 01:40:00,97,97.00,mEq/L,
27854053,27428438,96443,103219.0,50911,2109-12-30 01:40:00,2,2.00,ng/mL,


In [39]:
merged_df = pd.merge(
    labevents,
    d_lab[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched = merged_df[merged_df['LABEL'].isna()]

# 결과 확인
unmatched

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL,CATEGORY


In [40]:
labevents_m = merged_df

In [41]:
labevents_m = labevents_m[labevents_m["SUBJECT_ID"].isin(la)].reset_index(drop=True)

In [42]:
labevents_m

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL,CATEGORY
0,21194,46,144073.0,50868,2133-02-20 17:52:00,8,8.0,mEq/L,,Anion Gap,Chemistry
1,21195,46,144073.0,50878,2133-02-20 17:52:00,180,180.0,IU/L,abnormal,Asparate Aminotransferase (AST),Chemistry
2,21196,46,144073.0,50882,2133-02-20 17:52:00,17,17.0,mEq/L,abnormal,Bicarbonate,Chemistry
3,21197,46,144073.0,50883,2133-02-20 17:52:00,0.7,0.7,mg/dL,abnormal,"Bilirubin, Direct",Chemistry
4,21198,46,144073.0,50884,2133-02-20 17:52:00,0.3,0.3,mg/dL,,"Bilirubin, Indirect",Chemistry
...,...,...,...,...,...,...,...,...,...,...,...
1207432,27806593,99288,179982.0,51256,2194-05-08 04:26:00,70.3,70.3,%,abnormal,Neutrophils,Hematology
1207433,27806594,99288,179982.0,51265,2194-05-08 04:26:00,391,391.0,K/uL,,Platelet Count,Hematology
1207434,27806595,99288,179982.0,51277,2194-05-08 04:26:00,15.7,15.7,%,abnormal,RDW,Hematology
1207435,27806596,99288,179982.0,51279,2194-05-08 04:26:00,4.30,4.3,m/uL,abnormal,Red Blood Cells,Hematology


In [43]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {
    "ROW_ID", "SUBJECT_ID", "HADM_ID", "ITEMID", "LABEL",
    "CHARTTIME", "VALUENUM", "VALUEUOM", "CATEGORY", "FLAG"
}

for index, row in labevents_m.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    item_id = row["ITEMID"]
    variable_name = row["LABEL"]
    event_date = row["CHARTTIME"]
    unit = row["VALUEUOM"]
    
    # 기존 컬럼 처리
    for col in labevents_m.columns:
        if col not in exclude_cols:
            value = row[col]
            
            # 숫자인 경우에만 단위 유지
            if isinstance(value, (int, float)) or (isinstance(value, str) and value.replace('.', '', 1).isdigit()):
                final_unit = unit
            else:
                final_unit = np.nan

            rows.append({
                "Primary_key": index + 7212,
                "Variable_ID": item_id,
                "Original_table_name": "LABEVENTS",
                "Variable_name": variable_name,
                "Event_date": event_date,
                "Value": value,
                "Unit": final_unit,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": "event",
                "Mapping_info_2": "lab_event"
            })

    # ⬇️ Flag 컬럼 무조건 추가 (NaN 포함)
    flag_value = row["FLAG"]  # NaN일 수도 있음

    rows.append({
        "Primary_key": index + 7212,  # 기존과 충돌 방지용 오프셋
        "Variable_ID": np.nan,
        "Original_table_name": "LABEVENTS",
        "Variable_name": "FLAG",  # 고정 문자열
        "Event_date": event_date,
        "Value": flag_value,
        "Unit": np.nan,
        "Variable_type": np.nan,
        "Is_categorical": np.nan,
        "Recorder": np.nan,
        "Recorder_position": np.nan,
        "Recorder_affiliation": np.nan,
        "Patient_id": patient_id,
        "Admission_id": admission_id,
        "Ground_truth": np.nan,
        "Mapping_info_1": "event",
        "Mapping_info_2": "lab_event"
    })
    
labevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        # 먼저 숫자형 가능한지 체크
        try:
            float(val)  # 문자열이 숫자면 float 변환 성공
            return "numeric"
        except ValueError:
            pass

        # 다음으로 timestamp 가능한지 체크
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"
# Variable_type 먼저 계산
labevents_quiq["Variable_type"] = labevents_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산 (NaN 제외)
value_counts = labevents_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기
labevents_quiq["Is_categorical"] = labevents_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
labevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7212,50868.0,LABEVENTS,Anion Gap,2133-02-20 17:52:00,8,mEq/L,numeric,0.0,,,,46,144073.0,,event,lab_event
1,7212,,LABEVENTS,FLAG,2133-02-20 17:52:00,,,,,,,,46,144073.0,,event,lab_event
2,7213,50878.0,LABEVENTS,Asparate Aminotransferase (AST),2133-02-20 17:52:00,180,IU/L,numeric,0.0,,,,46,144073.0,,event,lab_event
3,7213,,LABEVENTS,FLAG,2133-02-20 17:52:00,abnormal,,string,1.0,,,,46,144073.0,,event,lab_event
4,7214,50882.0,LABEVENTS,Bicarbonate,2133-02-20 17:52:00,17,mEq/L,numeric,0.0,,,,46,144073.0,,event,lab_event
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414869,1214646,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
2414870,1214647,51279.0,LABEVENTS,Red Blood Cells,2194-05-08 04:26:00,4.30,m/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event
2414871,1214647,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
2414872,1214648,51301.0,LABEVENTS,White Blood Cells,2194-05-08 04:26:00,12.9,K/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event


In [46]:
# 1. 각 Variable_name의 고유 Value 개수 계산
value_counts = labevents_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 2. 고유값 개수가 1개인 Variable_name만 추출
single_value_vars = value_counts[value_counts == 1].index

# 3. 해당 Variable_name들의 행 중에서 variable_type이 numeric이고, Unit이 있는 경우만 필터
filtered_rows = labevents_quiq[
    (labevents_quiq["Variable_name"].isin(single_value_vars)) &
    (labevents_quiq["Variable_type"] == "numeric") &
    (labevents_quiq["Unit"].notna())
]
filtered_rows

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
46608,30516,50869.0,LABEVENTS,Anti-DGP (IgA/IgG),2117-12-21 06:00:00,3,units,numeric,0.0,,,,948,125857.0,,event,lab_event
91572,52998,51040.0,LABEVENTS,"Phosphate, Body Fluid",2110-03-20 08:08:00,2.2,mg/dL,numeric,0.0,,,,2222,127855.0,,event,lab_event
133974,74199,50840.0,LABEVENTS,"Cholesterol, Ascites",2193-04-23 14:46:00,39,mg/dL,numeric,0.0,,,,3851,144544.0,,event,lab_event
138574,76499,51224.0,LABEVENTS,Hemoglobin C,2160-06-12 14:45:00,0,%,numeric,0.0,,,,3747,,,event,lab_event
138578,76501,51227.0,LABEVENTS,Hemogloblin S,2160-06-12 14:45:00,0,%,numeric,0.0,,,,3747,,,event,lab_event
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106790,1060607,51049.0,LABEVENTS,"Bilirubin, Total, Pleural",2106-11-29 10:19:00,4.8,mg/dL,numeric,0.0,,,,67017,114886.0,,event,lab_event
2347168,1180796,51533.0,LABEVENTS,WBCP,2115-03-09 12:19:00,1,%,numeric,0.0,,,,92644,,,event,lab_event
2369564,1191994,51223.0,LABEVENTS,Hemoglobin A2,2158-12-19 15:00:00,2.7,%,numeric,0.0,,,,93774,,,event,lab_event
2369566,1191995,51224.0,LABEVENTS,Hemoglobin C,2158-12-19 15:00:00,0,%,numeric,0.0,,,,93774,,,event,lab_event


In [45]:
condition = (
    (labevents_quiq["Variable_type"] == "numeric") &
    (labevents_quiq["Unit"].notna()) &
    (labevents_quiq["Is_categorical"] == 1)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
labevents_quiq.loc[condition, "Is_categorical"] = 0
labevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7212,50868.0,LABEVENTS,Anion Gap,2133-02-20 17:52:00,8,mEq/L,numeric,0.0,,,,46,144073.0,,event,lab_event
1,7212,,LABEVENTS,FLAG,2133-02-20 17:52:00,,,,,,,,46,144073.0,,event,lab_event
2,7213,50878.0,LABEVENTS,Asparate Aminotransferase (AST),2133-02-20 17:52:00,180,IU/L,numeric,0.0,,,,46,144073.0,,event,lab_event
3,7213,,LABEVENTS,FLAG,2133-02-20 17:52:00,abnormal,,string,1.0,,,,46,144073.0,,event,lab_event
4,7214,50882.0,LABEVENTS,Bicarbonate,2133-02-20 17:52:00,17,mEq/L,numeric,0.0,,,,46,144073.0,,event,lab_event
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414869,1214646,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
2414870,1214647,51279.0,LABEVENTS,Red Blood Cells,2194-05-08 04:26:00,4.30,m/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event
2414871,1214647,,LABEVENTS,FLAG,2194-05-08 04:26:00,abnormal,,string,1.0,,,,99288,179982.0,,event,lab_event
2414872,1214648,51301.0,LABEVENTS,White Blood Cells,2194-05-08 04:26:00,12.9,K/uL,numeric,0.0,,,,99288,179982.0,,event,lab_event


In [47]:
labevents_quiq.to_csv('G:/2000/MIMIC_labevents_QUIQ.csv', index=False)

In [55]:
labevents_quiq['Variable_name'].unique()

array(['Anion Gap', 'FLAG', 'Asparate Aminotransferase (AST)',
       'Bicarbonate', 'Bilirubin, Direct', 'Bilirubin, Indirect',
       'Bilirubin, Total', 'Calcium, Total', 'Chloride', 'Creatinine',
       'Glucose', 'Green Top Hold (plasma)', 'Potassium', 'Sodium',
       'Urea Nitrogen', 'Vitamin B12', 'Anisocytosis', 'Basophils',
       'Eosinophils', 'Hematocrit', 'Hemoglobin', 'Hypochromia',
       'INR(PT)', 'Lymphocytes', 'Macrocytes', 'MCH', 'MCHC', 'MCV',
       'Monocytes', 'Neutrophils', 'Platelet Count', 'PT', 'PTT', 'RDW',
       'Red Blood Cells', 'White Blood Cells',
       'Alanine Aminotransferase (ALT)', 'Albumin',
       'Alkaline Phosphatase', 'Magnesium', 'Phosphate', 'Amylase',
       'Lactate Dehydrogenase (LD)', 'Lipase', 'Ketone', 'Leukocytes',
       'Nitrite', 'pH', 'Protein', 'RBC', 'Specific Gravity',
       'Urine Appearance', 'Urine Color', 'Urobilinogen', 'WBC', 'Yeast',
       'SPECIMEN TYPE', 'Base Excess', 'Calculated Total CO2',
       'Free Calcium

In [56]:
# VIA 테이블 생성
via_variable_names = ['Potassium', 'FLAG', 'Sodium', 'tacroFK', 'Urea Nitrogen',
       'Hematocrit', 'Hemoglobin', 'INR(PT)', 'MCH', 'MCHC', 'MCV',
       'Platelet Count', 'PT', 'PTT', 'RDW', 'Red Blood Cells',
       'White Blood Cells', 'Anion Gap', 'Bicarbonate', 'Calcium, Total',
       'Chloride', 'Creatinine', 'Glucose', 'Magnesium', 'Phosphate',
       'Hematocrit, Ascites', 'Bilirubin', 'Blood', 'Ketone',
       'Leukocytes', 'Nitrite', 'pH', 'Protein', 'Specific Gravity',
       'Urine Appearance', 'Urine Color', 'Urobilinogen',
       'Alanine Aminotransferase (ALT)', 'Albumin',
       'Asparate Aminotransferase (AST)', 'Bilirubin, Total', 'EDTA Hold',
       'Creatinine, Urine', 'Length of Urine Collection',
       'Total Protein, Urine', 'Anisocytosis', 'Atypical Lymphocytes',
       'Bands', 'Basophils', 'Eosinophils', 'Hypochromia', 'Lymphocytes',
       'Macrocytes', 'Metamyelocytes', 'Microcytes', 'Monocytes',
       'Myelocytes', 'Neutrophils', 'Ovalocytes', 'Poikilocytosis',
       'Polychromasia', 'Alkaline Phosphatase',
       'Cholesterol Ratio (Total/HDL)', 'Cholesterol, HDL',
       'Cholesterol, LDL, Calculated', 'Cholesterol, Total',
       'Protein, Total', 'Triglycerides', 'Platelet Smear',
       'Protein/Creatinine Ratio', 'Bacteria', 'Epithelial Cells', 'RBC',
       'SPECIMEN TYPE', 'Base Excess', 'Calculated Total CO2',
       'Chloride, Whole Blood', 'Free Calcium', 'Hematocrit, Calculated',
       'Lactate', 'pCO2', 'pO2', 'Potassium, Whole Blood',
       'Sodium, Whole Blood', 'Globulin', 'Burr Cells',
       'C-Reactive Protein', 'Fibrinogen, Functional', 'WBC', 'Yeast',
       'Ferritin', 'Iron', 'Iron Binding Capacity, Total',
       'Parathyroid Hormone', 'Transferrin', 'Hyaline Casts',
       'Renal Epithelial Cells', 'Green Top Hold (plasma)',
       'Estimated GFR (MDRD equation)', 'Phenytoin',
       'Osmolality, Measured', 'Intubated', 'Oxygen', 'Tidal Volume',
       'Ventilation Rate', 'Ventilator', 'Red Top Hold',
       'Thyroid Stimulating Hormone', 'Thyroxine (T4), Free', 'Uhold',
       'GR HOLD', 'Oxygen Saturation', 'Urine Mucous',
       'Creatine Kinase (CK)', 'Creatine Kinase, MB Isoenzyme',
       'Blue Top Hold', 'Troponin T', 'Amylase',
       'Lactate Dehydrogenase (LD)', 'Lipase', 'Cortisol', 'O2 Flow',
       'Temperature', '% Hemoglobin A1c', 'Absolute A1c',
       'Absolute Hemoglobin', 'Sedimentation Rate',
       'Anti-Neutrophil Cytoplasmic Antibody', 'Anti-Nuclear Antibody',
       'Anti-Nuclear Antibody, Titer', 'Rheumatoid Factor',
       'Acetaminophen', 'Barbiturate Screen', 'Benzodiazepine Screen',
       'Ethanol', 'Salicylate', 'Tricyclic Antidepressant Screen',
       'Alveolar-arterial Gradient', 'Amorphous Crystals',
       'Hepatitis B Surface Antibody', 'Hepatitis B Surface Antigen',
       'Hepatitis B Virus Core Antibody', 'Hepatitis C Virus Antibody',
       'Vancomycin', 'HIV Antibody', 'PEEP', 'Required O2',
       'Amphetamine Screen, Urine', 'Barbiturate Screen, Urine',
       'Benzodiazepine Screen, Urine', 'Ammonia', 'Teardrop Cells',
       'CK-MB Index', 'Cocaine, Urine', 'Methadone, Urine',
       'Opiate Screen, Urine', 'Transitional Epithelial Cells',
       'Phenytoin, Free', 'Phenytoin, Percent Free',
       'Reticulocyte Count, Automated', 'Cryoglobulin',
       'Granulocyte Count', 'Bilirubin, Direct', 'Bilirubin, Indirect',
       'Vitamin B12', 'Marijuana', 'Macrophage', 'Mesothelial Cell',
       'Polys', 'RBC, Ascites', 'WBC, Ascites', 'Haptoglobin',
       'Osmolality, Urine', 'Sodium, Urine', 'Immunoglobulin A',
       'Tissue Transglutaminase Ab, IgA', 'D-Dimer',
       'Fibrin Degradation Products', 'Hepatitis A Virus IgM Antibody',
       'Calculated Bicarbonate, Whole Blood', 'Gentamicin',
       'Nucleated Red Cells', 'Schistocytes', 'Troponin I',
       'Hepatitis A Virus Antibody', 'Urea Nitrogen, Urine',
       'Alpha-Fetoprotein', 'Gamma Glutamyltransferase', 'Target Cells',
       'NTproBNP', 'Folate', 'Human Chorionic Gonadotropin',
       'Albumin, Pleural', 'Amylase, Pleural', 'Glucose, Pleural',
       'Lactate Dehydrogenase, Pleural', 'Total Protein, Pleural',
       'Mesothelial Cells', 'Monos', 'RBC, Pleural', 'WBC, Pleural',
       'Glucose, CSF', 'Total Protein, CSF', 'Lymphs', 'RBC, CSF',
       'WBC, CSF', 'Calcium Oxalate Crystals', 'HCG, Urine, Qualitative',
       'Red Blood Cell Fragments', 'Reticulocyte Count, Manual',
       'Uric Acid', 'Leukocyte Alkaline Phosphatase', 'Spherocytes',
       'Heparin, LMW', 'Cholesterol, LDL, Measured', 'Granular Casts',
       'Triiodothyronine (T3)', 'Hepatitis B Core Antibody, IgM',
       'Basophilic Stippling', 'Eosinophil Count', 'Promyelocytes',
       'Prostate Specific Antigen', 'Urine Specimen Type', 'Acetone',
       'Large Platelets', 'WBC Clumps', 'Digoxin', 'Homocysteine',
       'Chloride, Urine', 'Potassium, Urine', 'Gastrin', 'Urine Volume',
       '24 hr Creatinine', 'Gray Top Hold (plasma)', 'Acanthocytes',
       'Anti-Smooth Muscle Antibody', 'Amylase, Ascites',
       'Glucose, Ascites', 'Lactate Dehydrogenase, Ascites',
       'Total Protein, Ascites', 'Plasma', 'Bite Cells',
       'Fragmented Cells', 'Carboxyhemoglobin', 'Albumin, Body Fluid',
       'Amylase, Body Fluid', 'Bilirubin, Total, Body Fluid',
       'Creatinine, Body Fluid', 'LD, Body Fluid',
       'Total Protein, Body Fluid', 'RBC, Other Fluid',
       'WBC, Other Fluid', 'FMC-7', 'HLA-DR', 'Immunophenotyping',
       'Kappa', 'Lambda', 'Immunoglobulin G', 'Immunoglobulin M',
       'Rapamycin', 'C4', 'Protein Electrophoresis', 'Tobramycin', 'CD10',
       'CD19', 'CD2', 'CD20', 'CD23', 'CD3', 'CD45', 'CD5', 'CD7', 'C3',
       'Light Green Top Hold', 'Quantitative G6PD', 'Absolute CD4 Count',
       'Absolute Lymphocyte Count', 'CD4 Cells, Percent',
       'Lymphocytes, Percent', 'WBC Count', 'Creatinine, Pleural',
       'Blasts', 'Blood Culture Hold', 'Absolute CD3 Count',
       'Absolute CD8 Count', 'CD3 Cells, Percent', 'CD4/CD8 Ratio',
       'CD8 Cells, Percent', 'Ammonium Biurate', 'Macrophages', 'Other',
       'Lactate Dehydrogenase, CSF', 'Calcium, Urine',
       'Joint Crystals, Number', 'RBC, Joint Fluid', 'WBC, Joint Fluid',
       'Urine Crystals, Other', 'Bicarbonate, Urine',
       'Triple Phosphate Crystals', 'Other Cell', 'Albumin, Urine',
       'Albumin/Creatinine, Urine', 'Methemoglobin', 'Carbamazepine',
       'Protein C, Functional', 'Uric Acid Crystals',
       'Prot. Electrophoresis, Urine', 'H/O Smear', 'Elliptocytes',
       'Envelope Cells', 'Blood, Occult', 'Thyroxine (T4)',
       'Inpatient Hematology/Oncology Smear', 'Sperm',
       'Carcinoembyronic Antigen (CEA)', 'Urea Nitrogen, Body Fluid',
       'Calculated TBG', 'Calculated Thyroxine (T4) Index',
       'Uptake Ratio', 'Hemoglobin C', 'Hemogloblin A', 'Hemogloblin S',
       'Immunofixation, Urine', 'Mesothelial cells', 'Immunofixation',
       'Phosphate, Urine', 'Platelet Clumps', 'Young Cells', 'Lithium',
       'Urine Casts, Other', 'Lupus Anticoagulant', 'Uric Acid, Urine',
       'Other Cells', 'Hematocrit, Joint Fluid', 'Pappenheimer Bodies',
       'Myoglobin, Urine', 'DHEA-Sulfate', 'Phenobarbital', 'Factor VII',
       'Miscellaneous, CSF', 'Prolactin', 'Hemosiderin', 'Amylase, Urine',
       'Amylase/Creatinine Ratio, Urine', 'Estradiol',
       'Testosterone, Free', 'Testosterone', 'Albumin, Ascites',
       'Glucose, Body Fluid', 'Hypersegmented Neutrophils', 'PEP, CSF',
       'Fluid Type', 'Thrombin', 'Joint Crystals, Birefringence',
       'Joint Crystals, Location', 'Joint Crystals, Shape',
       'Anti-DGP (IgA/IgG)', 'Follicle Stimulating Hormone',
       'Cyclosporin', 'Valproic Acid', 'Theophylline', 'Glucose, Urine',
       'Hematocrit, Other Fluid', 'Thyroglobulin',
       'Anti-Thyroglobulin Antibodies', 'Anti-Parietal Cell Antibody',
       'Miscellaneous, Body Fluid', 'Cancer Antigen 27.29', 'Waxy Casts',
       'Sulfonamides', 'Amylase, Joint Fluid', '24 hr Protein', 'NRBC',
       'Bilirubin, Total, Ascites', 'Creatinine, Ascites',
       'Triglycerides, Pleural', 'Creatinine Clearance',
       'Creatinine, Serum', 'Total Collection Time', 'Urine Creatinine',
       'Urine Volume, Total', 'Magnesium, Urine', 'Protein S, Antigen',
       'Porphobilinogen Screen', 'Thyroid Peroxidase Antibodies',
       'Heinz Body Prep', 'Lipase, Body Fluid', 'Pencil Cells',
       'Luteinizing Hormone', 'Estimated Actual Glucose',
       'Hematocrit, Pleural', 'Sickle Cell Preparation',
       'Howell-Jolly Bodies', 'Factor V', 'CD55', 'CD59',
       'Sugar Water Test', 'CD13', 'CD34', 'Wright Giemsa',
       'Antithrombin', 'Cholesterol, Pleural', 'CD117', 'CD11c', 'CD14',
       'CD15', 'CD16/56', 'CD33', 'CD41', 'CD56', 'CD64', 'CD71',
       'Glyco A', 'Iron Stain', 'Bicarbonate, Ascites',
       'NonSquamous Epithelial Cell', 'Plasma Cells',
       'Potassium, Ascites', 'Sodium, Ascites', '24 hr Calcium',
       'Bicarbonate, Other Fluid', 'Calcium, Body Fluid',
       'Chloride, Body Fluid', 'Magnesium, Body Fluid',
       'Potassium, Body Fluid', 'Sodium, Body Fluid', 'WBC Casts',
       'N-Acetylprocainamide (NAPA)', 'Procainamide',
       'Blood Parasite Smear', 'Anti-Mitochondrial Antibody',
       'Phosphate, Body Fluid', 'Triglycerides, Ascites',
       'Acid Phosphatase', 'Acid Phosphatase, Non-Prostatic',
       'ACID PHOSPHATASE, PROSTATIC', 'CD4', 'CD8', 'Nucleated RBC',
       'CA-125', 'Hemoglobin A2', 'Hemoglobin F', 'Double Stranded DNA',
       'Bilirubin, Total, Pleural', 'Factor VIII',
       'Von Willebrand Factor Activity', 'Von Willebrand Factor Antigen',
       'Reticulocyte, Cellular Hemoglobin', 'Cholesterol, Body Fluid',
       'Reptilase Time', 'Reptilase Time Control', 'MacroOvalocytes',
       'Osmolality, Body Fluid', 'Inhibitor Screen',
       'Cholesterol, Ascites', '<Albumin>', 'CD103', 'CD138', 'CD22',
       'CD25', 'Creatinine, Joint Fluid', 'Chloride, Ascites',
       'Osmolality, Ascites', 'Urea Nitrogen, Ascites', 'Chloride, Stool',
       'Potassium, Stool', 'Sodium, Stool', 'Bleeding Time',
       'Glucose, Joint Fluid', 'Serum Viscosity', 'Factor X',
       'LD, Joint Fluid', 'Total Protein, Joint Fluid', 'RBC Casts',
       'Billed', 'Hematocrit, CSF', 'Anticardiolipin Antibody IgG',
       'Anticardiolipin Antibody IgM', 'Cholesterol Crystals',
       'Sickle Cells', 'Protein S, Functional', 'Beta-2 Microglobulin',
       'Cellular Cast', 'Triglycer', 'Quinidine', 'Heparin', 'Factor IX',
       'Factor XI', 'Factor XII', 'Factor II', '25-OH Vitamin D',
       'Bicarbonate, Stool', 'Osmolality, Stool',
       'Reducing Substances, Urine', 'Oval Fat Body', 'Methotrexate',
       'Ethanol, Urine', 'Sex Hormone Binding Globulin',
       'Calculated Free Testosterone', 'Anti-Gliadin Antibody, IgA',
       'ADP', 'Arachadonic Acid', 'Collagen', 'Epinepherine',
       'Ristocetin', 'pCO2, Body Fluid', 'Urine Fat Bodies',
       'Lipase, Ascites', 'Reticulocyte Count, Absolute', 'CD38',
       'Alpha Antiplasmin', 'Factor XIII', 'Centromere',
       'CD16/56 Absolute Count', 'CD16/56%', 'CD3 %',
       'CD3 Absolute Count', 'CD16', 'Trichomonas', 'Protein C, Antigen',
       'RBC Clumps', 'Bilirubin, Total, CSF', 'Chloride, Pleural',
       'Bilirubin Crystals', 'AFP, Maternal Screen',
       'HCG, Maternal Screen', 'UE3, Maternal Screen', 'FRUCAMN+',
       'PLASMGN', 'Broad Casts', 'WBCP', 'Tyrosine Crystals', 'Free Fat',
       'Bicarbonate, Pleural', 'Potassium, Pleural', 'Sodium, Pleural',
       'Monocyte Count', 'pO2, Body Fluid',
       'Non-squamous Epithelial Cells', 'Miscellaneous, Ascites',
       'STDYURINE', 'Amikacin', 'Factor VIII Inhibitor',
       'Blue Top Hold Frozen', 'CD57', 'Calcium Phosphate Crystals',
       'Calcium Carbonate Crystals', 'Leucine Crystals', 'CD5 %',
       'CD5 Absolute Count', 'Epithelial Casts']

via_descriptions = ['Potassium', 'Whether the labratory value is considered abnormal or not, using pre-defined thresholds', 
                    'Sodium', 'tacroFK', 'Urea Nitrogen',
       'Hematocrit', 'Hemoglobin', 'INR(PT)', 'MCH', 'MCHC', 'MCV',
       'Platelet Count', 'PT', 'PTT', 'RDW', 'Red Blood Cells',
       'White Blood Cells', 'Anion Gap', 'Bicarbonate', 'Calcium, Total',
       'Chloride', 'Creatinine', 'Glucose', 'Magnesium', 'Phosphate',
       'Hematocrit, Ascites', 'Bilirubin', 'Blood', 'Ketone',
       'Leukocytes', 'Nitrite', 'pH', 'Protein', 'Specific Gravity',
       'Urine Appearance', 'Urine Color', 'Urobilinogen',
       'Alanine Aminotransferase (ALT)', 'Albumin',
       'Asparate Aminotransferase (AST)', 'Bilirubin, Total', 'EDTA Hold',
       'Creatinine, Urine', 'Length of Urine Collection',
       'Total Protein, Urine', 'Anisocytosis', 'Atypical Lymphocytes',
       'Bands', 'Basophils', 'Eosinophils', 'Hypochromia', 'Lymphocytes',
       'Macrocytes', 'Metamyelocytes', 'Microcytes', 'Monocytes',
       'Myelocytes', 'Neutrophils', 'Ovalocytes', 'Poikilocytosis',
       'Polychromasia', 'Alkaline Phosphatase',
       'Cholesterol Ratio (Total/HDL)', 'Cholesterol, HDL',
       'Cholesterol, LDL, Calculated', 'Cholesterol, Total',
       'Protein, Total', 'Triglycerides', 'Platelet Smear',
       'Protein/Creatinine Ratio', 'Bacteria', 'Epithelial Cells', 'RBC',
       'SPECIMEN TYPE', 'Base Excess', 'Calculated Total CO2',
       'Chloride, Whole Blood', 'Free Calcium', 'Hematocrit, Calculated',
       'Lactate', 'pCO2', 'pO2', 'Potassium, Whole Blood',
       'Sodium, Whole Blood', 'Globulin', 'Burr Cells',
       'C-Reactive Protein', 'Fibrinogen, Functional', 'WBC', 'Yeast',
       'Ferritin', 'Iron', 'Iron Binding Capacity, Total',
       'Parathyroid Hormone', 'Transferrin', 'Hyaline Casts',
       'Renal Epithelial Cells', 'Green Top Hold (plasma)',
       'Estimated GFR (MDRD equation)', 'Phenytoin',
       'Osmolality, Measured', 'Intubated', 'Oxygen', 'Tidal Volume',
       'Ventilation Rate', 'Ventilator', 'Red Top Hold',
       'Thyroid Stimulating Hormone', 'Thyroxine (T4), Free', 'Uhold',
       'GR HOLD', 'Oxygen Saturation', 'Urine Mucous',
       'Creatine Kinase (CK)', 'Creatine Kinase, MB Isoenzyme',
       'Blue Top Hold', 'Troponin T', 'Amylase',
       'Lactate Dehydrogenase (LD)', 'Lipase', 'Cortisol', 'O2 Flow',
       'Temperature', '% Hemoglobin A1c', 'Absolute A1c',
       'Absolute Hemoglobin', 'Sedimentation Rate',
       'Anti-Neutrophil Cytoplasmic Antibody', 'Anti-Nuclear Antibody',
       'Anti-Nuclear Antibody, Titer', 'Rheumatoid Factor',
       'Acetaminophen', 'Barbiturate Screen', 'Benzodiazepine Screen',
       'Ethanol', 'Salicylate', 'Tricyclic Antidepressant Screen',
       'Alveolar-arterial Gradient', 'Amorphous Crystals',
       'Hepatitis B Surface Antibody', 'Hepatitis B Surface Antigen',
       'Hepatitis B Virus Core Antibody', 'Hepatitis C Virus Antibody',
       'Vancomycin', 'HIV Antibody', 'PEEP', 'Required O2',
       'Amphetamine Screen, Urine', 'Barbiturate Screen, Urine',
       'Benzodiazepine Screen, Urine', 'Ammonia', 'Teardrop Cells',
       'CK-MB Index', 'Cocaine, Urine', 'Methadone, Urine',
       'Opiate Screen, Urine', 'Transitional Epithelial Cells',
       'Phenytoin, Free', 'Phenytoin, Percent Free',
       'Reticulocyte Count, Automated', 'Cryoglobulin',
       'Granulocyte Count', 'Bilirubin, Direct', 'Bilirubin, Indirect',
       'Vitamin B12', 'Marijuana', 'Macrophage', 'Mesothelial Cell',
       'Polys', 'RBC, Ascites', 'WBC, Ascites', 'Haptoglobin',
       'Osmolality, Urine', 'Sodium, Urine', 'Immunoglobulin A',
       'Tissue Transglutaminase Ab, IgA', 'D-Dimer',
       'Fibrin Degradation Products', 'Hepatitis A Virus IgM Antibody',
       'Calculated Bicarbonate, Whole Blood', 'Gentamicin',
       'Nucleated Red Cells', 'Schistocytes', 'Troponin I',
       'Hepatitis A Virus Antibody', 'Urea Nitrogen, Urine',
       'Alpha-Fetoprotein', 'Gamma Glutamyltransferase', 'Target Cells',
       'NTproBNP', 'Folate', 'Human Chorionic Gonadotropin',
       'Albumin, Pleural', 'Amylase, Pleural', 'Glucose, Pleural',
       'Lactate Dehydrogenase, Pleural', 'Total Protein, Pleural',
       'Mesothelial Cells', 'Monos', 'RBC, Pleural', 'WBC, Pleural',
       'Glucose, CSF', 'Total Protein, CSF', 'Lymphs', 'RBC, CSF',
       'WBC, CSF', 'Calcium Oxalate Crystals', 'HCG, Urine, Qualitative',
       'Red Blood Cell Fragments', 'Reticulocyte Count, Manual',
       'Uric Acid', 'Leukocyte Alkaline Phosphatase', 'Spherocytes',
       'Heparin, LMW', 'Cholesterol, LDL, Measured', 'Granular Casts',
       'Triiodothyronine (T3)', 'Hepatitis B Core Antibody, IgM',
       'Basophilic Stippling', 'Eosinophil Count', 'Promyelocytes',
       'Prostate Specific Antigen', 'Urine Specimen Type', 'Acetone',
       'Large Platelets', 'WBC Clumps', 'Digoxin', 'Homocysteine',
       'Chloride, Urine', 'Potassium, Urine', 'Gastrin', 'Urine Volume',
       '24 hr Creatinine', 'Gray Top Hold (plasma)', 'Acanthocytes',
       'Anti-Smooth Muscle Antibody', 'Amylase, Ascites',
       'Glucose, Ascites', 'Lactate Dehydrogenase, Ascites',
       'Total Protein, Ascites', 'Plasma', 'Bite Cells',
       'Fragmented Cells', 'Carboxyhemoglobin', 'Albumin, Body Fluid',
       'Amylase, Body Fluid', 'Bilirubin, Total, Body Fluid',
       'Creatinine, Body Fluid', 'LD, Body Fluid',
       'Total Protein, Body Fluid', 'RBC, Other Fluid',
       'WBC, Other Fluid', 'FMC-7', 'HLA-DR', 'Immunophenotyping',
       'Kappa', 'Lambda', 'Immunoglobulin G', 'Immunoglobulin M',
       'Rapamycin', 'C4', 'Protein Electrophoresis', 'Tobramycin', 'CD10',
       'CD19', 'CD2', 'CD20', 'CD23', 'CD3', 'CD45', 'CD5', 'CD7', 'C3',
       'Light Green Top Hold', 'Quantitative G6PD', 'Absolute CD4 Count',
       'Absolute Lymphocyte Count', 'CD4 Cells, Percent',
       'Lymphocytes, Percent', 'WBC Count', 'Creatinine, Pleural',
       'Blasts', 'Blood Culture Hold', 'Absolute CD3 Count',
       'Absolute CD8 Count', 'CD3 Cells, Percent', 'CD4/CD8 Ratio',
       'CD8 Cells, Percent', 'Ammonium Biurate', 'Macrophages', 'Other',
       'Lactate Dehydrogenase, CSF', 'Calcium, Urine',
       'Joint Crystals, Number', 'RBC, Joint Fluid', 'WBC, Joint Fluid',
       'Urine Crystals, Other', 'Bicarbonate, Urine',
       'Triple Phosphate Crystals', 'Other Cell', 'Albumin, Urine',
       'Albumin/Creatinine, Urine', 'Methemoglobin', 'Carbamazepine',
       'Protein C, Functional', 'Uric Acid Crystals',
       'Prot. Electrophoresis, Urine', 'H/O Smear', 'Elliptocytes',
       'Envelope Cells', 'Blood, Occult', 'Thyroxine (T4)',
       'Inpatient Hematology/Oncology Smear', 'Sperm',
       'Carcinoembyronic Antigen (CEA)', 'Urea Nitrogen, Body Fluid',
       'Calculated TBG', 'Calculated Thyroxine (T4) Index',
       'Uptake Ratio', 'Hemoglobin C', 'Hemogloblin A', 'Hemogloblin S',
       'Immunofixation, Urine', 'Mesothelial cells', 'Immunofixation',
       'Phosphate, Urine', 'Platelet Clumps', 'Young Cells', 'Lithium',
       'Urine Casts, Other', 'Lupus Anticoagulant', 'Uric Acid, Urine',
       'Other Cells', 'Hematocrit, Joint Fluid', 'Pappenheimer Bodies',
       'Myoglobin, Urine', 'DHEA-Sulfate', 'Phenobarbital', 'Factor VII',
       'Miscellaneous, CSF', 'Prolactin', 'Hemosiderin', 'Amylase, Urine',
       'Amylase/Creatinine Ratio, Urine', 'Estradiol',
       'Testosterone, Free', 'Testosterone', 'Albumin, Ascites',
       'Glucose, Body Fluid', 'Hypersegmented Neutrophils', 'PEP, CSF',
       'Fluid Type', 'Thrombin', 'Joint Crystals, Birefringence',
       'Joint Crystals, Location', 'Joint Crystals, Shape',
       'Anti-DGP (IgA/IgG)', 'Follicle Stimulating Hormone',
       'Cyclosporin', 'Valproic Acid', 'Theophylline', 'Glucose, Urine',
       'Hematocrit, Other Fluid', 'Thyroglobulin',
       'Anti-Thyroglobulin Antibodies', 'Anti-Parietal Cell Antibody',
       'Miscellaneous, Body Fluid', 'Cancer Antigen 27.29', 'Waxy Casts',
       'Sulfonamides', 'Amylase, Joint Fluid', '24 hr Protein', 'NRBC',
       'Bilirubin, Total, Ascites', 'Creatinine, Ascites',
       'Triglycerides, Pleural', 'Creatinine Clearance',
       'Creatinine, Serum', 'Total Collection Time', 'Urine Creatinine',
       'Urine Volume, Total', 'Magnesium, Urine', 'Protein S, Antigen',
       'Porphobilinogen Screen', 'Thyroid Peroxidase Antibodies',
       'Heinz Body Prep', 'Lipase, Body Fluid', 'Pencil Cells',
       'Luteinizing Hormone', 'Estimated Actual Glucose',
       'Hematocrit, Pleural', 'Sickle Cell Preparation',
       'Howell-Jolly Bodies', 'Factor V', 'CD55', 'CD59',
       'Sugar Water Test', 'CD13', 'CD34', 'Wright Giemsa',
       'Antithrombin', 'Cholesterol, Pleural', 'CD117', 'CD11c', 'CD14',
       'CD15', 'CD16/56', 'CD33', 'CD41', 'CD56', 'CD64', 'CD71',
       'Glyco A', 'Iron Stain', 'Bicarbonate, Ascites',
       'NonSquamous Epithelial Cell', 'Plasma Cells',
       'Potassium, Ascites', 'Sodium, Ascites', '24 hr Calcium',
       'Bicarbonate, Other Fluid', 'Calcium, Body Fluid',
       'Chloride, Body Fluid', 'Magnesium, Body Fluid',
       'Potassium, Body Fluid', 'Sodium, Body Fluid', 'WBC Casts',
       'N-Acetylprocainamide (NAPA)', 'Procainamide',
       'Blood Parasite Smear', 'Anti-Mitochondrial Antibody',
       'Phosphate, Body Fluid', 'Triglycerides, Ascites',
       'Acid Phosphatase', 'Acid Phosphatase, Non-Prostatic',
       'ACID PHOSPHATASE, PROSTATIC', 'CD4', 'CD8', 'Nucleated RBC',
       'CA-125', 'Hemoglobin A2', 'Hemoglobin F', 'Double Stranded DNA',
       'Bilirubin, Total, Pleural', 'Factor VIII',
       'Von Willebrand Factor Activity', 'Von Willebrand Factor Antigen',
       'Reticulocyte, Cellular Hemoglobin', 'Cholesterol, Body Fluid',
       'Reptilase Time', 'Reptilase Time Control', 'MacroOvalocytes',
       'Osmolality, Body Fluid', 'Inhibitor Screen',
       'Cholesterol, Ascites', '<Albumin>', 'CD103', 'CD138', 'CD22',
       'CD25', 'Creatinine, Joint Fluid', 'Chloride, Ascites',
       'Osmolality, Ascites', 'Urea Nitrogen, Ascites', 'Chloride, Stool',
       'Potassium, Stool', 'Sodium, Stool', 'Bleeding Time',
       'Glucose, Joint Fluid', 'Serum Viscosity', 'Factor X',
       'LD, Joint Fluid', 'Total Protein, Joint Fluid', 'RBC Casts',
       'Billed', 'Hematocrit, CSF', 'Anticardiolipin Antibody IgG',
       'Anticardiolipin Antibody IgM', 'Cholesterol Crystals',
       'Sickle Cells', 'Protein S, Functional', 'Beta-2 Microglobulin',
       'Cellular Cast', 'Triglycer', 'Quinidine', 'Heparin', 'Factor IX',
       'Factor XI', 'Factor XII', 'Factor II', '25-OH Vitamin D',
       'Bicarbonate, Stool', 'Osmolality, Stool',
       'Reducing Substances, Urine', 'Oval Fat Body', 'Methotrexate',
       'Ethanol, Urine', 'Sex Hormone Binding Globulin',
       'Calculated Free Testosterone', 'Anti-Gliadin Antibody, IgA',
       'ADP', 'Arachadonic Acid', 'Collagen', 'Epinepherine',
       'Ristocetin', 'pCO2, Body Fluid', 'Urine Fat Bodies',
       'Lipase, Ascites', 'Reticulocyte Count, Absolute', 'CD38',
       'Alpha Antiplasmin', 'Factor XIII', 'Centromere',
       'CD16/56 Absolute Count', 'CD16/56%', 'CD3 %',
       'CD3 Absolute Count', 'CD16', 'Trichomonas', 'Protein C, Antigen',
       'RBC Clumps', 'Bilirubin, Total, CSF', 'Chloride, Pleural',
       'Bilirubin Crystals', 'AFP, Maternal Screen',
       'HCG, Maternal Screen', 'UE3, Maternal Screen', 'FRUCAMN+',
       'PLASMGN', 'Broad Casts', 'WBCP', 'Tyrosine Crystals', 'Free Fat',
       'Bicarbonate, Pleural', 'Potassium, Pleural', 'Sodium, Pleural',
       'Monocyte Count', 'pO2, Body Fluid',
       'Non-squamous Epithelial Cells', 'Miscellaneous, Ascites',
       'STDYURINE', 'Amikacin', 'Factor VIII Inhibitor',
       'Blue Top Hold Frozen', 'CD57', 'Calcium Phosphate Crystals',
       'Calcium Carbonate Crystals', 'Leucine Crystals', 'CD5 %',
       'CD5 Absolute Count', 'Epithelial Casts'
]

via_lab = pd.DataFrame({
    'Original_table_name': 'LABEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_lab

Unnamed: 0,Original_table_name,Variable_name,Description
0,LABEVENTS,Potassium,Potassium
1,LABEVENTS,FLAG,Whether the labratory value is considered abno...
2,LABEVENTS,Sodium,Sodium
3,LABEVENTS,tacroFK,tacroFK
4,LABEVENTS,Urea Nitrogen,Urea Nitrogen
...,...,...,...
546,LABEVENTS,Calcium Carbonate Crystals,Calcium Carbonate Crystals
547,LABEVENTS,Leucine Crystals,Leucine Crystals
548,LABEVENTS,CD5 %,CD5 %
549,LABEVENTS,CD5 Absolute Count,CD5 Absolute Count


In [57]:
# csv 사용
via_lab.to_csv('G:/2000/MIMIC_labevents_VIA.csv', index=False)

----

# Call out

In [48]:
callout = pd.read_csv('CALLOUT.csv.gz', compression='gzip')
callout

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SUBMIT_WARDID,SUBMIT_CAREUNIT,CURR_WARDID,CURR_CAREUNIT,CALLOUT_WARDID,CALLOUT_SERVICE,REQUEST_TELE,...,CALLOUT_STATUS,CALLOUT_OUTCOME,DISCHARGE_WARDID,ACKNOWLEDGE_STATUS,CREATETIME,UPDATETIME,ACKNOWLEDGETIME,OUTCOMETIME,FIRSTRESERVATIONTIME,CURRENTRESERVATIONTIME
0,402,854,175684,52.0,,29.0,MICU,1,MED,0,...,Inactive,Discharged,29.0,Acknowledged,2146-10-05 13:16:55,2146-10-05 13:16:55,2146-10-05 13:24:00,2146-10-05 18:55:22,2146-10-05 15:27:44,
1,403,864,138624,15.0,,55.0,CSRU,55,CSURG,0,...,Inactive,Discharged,55.0,Acknowledged,2114-11-28 08:31:39,2114-11-28 09:42:08,2114-11-28 09:43:08,2114-11-28 12:10:02,,
2,404,864,138624,12.0,,55.0,CSRU,55,CSURG,1,...,Inactive,Discharged,55.0,Acknowledged,2114-11-30 10:24:25,2114-12-01 09:06:18,2114-12-01 12:26:05,2114-12-01 21:55:05,,
3,405,867,184298,7.0,,17.0,CCU,17,CCU,1,...,Inactive,Discharged,17.0,Acknowledged,2136-12-29 08:45:42,2136-12-29 10:17:16,2136-12-29 10:33:51,2136-12-29 18:10:02,,
4,157,306,167129,57.0,,3.0,SICU,44,NSURG,1,...,Inactive,Discharged,3.0,Acknowledged,2199-09-18 11:47:47,2199-09-18 11:47:47,2199-09-18 11:58:33,2199-09-18 15:10:02,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34494,32674,94046,199742,23.0,,54.0,MICU,1,MED,1,...,Inactive,Discharged,54.0,Acknowledged,2112-11-01 11:13:04,2112-11-01 11:13:04,2112-11-01 11:16:14,2112-11-01 19:42:07,2112-11-01 16:12:28,
34495,32675,94049,135809,23.0,,45.0,MICU,1,MED,0,...,Inactive,Discharged,45.0,Acknowledged,2112-10-09 14:12:09,2112-10-09 14:12:09,2112-10-09 14:23:52,2112-10-09 17:40:05,,
34496,32676,94050,173013,23.0,,45.0,MICU,1,MED,0,...,Inactive,Discharged,45.0,Acknowledged,2165-10-07 10:18:42,2165-10-07 10:18:42,2165-10-07 10:20:31,2165-10-07 19:10:11,2165-10-07 17:10:10,
34497,32677,94056,172374,50.0,,45.0,MICU,1,MED,0,...,Inactive,Discharged,45.0,Acknowledged,2199-10-30 15:35:30,2199-10-30 15:35:30,2199-10-30 16:16:09,2199-10-30 22:10:04,,


In [49]:
callout_df = callout[callout["SUBJECT_ID"].isin(la)].reset_index(drop=True)
callout_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SUBMIT_WARDID,SUBMIT_CAREUNIT,CURR_WARDID,CURR_CAREUNIT,CALLOUT_WARDID,CALLOUT_SERVICE,REQUEST_TELE,...,CALLOUT_STATUS,CALLOUT_OUTCOME,DISCHARGE_WARDID,ACKNOWLEDGE_STATUS,CREATETIME,UPDATETIME,ACKNOWLEDGETIME,OUTCOMETIME,FIRSTRESERVATIONTIME,CURRENTRESERVATIONTIME
0,402,854,175684,52.0,,29.0,MICU,1,MED,0,...,Inactive,Discharged,29.0,Acknowledged,2146-10-05 13:16:55,2146-10-05 13:16:55,2146-10-05 13:24:00,2146-10-05 18:55:22,2146-10-05 15:27:44,
1,164,322,177634,12.0,,55.0,CSRU,55,CSURG,1,...,Inactive,Discharged,55.0,Acknowledged,2135-05-04 11:04:57,2135-05-04 11:04:57,2135-05-04 11:10:33,2135-05-04 20:40:02,,
2,136,253,176189,7.0,,2.0,CCU,2,CCU,1,...,Inactive,Discharged,2.0,Acknowledged,2174-01-23 09:57:24,2174-01-23 10:44:12,2174-01-23 11:10:50,2174-01-23 13:40:02,,
3,84,140,165618,23.0,MICU,23.0,MICU,1,MED,0,...,Inactive,Cancelled,,Acknowledged,2160-09-22 09:51:16,2160-09-23 12:09:02,2160-09-22 09:59:17,2160-09-23 12:09:02,,
4,85,143,143808,33.0,,54.0,SICU,54,TSURG,0,...,Inactive,Discharged,54.0,Acknowledged,2155-01-21 08:17:03,2155-01-21 08:17:03,2155-01-21 08:28:41,2155-01-21 14:40:02,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,32864,94719,166273,50.0,,45.0,MICU,1,MED,0,...,Inactive,Discharged,45.0,Acknowledged,2173-06-13 10:43:49,2173-06-13 10:43:49,2173-06-13 11:20:14,2173-06-13 15:40:10,2173-06-13 11:55:09,
1437,34103,98697,166893,52.0,,29.0,MICU,1,MED,1,...,Inactive,Discharged,29.0,Acknowledged,2195-09-09 11:01:27,2195-09-09 11:01:27,2195-09-09 11:11:00,2195-09-09 21:27:48,2195-09-09 11:42:49,
1438,32632,93938,190498,52.0,,29.0,MICU,1,MED,0,...,Inactive,Discharged,29.0,Acknowledged,2170-02-04 12:09:00,2170-02-04 12:17:10,2170-02-04 12:17:13,2170-02-04 14:40:24,2170-02-04 12:27:05,
1439,33143,95578,142856,52.0,,24.0,MICU,1,MED,0,...,Inactive,Discharged,24.0,Acknowledged,2197-05-13 13:23:34,2197-05-13 13:24:52,2197-05-13 14:37:43,2197-05-13 19:43:54,2197-05-13 15:41:59,


In [50]:
callout_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SUBMIT_WARDID', 'SUBMIT_CAREUNIT',
       'CURR_WARDID', 'CURR_CAREUNIT', 'CALLOUT_WARDID', 'CALLOUT_SERVICE',
       'REQUEST_TELE', 'REQUEST_RESP', 'REQUEST_CDIFF', 'REQUEST_MRSA',
       'REQUEST_VRE', 'CALLOUT_STATUS', 'CALLOUT_OUTCOME', 'DISCHARGE_WARDID',
       'ACKNOWLEDGE_STATUS', 'CREATETIME', 'UPDATETIME', 'ACKNOWLEDGETIME',
       'OUTCOMETIME', 'FIRSTRESERVATIONTIME', 'CURRENTRESERVATIONTIME'],
      dtype='object')

In [52]:
# 결과 저장을 위한 리스트
rows = []


for index, row in callout_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    for col in callout_df.columns:
        if col not in ['ROW_ID', 'SUBJECT_ID', 'HADM_ID']:  # 특정 열 제외

            rows.append({
                "Primary_key": index + 1214649,
                "Variable_ID": np.nan,
                "Original_table_name": "CALLOUT",
                "Variable_name": col,
                "Record_datetime": np.nan,
                "Value": row[col],
                "Unit": np.nan,
                "Variable_type": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블 컬럼 순서 맞추기
callout_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors='raise')
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# 변수 타입 자동 추론
callout_quiq["Variable_type"] = callout_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 기준 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = callout_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 설정
callout_quiq["Is_categorical"] = callout_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
callout_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1214649,,CALLOUT,SUBMIT_WARDID,,52.0,,numeric,1.0,,,,854,175684,,,
1,1214649,,CALLOUT,SUBMIT_CAREUNIT,,,,,,,,,854,175684,,,
2,1214649,,CALLOUT,CURR_WARDID,,29.0,,numeric,0.0,,,,854,175684,,,
3,1214649,,CALLOUT,CURR_CAREUNIT,,MICU,,string,1.0,,,,854,175684,,,
4,1214649,,CALLOUT,CALLOUT_WARDID,,1,,numeric,0.0,,,,854,175684,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30256,1216089,,CALLOUT,UPDATETIME,,2189-03-09 12:45:30,,timestamp,0.0,,,,97143,122472,,,
30257,1216089,,CALLOUT,ACKNOWLEDGETIME,,2189-03-09 12:46:10,,timestamp,0.0,,,,97143,122472,,,
30258,1216089,,CALLOUT,OUTCOMETIME,,2189-03-09 17:55:08,,timestamp,0.0,,,,97143,122472,,,
30259,1216089,,CALLOUT,FIRSTRESERVATIONTIME,,2189-03-09 15:25:07,,timestamp,0.0,,,,97143,122472,,,


In [55]:
callout_quiq.head(25)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1214649,,CALLOUT,SUBMIT_WARDID,,52.0,,numeric,1.0,,,,854,175684,,,
1,1214649,,CALLOUT,SUBMIT_CAREUNIT,,,,,,,,,854,175684,,,
2,1214649,,CALLOUT,CURR_WARDID,,29.0,,numeric,0.0,,,,854,175684,,,
3,1214649,,CALLOUT,CURR_CAREUNIT,,MICU,,string,1.0,,,,854,175684,,,
4,1214649,,CALLOUT,CALLOUT_WARDID,,1,,numeric,0.0,,,,854,175684,,,
5,1214649,,CALLOUT,CALLOUT_SERVICE,,MED,,string,0.0,,,,854,175684,,,
6,1214649,,CALLOUT,REQUEST_TELE,,0,,numeric,1.0,,,,854,175684,,,
7,1214649,,CALLOUT,REQUEST_RESP,,0,,numeric,1.0,,,,854,175684,,,
8,1214649,,CALLOUT,REQUEST_CDIFF,,0,,numeric,1.0,,,,854,175684,,,
9,1214649,,CALLOUT,REQUEST_MRSA,,0,,numeric,1.0,,,,854,175684,,,


In [53]:
# 1. 각 Variable_name의 고유 Value 개수 계산
value_counts = callout_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 2. 고유값 개수가 1개인 Variable_name만 추출
single_value_vars = value_counts[value_counts == 1].index

# 3. 해당 Variable_name들의 행 중에서 variable_type이 numeric이고, Unit이 있는 경우만 필터
filtered_rows = callout_quiq[
    (callout_quiq["Variable_name"].isin(single_value_vars)) &
    (callout_quiq["Variable_type"] == "numeric") &
    (callout_quiq["Unit"].notna())
]
filtered_rows

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2


In [56]:
condition = (
    (callout_quiq["Variable_type"] == "numeric") &
    (callout_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
callout_quiq.loc[condition, "Is_categorical"] = 1
callout_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1214649,,CALLOUT,SUBMIT_WARDID,,52.0,,numeric,1.0,,,,854,175684,,,
1,1214649,,CALLOUT,SUBMIT_CAREUNIT,,,,,,,,,854,175684,,,
2,1214649,,CALLOUT,CURR_WARDID,,29.0,,numeric,1.0,,,,854,175684,,,
3,1214649,,CALLOUT,CURR_CAREUNIT,,MICU,,string,1.0,,,,854,175684,,,
4,1214649,,CALLOUT,CALLOUT_WARDID,,1,,numeric,1.0,,,,854,175684,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30256,1216089,,CALLOUT,UPDATETIME,,2189-03-09 12:45:30,,timestamp,0.0,,,,97143,122472,,,
30257,1216089,,CALLOUT,ACKNOWLEDGETIME,,2189-03-09 12:46:10,,timestamp,0.0,,,,97143,122472,,,
30258,1216089,,CALLOUT,OUTCOMETIME,,2189-03-09 17:55:08,,timestamp,0.0,,,,97143,122472,,,
30259,1216089,,CALLOUT,FIRSTRESERVATIONTIME,,2189-03-09 15:25:07,,timestamp,0.0,,,,97143,122472,,,


In [57]:
# Mapping
# 1. 매핑 룰 정의
mapping_rules = {
    'CREATETIME': ("date", np.nan),
    'UPDATETIME': ("date", np.nan),
    'ACKNOWLEDGETIME': ("date", np.nan),
    'OUTCOMETIME': ("date", np.nan),
    'FIRSTRESERVATIONTIME': ("date", np.nan),
    'CURRENTRESERVATIONTIME': ("date", np.nan),
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
callout_quiq[["Mapping_info_1", "Mapping_info_2"]] = callout_quiq.apply(map_mapping_info, axis=1)

In [58]:
callout_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1214649,,CALLOUT,SUBMIT_WARDID,,52.0,,numeric,1.0,,,,854,175684,,,
1,1214649,,CALLOUT,SUBMIT_CAREUNIT,,,,,,,,,854,175684,,,
2,1214649,,CALLOUT,CURR_WARDID,,29.0,,numeric,1.0,,,,854,175684,,,
3,1214649,,CALLOUT,CURR_CAREUNIT,,MICU,,string,1.0,,,,854,175684,,,
4,1214649,,CALLOUT,CALLOUT_WARDID,,1,,numeric,1.0,,,,854,175684,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30256,1216089,,CALLOUT,UPDATETIME,,2189-03-09 12:45:30,,timestamp,0.0,,,,97143,122472,,date,
30257,1216089,,CALLOUT,ACKNOWLEDGETIME,,2189-03-09 12:46:10,,timestamp,0.0,,,,97143,122472,,date,
30258,1216089,,CALLOUT,OUTCOMETIME,,2189-03-09 17:55:08,,timestamp,0.0,,,,97143,122472,,date,
30259,1216089,,CALLOUT,FIRSTRESERVATIONTIME,,2189-03-09 15:25:07,,timestamp,0.0,,,,97143,122472,,date,


In [59]:
callout_quiq.to_csv('G:/2000/MIMIC_callout_QUIQ.csv', index=False)

In [65]:
# VIA 테이블 생성
via_variable_names = ['SUBMIT_WARDID', 'SUBMIT_CAREUNIT',
       'CURR_WARDID', 'CURR_CAREUNIT', 'CALLOUT_WARDID', 'CALLOUT_SERVICE',
       'REQUEST_TELE', 'REQUEST_RESP', 'REQUEST_CDIFF', 'REQUEST_MRSA',
       'REQUEST_VRE', 'CALLOUT_STATUS', 'CALLOUT_OUTCOME', 'DISCHARGE_WARDID',
       'ACKNOWLEDGE_STATUS', 'CREATETIME', 'UPDATETIME', 'ACKNOWLEDGETIME',
       'OUTCOMETIME', 'FIRSTRESERVATIONTIME', 'CURRENTRESERVATIONTIME']
via_descriptions = ["the ward from which the request was submitted", 
                    "indicates whether the SUBMIT_WARDID corresponds to an ICU cost center, and if so, what type of ICU cost center", 
                    "the ward in which the patient resides when called out (i.e. prior to discharge/transfer)",
                    "indicates which ICU cost center the CURR_WARDID corresponds to (note: since all patient are being discharged from an ICU, all patients should reside in an ICU cost center)", 
                    "the ward to which the patients should be discharged. CALLOUT_WARDID=0 represents 'Home' and CALLOUT_WARDID=1 represents 'First available ward'.",
                    "service under which the patients should be discharged", 
                    "Request for telemetry monitoring", 
                    "Request for respiratory precautions", 
                    "Request for C. difficile precautions", 
                    "Request for MRSA precautions", 
                    "Request for VRE precautions", 
                    "the call out is still active or not; if a call out is answered it should be flagged as inactive", 
                    "the patient finally called out(i.e. discharged, cancelled)or not", 
                    "the ward to which the patient was actually discharged. DISCHARGE_WARDID = 0 indicates home and other values correspond to dinstinct wards in the hospital", 
                    "the response to the callout event: Acknowledged, Revised, Unacknowledged or Reactivated", 
                    "Time the callout was created", 
                    "Time the callout was last updated", 
                    "Time the callout was acknowledged", 
                    "Time the outcome of the callout was recorded", 
                    "Time of the first reservation for transfer", 
                    "Time of the current reservation for transfer"

]

via_callout = pd.DataFrame({
    'Original_table_name': 'CALLOUT',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_callout

Unnamed: 0,Original_table_name,Variable_name,Description
0,CALLOUT,SUBMIT_WARDID,the ward from which the request was submitted
1,CALLOUT,SUBMIT_CAREUNIT,indicates whether the SUBMIT_WARDID correspond...
2,CALLOUT,CURR_WARDID,the ward in which the patient resides when cal...
3,CALLOUT,CURR_CAREUNIT,indicates which ICU cost center the CURR_WARDI...
4,CALLOUT,CALLOUT_WARDID,the ward to which the patients should be disch...
5,CALLOUT,CALLOUT_SERVICE,service under which the patients should be dis...
6,CALLOUT,REQUEST_TELE,Request for telemetry monitoring
7,CALLOUT,REQUEST_RESP,Request for respiratory precautions
8,CALLOUT,REQUEST_CDIFF,Request for C. difficile precautions
9,CALLOUT,REQUEST_MRSA,Request for MRSA precautions


In [67]:
via_callout.to_csv('G:/2000/MIMIC_callout_VIA.csv', index=False)

# Services<br>

Lists services that a patient was admitted/transferred under

In [60]:
services = pd.read_csv('SERVICES.csv.gz', compression='gzip')
services

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,TRANSFERTIME,PREV_SERVICE,CURR_SERVICE
0,758,471,135879,2122-07-22 14:07:27,TSURG,MED
1,759,471,135879,2122-07-26 18:31:49,MED,TSURG
2,760,472,173064,2172-09-28 19:22:15,,CMED
3,761,473,129194,2201-01-09 20:16:45,,NB
4,762,474,194246,2181-03-23 08:24:41,,NB
...,...,...,...,...,...,...
73338,72914,98932,174244,2176-08-13 20:28:00,,CMED
73339,72915,98939,115549,2166-11-15 01:21:49,,NMED
73340,72916,98941,141129,2118-02-08 01:52:28,,CSURG
73341,72917,98943,193747,2164-11-14 20:04:12,,TRAUM


- CMED	Cardiac Medical - for non-surgical cardiac related admissions
- CSURG	Cardiac Surgery - for surgical cardiac admissions
- DENT	Dental - for dental/jaw related admissions
- ENT	Ear, nose, and throat - conditions primarily affecting these areas
- GU	Genitourinary - reproductive organs/urinary system
- GYN	Gynecological - female reproductive systems and breasts
- MED	Medical - general service for internal medicine
- NB	Newborn - infants born at the hospital
- NBB	Newborn baby - infants born at the hospital
- NMED	Neurologic Medical - non-surgical, relating to the brain
- NSURG	Neurologic Surgical - surgical, relating to the brain
- OBS	Obstetrics - conerned with childbirth and the care of women giving birth
- ORTHO	Orthopaedic - surgical, relating to the musculoskeletal system
- OMED	Oncologic Medical - non-surgical, relating to cancer
- PSURG	Plastic - restortation/reconstruction of the human body (including cosmetic or aesthetic)
- PSYCH	Psychiatric - mental disorders relating to mood, behaviour, cognition, or perceptions
- SURG	Surgical - general surgical service not classified elsewhere
- TRAUM	Trauma - injury or damage caused by physical harm from an external source
- TSURG	Thoracic Surgical - surgery on the thorax, located between the neck and the abdomen
- VSURG	Vascular Surgical - surgery relating to the circulatory system

In [62]:
services_df = services[services["SUBJECT_ID"].isin(la)].reset_index(drop=True)
services_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,TRANSFERTIME,PREV_SERVICE,CURR_SERVICE
0,761,473,129194,2201-01-09 20:16:45,,NB
1,791,501,190462,2190-01-28 02:05:14,,MED
2,807,511,136962,2166-11-17 19:58:34,,CMED
3,808,511,136962,2166-11-18 14:00:05,CMED,CSURG
4,878,556,162724,2197-07-20 10:19:44,,NB
...,...,...,...,...,...,...
3181,70756,93648,173303,2170-09-23 10:09:53,SURG,MED
3182,70757,93648,173303,2170-09-23 10:11:20,MED,SURG
3183,70759,93653,156750,2133-11-11 00:42:01,,ENT
3184,72841,98748,122488,2166-12-31 02:35:20,,CSURG


In [63]:
services_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'TRANSFERTIME', 'PREV_SERVICE',
       'CURR_SERVICE'],
      dtype='object')

In [64]:
# 결과 저장을 위한 리스트
rows = []

for index, row in services_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    for col in services_df.columns:
        if col not in ["SUBJECT_ID", "HADM_ID", "ROW_ID"]:  # 특정 열 제외
            
            rows.append({
                "Primary_key": index + 1216090,  # 원본 테이블의 각 row 마다 부여
                "Variable_ID": np.nan,
                "Original_table_name": "SERVICES",  # 테이블 이름
                "Variable_name": col,  # 컬럼명이 변수명
                "Record_datetime": np.nan,
                "Value": row[col],  # 해당 변수 값
                "Unit": np.nan,
                "Variable_type": np.nan,  # 추후 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id, 
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan, 
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블 컬럼 순서 맞추기
service_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, bool):
        return "boolean"
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        # 문자열인데 datetime처럼 보이는 경우 처리
        try:
            parsed = pd.to_datetime(val, errors='raise')
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

service_quiq["Variable_type"] = service_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 설정
CATEGORICAL_THRESHOLD = 21

# 각 Variable_name 별 고유값 수 계산 (NaN 제외)
value_counts = service_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기
service_quiq["Is_categorical"] = service_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
service_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1216090,,SERVICES,TRANSFERTIME,,2201-01-09 20:16:45,,timestamp,0.0,,,,473,129194,,,
1,1216090,,SERVICES,PREV_SERVICE,,,,,,,,,473,129194,,,
2,1216090,,SERVICES,CURR_SERVICE,,NB,,string,1.0,,,,473,129194,,,
3,1216091,,SERVICES,TRANSFERTIME,,2190-01-28 02:05:14,,timestamp,0.0,,,,501,190462,,,
4,1216091,,SERVICES,PREV_SERVICE,,,,,,,,,501,190462,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9553,1219274,,SERVICES,PREV_SERVICE,,,,,,,,,98748,122488,,,
9554,1219274,,SERVICES,CURR_SERVICE,,CSURG,,string,1.0,,,,98748,122488,,,
9555,1219275,,SERVICES,TRANSFERTIME,,2118-02-08 01:52:28,,timestamp,0.0,,,,98941,141129,,,
9556,1219275,,SERVICES,PREV_SERVICE,,,,,,,,,98941,141129,,,


In [65]:
service_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1216090,,SERVICES,TRANSFERTIME,,2201-01-09 20:16:45,,timestamp,0.0,,,,473,129194,,,
1,1216090,,SERVICES,PREV_SERVICE,,,,,,,,,473,129194,,,
2,1216090,,SERVICES,CURR_SERVICE,,NB,,string,1.0,,,,473,129194,,,
3,1216091,,SERVICES,TRANSFERTIME,,2190-01-28 02:05:14,,timestamp,0.0,,,,501,190462,,,
4,1216091,,SERVICES,PREV_SERVICE,,,,,,,,,501,190462,,,
5,1216091,,SERVICES,CURR_SERVICE,,MED,,string,1.0,,,,501,190462,,,
6,1216092,,SERVICES,TRANSFERTIME,,2166-11-17 19:58:34,,timestamp,0.0,,,,511,136962,,,
7,1216092,,SERVICES,PREV_SERVICE,,,,,,,,,511,136962,,,
8,1216092,,SERVICES,CURR_SERVICE,,CMED,,string,1.0,,,,511,136962,,,
9,1216093,,SERVICES,TRANSFERTIME,,2166-11-18 14:00:05,,timestamp,0.0,,,,511,136962,,,


In [66]:
# Mapping
mapping_rules = {
    'TRANSFERTIME': ("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
service_quiq[["Mapping_info_1", "Mapping_info_2"]] = service_quiq.apply(map_mapping_info, axis=1)

In [67]:
service_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1216090,,SERVICES,TRANSFERTIME,,2201-01-09 20:16:45,,timestamp,0.0,,,,473,129194,,date,
1,1216090,,SERVICES,PREV_SERVICE,,,,,,,,,473,129194,,,
2,1216090,,SERVICES,CURR_SERVICE,,NB,,string,1.0,,,,473,129194,,,
3,1216091,,SERVICES,TRANSFERTIME,,2190-01-28 02:05:14,,timestamp,0.0,,,,501,190462,,date,
4,1216091,,SERVICES,PREV_SERVICE,,,,,,,,,501,190462,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9553,1219274,,SERVICES,PREV_SERVICE,,,,,,,,,98748,122488,,,
9554,1219274,,SERVICES,CURR_SERVICE,,CSURG,,string,1.0,,,,98748,122488,,,
9555,1219275,,SERVICES,TRANSFERTIME,,2118-02-08 01:52:28,,timestamp,0.0,,,,98941,141129,,date,
9556,1219275,,SERVICES,PREV_SERVICE,,,,,,,,,98941,141129,,,


In [75]:
service_quiq.to_csv('G:/2000/MIMIC_service_QUIQ.csv', index=False)

In [76]:
# VIA 테이블 생성
via_variable_names = ['TRANSFERTIME', 'PREV_SERVICE', 'CURR_SERVICE']
via_descriptions = ["the time at shich the patient moved from the PREV_SERVICE (if present) to the CURR_SERVICE",
                    "previous service that the patient resides under",
                    "current service that the patient resides under"

]

via_service = pd.DataFrame({
    'Original_table_name': 'SERVICE',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_service

Unnamed: 0,Original_table_name,Variable_name,Description
0,SERVICE,TRANSFERTIME,the time at shich the patient moved from the P...
1,SERVICE,PREV_SERVICE,previous service that the patient resides under
2,SERVICE,CURR_SERVICE,current service that the patient resides under


In [77]:
via_service.to_csv('G:/2000/MIMIC_service_VIA.csv', index=False)

# Transfers

In [68]:
transfers = pd.read_csv('TRANSFERS.csv.gz', compression='gzip')
transfers

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,EVENTTYPE,PREV_CAREUNIT,CURR_CAREUNIT,PREV_WARDID,CURR_WARDID,INTIME,OUTTIME,LOS
0,657,111,192123,254245.0,carevue,transfer,CCU,MICU,7.0,23.0,2142-04-29 15:27:11,2142-05-04 20:38:33,125.19
1,658,111,192123,,carevue,transfer,MICU,,23.0,45.0,2142-05-04 20:38:33,2142-05-05 11:46:32,15.13
2,659,111,192123,,carevue,discharge,,,45.0,,2142-05-05 11:46:32,,
3,660,111,155897,249202.0,metavision,admit,,MICU,,52.0,2144-07-01 04:13:59,2144-07-01 05:19:39,1.09
4,661,111,155897,,metavision,transfer,MICU,,52.0,32.0,2144-07-01 05:19:39,2144-07-01 06:28:29,1.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
261892,259671,98385,195599,,metavision,transfer,,,36.0,49.0,2108-10-06 11:27:11,2108-10-06 13:05:57,1.65
261893,259672,98385,195599,292167.0,metavision,transfer,,SICU,49.0,33.0,2108-10-06 13:05:57,2108-10-11 17:00:31,123.91
261894,259673,98385,195599,,metavision,discharge,SICU,,33.0,,2108-10-11 17:00:31,,
261895,259674,98389,155368,,metavision,admit,,,,29.0,2153-10-14 22:12:58,2153-10-14 22:21:06,0.14


In [69]:
transfer_df = transfers[transfers["SUBJECT_ID"].isin(la)].reset_index(drop=True)
transfer_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,EVENTTYPE,PREV_CAREUNIT,CURR_CAREUNIT,PREV_WARDID,CURR_WARDID,INTIME,OUTTIME,LOS
0,699,118,147035,237541.0,carevue,admit,,NICU,,56.0,2103-08-11 14:20:55,2103-08-11 15:57:15,1.61
1,700,118,147035,237541.0,carevue,transfer,NICU,NICU,56.0,56.0,2103-08-11 15:57:15,2103-08-11 17:06:30,1.15
2,701,118,147035,,carevue,transfer,NICU,NWARD,56.0,43.0,2103-08-11 17:06:30,2103-08-13 13:53:07,44.78
3,702,118,147035,,carevue,discharge,NWARD,,43.0,,2103-08-13 13:53:07,,
4,223,46,144073,268016.0,carevue,admit,,MICU,,52.0,2133-02-20 18:22:30,2133-02-21 12:49:55,18.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11462,258700,97670,130598,,metavision,transfer,,,4.0,4.0,2120-12-02 15:51:12,2120-12-04 12:05:21,44.24
11463,258701,97670,130598,,metavision,discharge,,,4.0,,2120-12-04 12:05:21,,
11464,258702,97672,196856,290553.0,metavision,admit,,CCU,,7.0,2195-02-13 21:12:21,2195-02-14 18:04:52,20.88
11465,258703,97672,196856,,metavision,transfer,CCU,,7.0,55.0,2195-02-14 18:04:52,2195-02-19 15:26:36,117.36


In [70]:
transfer_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'DBSOURCE',
       'EVENTTYPE', 'PREV_CAREUNIT', 'CURR_CAREUNIT', 'PREV_WARDID',
       'CURR_WARDID', 'INTIME', 'OUTTIME', 'LOS'],
      dtype='object')

In [71]:
# 결과 저장을 위한 리스트
rows = []

for index, row in transfer_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    for col in transfer_df.columns:
        if col not in ['ROW_ID', 'SUBJECT_ID', 'HADM_ID']:  # 특정 열 제외

            event_date_val = row["INTIME"] if col == "CURR_CAREUNIT" and "INTIME" in transfer_df.columns else np.nan

            rows.append({
                "Primary_key": index + 1219276,
                "Variable_ID": np.nan,
                "Original_table_name": "TRANSFERS",
                "Variable_name": col,
                "Record_datetime": np.nan,
                "Value": row[col],
                "Unit": np.nan,
                "Variable_type": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan,
                "Event_date": event_date_val  
            })


# QUIQ 테이블 컬럼 순서 맞추기
transfers_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors='raise')
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# 변수 타입 자동 추론
transfers_quiq["Variable_type"] = transfers_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 기준 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = transfers_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 설정
transfers_quiq["Is_categorical"] = transfers_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
transfers_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1219276,,TRANSFERS,ICUSTAY_ID,,237541.0,,numeric,0.0,,,,118,147035,,,
1,1219276,,TRANSFERS,DBSOURCE,,carevue,,string,1.0,,,,118,147035,,,
2,1219276,,TRANSFERS,EVENTTYPE,,admit,,string,1.0,,,,118,147035,,,
3,1219276,,TRANSFERS,PREV_CAREUNIT,,,,,,,,,118,147035,,,
4,1219276,,TRANSFERS,CURR_CAREUNIT,2103-08-11 14:20:55,NICU,,string,1.0,,,,118,147035,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114665,1230742,,TRANSFERS,PREV_WARDID,,55.0,,numeric,0.0,,,,97672,196856,,,
114666,1230742,,TRANSFERS,CURR_WARDID,,,,,,,,,97672,196856,,,
114667,1230742,,TRANSFERS,INTIME,,2195-02-19 15:26:36,,timestamp,0.0,,,,97672,196856,,,
114668,1230742,,TRANSFERS,OUTTIME,,,,,,,,,97672,196856,,,


In [72]:
transfers_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1219276,,TRANSFERS,ICUSTAY_ID,,237541.0,,numeric,0.0,,,,118,147035,,,
1,1219276,,TRANSFERS,DBSOURCE,,carevue,,string,1.0,,,,118,147035,,,
2,1219276,,TRANSFERS,EVENTTYPE,,admit,,string,1.0,,,,118,147035,,,
3,1219276,,TRANSFERS,PREV_CAREUNIT,,,,,,,,,118,147035,,,
4,1219276,,TRANSFERS,CURR_CAREUNIT,2103-08-11 14:20:55,NICU,,string,1.0,,,,118,147035,,,
5,1219276,,TRANSFERS,PREV_WARDID,,,,,,,,,118,147035,,,
6,1219276,,TRANSFERS,CURR_WARDID,,56.0,,numeric,0.0,,,,118,147035,,,
7,1219276,,TRANSFERS,INTIME,,2103-08-11 14:20:55,,timestamp,0.0,,,,118,147035,,,
8,1219276,,TRANSFERS,OUTTIME,,2103-08-11 15:57:15,,timestamp,0.0,,,,118,147035,,,
9,1219276,,TRANSFERS,LOS,,1.61,,numeric,0.0,,,,118,147035,,,


In [73]:
condition = (
    (transfers_quiq["Variable_type"] == "numeric") &
    (transfers_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
transfers_quiq.loc[condition, "Is_categorical"] = 1
transfers_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1219276,,TRANSFERS,ICUSTAY_ID,,237541.0,,numeric,1.0,,,,118,147035,,,
1,1219276,,TRANSFERS,DBSOURCE,,carevue,,string,1.0,,,,118,147035,,,
2,1219276,,TRANSFERS,EVENTTYPE,,admit,,string,1.0,,,,118,147035,,,
3,1219276,,TRANSFERS,PREV_CAREUNIT,,,,,,,,,118,147035,,,
4,1219276,,TRANSFERS,CURR_CAREUNIT,2103-08-11 14:20:55,NICU,,string,1.0,,,,118,147035,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114665,1230742,,TRANSFERS,PREV_WARDID,,55.0,,numeric,1.0,,,,97672,196856,,,
114666,1230742,,TRANSFERS,CURR_WARDID,,,,,,,,,97672,196856,,,
114667,1230742,,TRANSFERS,INTIME,,2195-02-19 15:26:36,,timestamp,0.0,,,,97672,196856,,,
114668,1230742,,TRANSFERS,OUTTIME,,,,,,,,,97672,196856,,,


In [74]:
# Mapping
mapping_rules = {
    'INTIME':("date", np.nan), 
    'OUTTIME':("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
transfers_quiq[["Mapping_info_1", "Mapping_info_2"]] = transfers_quiq.apply(map_mapping_info, axis=1)

In [75]:
transfers_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1219276,,TRANSFERS,ICUSTAY_ID,,237541.0,,numeric,1.0,,,,118,147035,,,
1,1219276,,TRANSFERS,DBSOURCE,,carevue,,string,1.0,,,,118,147035,,,
2,1219276,,TRANSFERS,EVENTTYPE,,admit,,string,1.0,,,,118,147035,,,
3,1219276,,TRANSFERS,PREV_CAREUNIT,,,,,,,,,118,147035,,,
4,1219276,,TRANSFERS,CURR_CAREUNIT,2103-08-11 14:20:55,NICU,,string,1.0,,,,118,147035,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114665,1230742,,TRANSFERS,PREV_WARDID,,55.0,,numeric,1.0,,,,97672,196856,,,
114666,1230742,,TRANSFERS,CURR_WARDID,,,,,,,,,97672,196856,,,
114667,1230742,,TRANSFERS,INTIME,,2195-02-19 15:26:36,,timestamp,0.0,,,,97672,196856,,date,
114668,1230742,,TRANSFERS,OUTTIME,,,,,,,,,97672,196856,,,


In [76]:
transfers_quiq.to_csv('G:/2000/MIMIC_transfers_QUIQ.csv', index=False)

In [85]:
# VIA 테이블 생성
via_variable_names = ['DBSOURCE',
       'EVENTTYPE', 'PREV_CAREUNIT', 'CURR_CAREUNIT', 'PREV_WARDID',
       'CURR_WARDID', 'INTIME', 'OUTTIME', 'LOS']
via_descriptions = ["the original ICU database the data was source from",
                    "what transfer event occurred: 'admit' for an admission, 'transfer' for an intra-hospital transfer and 'discharge' for a discharge from the hospital",
                    "the care unit in which the patient previously resided",
                    "the care unit in which the patient currently resides",
                    "the previous ward in which the patient stayed",
                    "the current ward in which the patient stayed",
                    "the date and time the patient was transferred into the current care unit from the previous care unit",
                    "the date and time the patient was transferred out of the current care unit",
                    "the length of stay for the patient for the given ward stay, which may be within or outside of the ICU"      

]

via_transfers = pd.DataFrame({
    'Original_table_name': 'TRANSFERS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_transfers

Unnamed: 0,Original_table_name,Variable_name,Description
0,TRANSFERS,DBSOURCE,the original ICU database the data was source ...
1,TRANSFERS,EVENTTYPE,what transfer event occurred: 'admit' for an a...
2,TRANSFERS,PREV_CAREUNIT,the care unit in which the patient previously ...
3,TRANSFERS,CURR_CAREUNIT,the care unit in which the patient currently r...
4,TRANSFERS,PREV_WARDID,the previous ward in which the patient stayed
5,TRANSFERS,CURR_WARDID,the current ward in which the patient stayed
6,TRANSFERS,INTIME,the date and time the patient was transferred ...
7,TRANSFERS,OUTTIME,the date and time the patient was transferred ...
8,TRANSFERS,LOS,the length of stay for the patient for the giv...


In [86]:
via_transfers.to_csv('G:/2000/MIMIC_transfers_VIA.csv', index=False)

# CPTevents

In [77]:
cptevents = pd.read_csv('CPTEVENTS.csv.gz', compression='gzip')
cptevents

  cptevents = pd.read_csv('CPTEVENTS.csv.gz', compression='gzip')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,COSTCENTER,CHARTDATE,CPT_CD,CPT_NUMBER,CPT_SUFFIX,TICKET_ID_SEQ,SECTIONHEADER,SUBSECTIONHEADER,DESCRIPTION
0,317,11743,129545,ICU,,99232,99232.0,,6.0,Evaluation and management,Hospital inpatient services,
1,318,11743,129545,ICU,,99232,99232.0,,7.0,Evaluation and management,Hospital inpatient services,
2,319,11743,129545,ICU,,99232,99232.0,,8.0,Evaluation and management,Hospital inpatient services,
3,320,11743,129545,ICU,,99232,99232.0,,9.0,Evaluation and management,Hospital inpatient services,
4,321,6185,183725,ICU,,99223,99223.0,,1.0,Evaluation and management,Hospital inpatient services,
...,...,...,...,...,...,...,...,...,...,...,...,...
573141,573142,78876,163404,Resp,2105-09-01 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
573142,573143,78879,136071,Resp,2150-08-29 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
573143,573144,78879,136071,Resp,2150-08-28 00:00:00,94002,94002.0,,,Medicine,Pulmonary,"VENT MGMT, 1ST DAY (INVASIVE)"
573144,573145,78892,175171,Resp,2125-06-11 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)


In [78]:
cptevents_df = cptevents[cptevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
cptevents_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,COSTCENTER,CHARTDATE,CPT_CD,CPT_NUMBER,CPT_SUFFIX,TICKET_ID_SEQ,SECTIONHEADER,SUBSECTIONHEADER,DESCRIPTION
0,8,9882,107530,ICU,,99232,99232.0,,10.0,Evaluation and management,Hospital inpatient services,
1,9,9882,107530,ICU,,99232,99232.0,,11.0,Evaluation and management,Hospital inpatient services,
2,10,9882,107530,ICU,,99232,99232.0,,12.0,Evaluation and management,Hospital inpatient services,
3,11,9882,107530,ICU,,99232,99232.0,,13.0,Evaluation and management,Hospital inpatient services,
4,12,9882,107530,ICU,,32000,32000.0,,14.0,Surgery,Respiratory system,
...,...,...,...,...,...,...,...,...,...,...,...,...
23984,572913,88953,128732,Resp,2184-07-24 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
23985,572914,88953,128732,Resp,2184-07-25 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
23986,572915,88953,128732,Resp,2184-07-26 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)
23987,572916,88953,128732,Resp,2184-07-27 00:00:00,94003,94003.0,,,Medicine,Pulmonary,VENT MGMT;SUBSQ DAYS(INVASIVE)


In [79]:
cptevents_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'COSTCENTER', 'CHARTDATE', 'CPT_CD',
       'CPT_NUMBER', 'CPT_SUFFIX', 'TICKET_ID_SEQ', 'SECTIONHEADER',
       'SUBSECTIONHEADER', 'DESCRIPTION'],
      dtype='object')

In [80]:
# 결과 저장을 위한 리스트
rows = []

for index, row in cptevents_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    for col in cptevents_df.columns:
        # 'CHARTDATE'는 완전히 제외하여 Variable_name에 들어가지 않도록 함
        if col not in ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE']:

            # CPT_CD인 경우에만 CHARTDATE를 Event_date로 넣음
            event_date_val = row["CHARTDATE"] if col == "CPT_CD" else np.nan

            rows.append({
                "Primary_key": index + 1230743,
                "Variable_ID": np.nan,
                "Original_table_name": "CPTEVENTS",
                "Variable_name": col,
                "Record_datetime": np.nan,
                "Value": row[col],
                "Unit": np.nan,
                "Variable_type": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan,
                "Event_date": event_date_val
            })


# QUIQ 테이블 컬럼 순서 맞추기
cptevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors='raise')
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# 변수 타입 자동 추론
cptevents_quiq["Variable_type"] = cptevents_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 기준 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = cptevents_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 설정
cptevents_quiq["Is_categorical"] = cptevents_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
cptevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1230743,,CPTEVENTS,COSTCENTER,,ICU,,string,1.0,,,,9882,107530,,,
1,1230743,,CPTEVENTS,CPT_CD,,99232,,numeric,0.0,,,,9882,107530,,,
2,1230743,,CPTEVENTS,CPT_NUMBER,,99232.0,,numeric,0.0,,,,9882,107530,,,
3,1230743,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,9882,107530,,,
4,1230743,,CPTEVENTS,TICKET_ID_SEQ,,10.0,,numeric,0.0,,,,9882,107530,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191907,1254731,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,89040,190532,,,
191908,1254731,,CPTEVENTS,TICKET_ID_SEQ,,,,,,,,,89040,190532,,,
191909,1254731,,CPTEVENTS,SECTIONHEADER,,Medicine,,string,1.0,,,,89040,190532,,,
191910,1254731,,CPTEVENTS,SUBSECTIONHEADER,,Pulmonary,,string,0.0,,,,89040,190532,,,


In [81]:
cptevents_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1230743,,CPTEVENTS,COSTCENTER,,ICU,,string,1.0,,,,9882,107530,,,
1,1230743,,CPTEVENTS,CPT_CD,,99232,,numeric,0.0,,,,9882,107530,,,
2,1230743,,CPTEVENTS,CPT_NUMBER,,99232.0,,numeric,0.0,,,,9882,107530,,,
3,1230743,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,9882,107530,,,
4,1230743,,CPTEVENTS,TICKET_ID_SEQ,,10.0,,numeric,0.0,,,,9882,107530,,,
5,1230743,,CPTEVENTS,SECTIONHEADER,,Evaluation and management,,string,1.0,,,,9882,107530,,,
6,1230743,,CPTEVENTS,SUBSECTIONHEADER,,Hospital inpatient services,,string,0.0,,,,9882,107530,,,
7,1230743,,CPTEVENTS,DESCRIPTION,,,,,,,,,9882,107530,,,
8,1230744,,CPTEVENTS,COSTCENTER,,ICU,,string,1.0,,,,9882,107530,,,
9,1230744,,CPTEVENTS,CPT_CD,,99232,,numeric,0.0,,,,9882,107530,,,


In [82]:
condition = (
    (cptevents_quiq["Variable_type"] == "numeric") &
    (cptevents_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
cptevents_quiq.loc[condition, "Is_categorical"] = 1
cptevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1230743,,CPTEVENTS,COSTCENTER,,ICU,,string,1.0,,,,9882,107530,,,
1,1230743,,CPTEVENTS,CPT_CD,,99232,,numeric,1.0,,,,9882,107530,,,
2,1230743,,CPTEVENTS,CPT_NUMBER,,99232.0,,numeric,1.0,,,,9882,107530,,,
3,1230743,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,9882,107530,,,
4,1230743,,CPTEVENTS,TICKET_ID_SEQ,,10.0,,numeric,1.0,,,,9882,107530,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191907,1254731,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,89040,190532,,,
191908,1254731,,CPTEVENTS,TICKET_ID_SEQ,,,,,,,,,89040,190532,,,
191909,1254731,,CPTEVENTS,SECTIONHEADER,,Medicine,,string,1.0,,,,89040,190532,,,
191910,1254731,,CPTEVENTS,SUBSECTIONHEADER,,Pulmonary,,string,0.0,,,,89040,190532,,,


In [83]:
condition = (
    (cptevents_quiq["Variable_name"] == "SUBSECTIONHEADER") &
    (cptevents_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
cptevents_quiq.loc[condition, "Is_categorical"] = 1
cptevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1230743,,CPTEVENTS,COSTCENTER,,ICU,,string,1.0,,,,9882,107530,,,
1,1230743,,CPTEVENTS,CPT_CD,,99232,,numeric,1.0,,,,9882,107530,,,
2,1230743,,CPTEVENTS,CPT_NUMBER,,99232.0,,numeric,1.0,,,,9882,107530,,,
3,1230743,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,9882,107530,,,
4,1230743,,CPTEVENTS,TICKET_ID_SEQ,,10.0,,numeric,1.0,,,,9882,107530,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191907,1254731,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,89040,190532,,,
191908,1254731,,CPTEVENTS,TICKET_ID_SEQ,,,,,,,,,89040,190532,,,
191909,1254731,,CPTEVENTS,SECTIONHEADER,,Medicine,,string,1.0,,,,89040,190532,,,
191910,1254731,,CPTEVENTS,SUBSECTIONHEADER,,Pulmonary,,string,1.0,,,,89040,190532,,,


In [84]:
# Mapping
mapping_rules = {
    'CPT_CD':("medical_code", np.nan), 
    'CHARTDATE':("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
cptevents_quiq[["Mapping_info_1", "Mapping_info_2"]] = cptevents_quiq.apply(map_mapping_info, axis=1)

In [85]:
cptevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1230743,,CPTEVENTS,COSTCENTER,,ICU,,string,1.0,,,,9882,107530,,,
1,1230743,,CPTEVENTS,CPT_CD,,99232,,numeric,1.0,,,,9882,107530,,medical_code,
2,1230743,,CPTEVENTS,CPT_NUMBER,,99232.0,,numeric,1.0,,,,9882,107530,,,
3,1230743,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,9882,107530,,,
4,1230743,,CPTEVENTS,TICKET_ID_SEQ,,10.0,,numeric,1.0,,,,9882,107530,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191907,1254731,,CPTEVENTS,CPT_SUFFIX,,,,,,,,,89040,190532,,,
191908,1254731,,CPTEVENTS,TICKET_ID_SEQ,,,,,,,,,89040,190532,,,
191909,1254731,,CPTEVENTS,SECTIONHEADER,,Medicine,,string,1.0,,,,89040,190532,,,
191910,1254731,,CPTEVENTS,SUBSECTIONHEADER,,Pulmonary,,string,1.0,,,,89040,190532,,,


In [86]:
cptevents_quiq.to_csv('g:/2000/MIMIC_cptevents_QUIQ.csv', index=False)

In [95]:
# VIA 테이블 생성
via_variable_names = ['COSTCENTER', 'CPT_CD',
       'CPT_NUMBER', 'CPT_SUFFIX', 'TICKET_ID_SEQ', 'SECTIONHEADER',
       'SUBSECTIONHEADER', 'DESCRIPTION']

via_descriptions = ['cost center which billed for the corresponding CPT codes. two possible cost centers: Resp, ICU',
                    'the original CPT code',
                    'numeric version of the CPT_CD column which allows for easier range comparisons in querying',
                    'the text suffix when the CPT_CD contains non-numeric characters',
                    'the order of the CPT_CD',
                    'provide a category for the given CPT code',
                    'assigned using the D_CPT table',
                    'provides information about the meaning of the CPT code'


]
via_cptevents = pd.DataFrame({
    'Original_table_name': 'CPTEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_cptevents

Unnamed: 0,Original_table_name,Variable_name,Description
0,CPTEVENTS,COSTCENTER,cost center which billed for the corresponding...
1,CPTEVENTS,CPT_CD,the original CPT code
2,CPTEVENTS,CPT_NUMBER,numeric version of the CPT_CD column which all...
3,CPTEVENTS,CPT_SUFFIX,the text suffix when the CPT_CD contains non-n...
4,CPTEVENTS,TICKET_ID_SEQ,the order of the CPT_CD
5,CPTEVENTS,SECTIONHEADER,provide a category for the given CPT code
6,CPTEVENTS,SUBSECTIONHEADER,assigned using the D_CPT table
7,CPTEVENTS,DESCRIPTION,provides information about the meaning of the ...


In [96]:
via_cptevents.to_csv('G:/2000/MIMIC_cptevents_VIA.csv', index=False)

# DiagnosisICD

In [87]:
diagnosisICD = pd.read_csv('DIAGNOSES_ICD.csv.gz', compression='gzip')
diagnosisICD

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254
...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280
651043,639799,97503,188195,3.0,V5869
651044,639800,97503,188195,4.0,V1279
651045,639801,97503,188195,5.0,5275


In [88]:
diagnosisICD_df = diagnosisICD[diagnosisICD["SUBJECT_ID"].isin(la)].reset_index(drop=True)
diagnosisICD_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1554,118,147035,1.0,V3000
1,1555,118,147035,2.0,V053
2,634,79,181542,1.0,41011
3,635,79,181542,2.0,4271
4,636,79,181542,3.0,41401
...,...,...,...,...,...
27978,638368,97143,122472,22.0,412
27979,638369,97143,122472,23.0,28529
27980,638370,97143,122472,24.0,27650
27981,638371,97143,122472,25.0,V1302


In [89]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID", "HADM_ID"}

# admission 반복 처리
for index, row in diagnosisICD_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in diagnosisICD_df.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 1254732,
                "Variable_ID": np.nan,
                "Original_table_name": "DIAGNOSIS_ICD",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
diagnosisicd_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
diagnosisicd_quiq["Variable_type"] = diagnosisicd_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = diagnosisicd_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
diagnosisicd_quiq["Is_categorical"] = diagnosisicd_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
diagnosisicd_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1254732,,DIAGNOSIS_ICD,SEQ_NUM,,1.0,,numeric,0,,,,118,147035,,,
1,1254732,,DIAGNOSIS_ICD,ICD9_CODE,,V3000,,string,0,,,,118,147035,,,
2,1254733,,DIAGNOSIS_ICD,SEQ_NUM,,2.0,,numeric,0,,,,118,147035,,,
3,1254733,,DIAGNOSIS_ICD,ICD9_CODE,,V053,,string,0,,,,118,147035,,,
4,1254734,,DIAGNOSIS_ICD,SEQ_NUM,,1.0,,numeric,0,,,,79,181542,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55961,1282712,,DIAGNOSIS_ICD,ICD9_CODE,,27650,,string,0,,,,97143,122472,,,
55962,1282713,,DIAGNOSIS_ICD,SEQ_NUM,,25.0,,numeric,0,,,,97143,122472,,,
55963,1282713,,DIAGNOSIS_ICD,ICD9_CODE,,V1302,,string,0,,,,97143,122472,,,
55964,1282714,,DIAGNOSIS_ICD,SEQ_NUM,,26.0,,numeric,0,,,,97143,122472,,,


In [91]:
condition = (
    (diagnosisicd_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
diagnosisicd_quiq.loc[condition, "Is_categorical"] = 1
diagnosisicd_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1254732,,DIAGNOSIS_ICD,SEQ_NUM,,1.0,,numeric,1,,,,118,147035,,,
1,1254732,,DIAGNOSIS_ICD,ICD9_CODE,,V3000,,string,1,,,,118,147035,,,
2,1254733,,DIAGNOSIS_ICD,SEQ_NUM,,2.0,,numeric,1,,,,118,147035,,,
3,1254733,,DIAGNOSIS_ICD,ICD9_CODE,,V053,,string,1,,,,118,147035,,,
4,1254734,,DIAGNOSIS_ICD,SEQ_NUM,,1.0,,numeric,1,,,,79,181542,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55961,1282712,,DIAGNOSIS_ICD,ICD9_CODE,,27650,,string,1,,,,97143,122472,,,
55962,1282713,,DIAGNOSIS_ICD,SEQ_NUM,,25.0,,numeric,1,,,,97143,122472,,,
55963,1282713,,DIAGNOSIS_ICD,ICD9_CODE,,V1302,,string,1,,,,97143,122472,,,
55964,1282714,,DIAGNOSIS_ICD,SEQ_NUM,,26.0,,numeric,1,,,,97143,122472,,,


In [93]:
# Mapping
mapping_rules = {
    'ICD9_CODE':("medical_code", np.nan), 
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
diagnosisicd_quiq[["Mapping_info_1", "Mapping_info_2"]] = diagnosisicd_quiq.apply(map_mapping_info, axis=1)
diagnosisicd_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1254732,,DIAGNOSIS_ICD,SEQ_NUM,,1.0,,numeric,1,,,,118,147035,,,
1,1254732,,DIAGNOSIS_ICD,ICD9_CODE,,V3000,,string,1,,,,118,147035,,medical_code,
2,1254733,,DIAGNOSIS_ICD,SEQ_NUM,,2.0,,numeric,1,,,,118,147035,,,
3,1254733,,DIAGNOSIS_ICD,ICD9_CODE,,V053,,string,1,,,,118,147035,,medical_code,
4,1254734,,DIAGNOSIS_ICD,SEQ_NUM,,1.0,,numeric,1,,,,79,181542,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55961,1282712,,DIAGNOSIS_ICD,ICD9_CODE,,27650,,string,1,,,,97143,122472,,medical_code,
55962,1282713,,DIAGNOSIS_ICD,SEQ_NUM,,25.0,,numeric,1,,,,97143,122472,,,
55963,1282713,,DIAGNOSIS_ICD,ICD9_CODE,,V1302,,string,1,,,,97143,122472,,medical_code,
55964,1282714,,DIAGNOSIS_ICD,SEQ_NUM,,26.0,,numeric,1,,,,97143,122472,,,


In [94]:
diagnosisicd_quiq.to_csv('G:/2000/MIMIC_diagnosisicd_QUIQ.csv', index=False)

In [102]:
# VIA 테이블 생성
via_variable_names = ['SEQ_NUM','ICD9_CODE']

via_descriptions = ['provides the order in which the ICD diagnoses relate to the patient. ICD diagnoses are ordered by priority-and the order does have an impact n the reimbursement for treatment',
                    'contains the actual code corresponding to the diagnosis assigned to the patient for the given row. Note that all codes, as of MIMIC-III v1.0, are ICD-9 codes'


]
via_diagnosisicd = pd.DataFrame({
    'Original_table_name': 'DIAGNOSIS_ICD',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_diagnosisicd

Unnamed: 0,Original_table_name,Variable_name,Description
0,DIAGNOSIS_ICD,SEQ_NUM,provides the order in which the ICD diagnoses ...
1,DIAGNOSIS_ICD,ICD9_CODE,contains the actual code corresponding to the ...


In [103]:
via_diagnosisicd.to_csv('G:/2000/MIMIC_diagnosisicd_VIA.csv', index=False)

# DRGcodes

In [95]:
drgcodes = pd.read_csv('DRGCODES.csv.gz', compression='gzip')
drgcodes

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,DRG_TYPE,DRG_CODE,DESCRIPTION,DRG_SEVERITY,DRG_MORTALITY
0,342,2491,144486,HCFA,28,"TRAUMATIC STUPOR & COMA, COMA <1 HR AGE >17 WI...",,
1,343,24958,162910,HCFA,110,MAJOR CARDIOVASCULAR PROCEDURES WITH COMPLICAT...,,
2,344,18325,153751,HCFA,390,NEONATE WITH OTHER SIGNIFICANT PROBLEMS,,
3,345,17887,182692,HCFA,14,SPECIFIC CEREBROVASCULAR DISORDERS EXCEPT TRAN...,,
4,346,11113,157980,HCFA,390,NEONATE WITH OTHER SIGNIFICANT PROBLEMS,,
...,...,...,...,...,...,...,...,...
125552,123452,71582,101422,MS,221,CARDIAC VALVE & OTH MAJ CARDIOTHORACIC PROC W/...,,
125553,123453,46449,110075,APR,1653,Coronary Bypass w/ Cardiac Cath Or Percutaneou...,3.0,2.0
125554,123454,46449,110075,APR,1653,Coronary Bypass w/ Cardiac Cath Or Percutaneou...,3.0,2.0
125555,123455,46449,110075,MS,234,CORONARY BYPASS W CARDIAC CATH W/O MCC,,


In [96]:
drgcodes_df = drgcodes[drgcodes["SUBJECT_ID"].isin(la)].reset_index(drop=True)
drgcodes_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,DRG_TYPE,DRG_CODE,DESCRIPTION,DRG_SEVERITY,DRG_MORTALITY
0,377,12744,116766,HCFA,87,PULMONARY EDEMA & RESPIRATORY FAILURE,,
1,381,12203,105801,HCFA,107,CORONARY BYPASS WITH CARDIAC CATHETER,,
2,406,18779,188333,HCFA,110,MAJOR CARDIOVASCULAR PROCEDURES WITH COMPLICAT...,,
3,424,12948,152782,HCFA,108,OTHER CARDIOTHORACIC PROCEDURES,,
4,427,10339,144796,HCFA,318,KIDNEY & URINARY TRACT NEOPLASMS WITH COMPLICA...,,
...,...,...,...,...,...,...,...,...
5361,125238,73770,186640,APR,54,Tracheostomy W Long Term Mechanical Ventilatio...,4.0,4.0
5362,125239,73770,186640,APR,54,Tracheostomy W Long Term Mechanical Ventilatio...,4.0,4.0
5363,125240,73770,186640,MS,4,"TRACH W MV 96+ HRS OR PDX EXC FACE, MOUTH & NE...",,
5364,123436,73615,172441,APR,1662,Coronary Bypass w/o Cardiac Cath Or Percutaneo...,2.0,2.0


In [97]:
drgcodes_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'DRG_TYPE', 'DRG_CODE',
       'DESCRIPTION', 'DRG_SEVERITY', 'DRG_MORTALITY'],
      dtype='object')

In [100]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID", "HADM_ID"}

# admission 반복 처리
for index, row in drgcodes_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in drgcodes_df.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 1282715,
                "Variable_ID": np.nan,
                "Original_table_name": "DRGCODES",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
drgcodes_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
drgcodes_quiq["Variable_type"] = drgcodes_quiq["Value"].apply(infer_variable_type)

# -------------------------------
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = drgcodes_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0, Value가 NaN이면 NaN
drgcodes_quiq["Is_categorical"] = drgcodes_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)

drgcodes_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1282715,,DRGCODES,DRG_TYPE,,HCFA,,string,1.0,,,,12744,116766,,,
1,1282715,,DRGCODES,DRG_CODE,,87,,numeric,0.0,,,,12744,116766,,,
2,1282715,,DRGCODES,DESCRIPTION,,PULMONARY EDEMA & RESPIRATORY FAILURE,,string,0.0,,,,12744,116766,,,
3,1282715,,DRGCODES,DRG_SEVERITY,,,,,,,,,12744,116766,,,
4,1282715,,DRGCODES,DRG_MORTALITY,,,,,,,,,12744,116766,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26825,1288080,,DRGCODES,DRG_TYPE,,MS,,string,1.0,,,,73615,172441,,,
26826,1288080,,DRGCODES,DRG_CODE,,236,,numeric,0.0,,,,73615,172441,,,
26827,1288080,,DRGCODES,DESCRIPTION,,CORONARY BYPASS W/O CARDIAC CATH W/O MCC,,string,0.0,,,,73615,172441,,,
26828,1288080,,DRGCODES,DRG_SEVERITY,,,,,,,,,73615,172441,,,


In [101]:
drgcodes_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1282715,,DRGCODES,DRG_TYPE,,HCFA,,string,1.0,,,,12744,116766,,,
1,1282715,,DRGCODES,DRG_CODE,,87,,numeric,0.0,,,,12744,116766,,,
2,1282715,,DRGCODES,DESCRIPTION,,PULMONARY EDEMA & RESPIRATORY FAILURE,,string,0.0,,,,12744,116766,,,
3,1282715,,DRGCODES,DRG_SEVERITY,,,,,,,,,12744,116766,,,
4,1282715,,DRGCODES,DRG_MORTALITY,,,,,,,,,12744,116766,,,
5,1282716,,DRGCODES,DRG_TYPE,,HCFA,,string,1.0,,,,12203,105801,,,
6,1282716,,DRGCODES,DRG_CODE,,107,,numeric,0.0,,,,12203,105801,,,
7,1282716,,DRGCODES,DESCRIPTION,,CORONARY BYPASS WITH CARDIAC CATHETER,,string,0.0,,,,12203,105801,,,
8,1282716,,DRGCODES,DRG_SEVERITY,,,,,,,,,12203,105801,,,
9,1282716,,DRGCODES,DRG_MORTALITY,,,,,,,,,12203,105801,,,


In [102]:
condition = (
    (drgcodes_quiq["Variable_type"] == "numeric") &
    (drgcodes_quiq["Is_categorical"] == 1)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
drgcodes_quiq.loc[condition, "Is_categorical"] = 0
drgcodes_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1282715,,DRGCODES,DRG_TYPE,,HCFA,,string,1.0,,,,12744,116766,,,
1,1282715,,DRGCODES,DRG_CODE,,87,,numeric,0.0,,,,12744,116766,,,
2,1282715,,DRGCODES,DESCRIPTION,,PULMONARY EDEMA & RESPIRATORY FAILURE,,string,0.0,,,,12744,116766,,,
3,1282715,,DRGCODES,DRG_SEVERITY,,,,,,,,,12744,116766,,,
4,1282715,,DRGCODES,DRG_MORTALITY,,,,,,,,,12744,116766,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26825,1288080,,DRGCODES,DRG_TYPE,,MS,,string,1.0,,,,73615,172441,,,
26826,1288080,,DRGCODES,DRG_CODE,,236,,numeric,0.0,,,,73615,172441,,,
26827,1288080,,DRGCODES,DESCRIPTION,,CORONARY BYPASS W/O CARDIAC CATH W/O MCC,,string,0.0,,,,73615,172441,,,
26828,1288080,,DRGCODES,DRG_SEVERITY,,,,,,,,,73615,172441,,,


In [103]:
condition = (
    (drgcodes_quiq["Variable_name"] == "DESCRIPTION") &
    (drgcodes_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
drgcodes_quiq.loc[condition, "Is_categorical"] = 1
drgcodes_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1282715,,DRGCODES,DRG_TYPE,,HCFA,,string,1.0,,,,12744,116766,,,
1,1282715,,DRGCODES,DRG_CODE,,87,,numeric,0.0,,,,12744,116766,,,
2,1282715,,DRGCODES,DESCRIPTION,,PULMONARY EDEMA & RESPIRATORY FAILURE,,string,1.0,,,,12744,116766,,,
3,1282715,,DRGCODES,DRG_SEVERITY,,,,,,,,,12744,116766,,,
4,1282715,,DRGCODES,DRG_MORTALITY,,,,,,,,,12744,116766,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26825,1288080,,DRGCODES,DRG_TYPE,,MS,,string,1.0,,,,73615,172441,,,
26826,1288080,,DRGCODES,DRG_CODE,,236,,numeric,0.0,,,,73615,172441,,,
26827,1288080,,DRGCODES,DESCRIPTION,,CORONARY BYPASS W/O CARDIAC CATH W/O MCC,,string,1.0,,,,73615,172441,,,
26828,1288080,,DRGCODES,DRG_SEVERITY,,,,,,,,,73615,172441,,,


In [104]:
# 1. 매핑 룰 정의
mapping_rules = {
    "DRG_CODE": ("medical_code", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "DIAGNOSIS":
        return pd.Series(["diagnosis", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
drgcodes_quiq[["Mapping_info_1", "Mapping_info_2"]] = drgcodes_quiq.apply(map_mapping_info, axis=1)
drgcodes_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1282715,,DRGCODES,DRG_TYPE,,HCFA,,string,1.0,,,,12744,116766,,,
1,1282715,,DRGCODES,DRG_CODE,,87,,numeric,0.0,,,,12744,116766,,medical_code,
2,1282715,,DRGCODES,DESCRIPTION,,PULMONARY EDEMA & RESPIRATORY FAILURE,,string,1.0,,,,12744,116766,,,
3,1282715,,DRGCODES,DRG_SEVERITY,,,,,,,,,12744,116766,,,
4,1282715,,DRGCODES,DRG_MORTALITY,,,,,,,,,12744,116766,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26825,1288080,,DRGCODES,DRG_TYPE,,MS,,string,1.0,,,,73615,172441,,,
26826,1288080,,DRGCODES,DRG_CODE,,236,,numeric,0.0,,,,73615,172441,,medical_code,
26827,1288080,,DRGCODES,DESCRIPTION,,CORONARY BYPASS W/O CARDIAC CATH W/O MCC,,string,1.0,,,,73615,172441,,,
26828,1288080,,DRGCODES,DRG_SEVERITY,,,,,,,,,73615,172441,,,


In [105]:
drgcodes_quiq.to_csv('G:/2000/MIMIC_drgcodes_QUIQ.csv', index=False)

In [110]:
# VIA 테이블 생성
via_variable_names = ['DRG_TYPE', 'DRG_CODE',
       'DESCRIPTION', 'DRG_SEVERITY', 'DRG_MORTALITY']

via_descriptions = ['provides the type of DRG code in the entry. The three types of DRG codes in the MIMIC-III database are HCFA, MS, APR.',
                    'contains a code which represents the diagnosis billed for by the hospital',
                    'provides a human understandable summary of the meaning of the given DRG code.',
                    'provide additional granularity to DRG codes in the APR DRG type. Severity and mortality allow for higher billing costs when a diagnosis is more severe.',
                    'provide additional granularity to DRG codes in the APR DRG type. Severity and mortality allow for higher billing costs when a diagnosis is more severe. '
]
via_drgcodes = pd.DataFrame({
    'Original_table_name': 'DRGCODES',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_drgcodes

Unnamed: 0,Original_table_name,Variable_name,Description
0,DRGCODES,DRG_TYPE,provides the type of DRG code in the entry. Th...
1,DRGCODES,DRG_CODE,contains a code which represents the diagnosis...
2,DRGCODES,DESCRIPTION,provides a human understandable summary of the...
3,DRGCODES,DRG_SEVERITY,provide additional granularity to DRG codes in...
4,DRGCODES,DRG_MORTALITY,provide additional granularity to DRG codes in...


In [111]:
via_drgcodes.to_csv('G:/2000/MIMIC_drgcodes_VIA.csv', index=False)

# Prescriptions

In [106]:
prescriptions = pd.read_csv('PRESCRIPTIONS.csv.gz', compression='gzip')
prescriptions

  prescriptions = pd.read_csv('PRESCRIPTIONS.csv.gz', compression='gzip')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,2214776,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Tacrolimus,Tacrolimus,Tacrolimus,TACR1,021796,4.690617e+08,1mg Capsule,2,mg,2,CAP,PO
1,2214775,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Warfarin,Warfarin,Warfarin,WARF5,006562,5.601728e+07,5mg Tablet,5,mg,1,TAB,PO
2,2215524,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Heparin Sodium,,,HEPAPREMIX,006522,3.380550e+08,"25,000 unit Premix Bag",25000,UNIT,1,BAG,IV
3,2216265,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,BASE,D5W,,,HEPBASE,,0.000000e+00,HEPARIN BASE,250,ml,250,ml,IV
4,2214773,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Furosemide,Furosemide,Furosemide,FURO20,008208,5.482972e+07,20mg Tablet,20,mg,1,TAB,PO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4156445,3996662,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,PredniSONE,PredniSONE,PredniSONE,PRED20,006751,5.400182e+07,20 mg Tablet,40,mg,2,TAB,PO/NG
4156446,3996070,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,Ipratropium Bromide Neb,Ipratropium Bromide Neb,Ipratropium Bromide Neb,IPRA2H,021700,4.879801e+08,2.5mL Vial,1,NEB,1,VIAL,IH
4156447,3996063,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,HYDROmorphone (Dilaudid),HYDROmorphone (Dilaudid),HYDROmorphone,HYDR20/100NS,048078,6.155302e+10,20 mg / 100 mL Premix Bag,0.12,mg,0.01,BAG,IVPCA
4156448,3996062,98887,121032,238144.0,2144-09-06 00:00:00,2144-09-06 00:00:00,MAIN,Docusate Sodium,Docusate Sodium,Docusate Sodium,DOCU100,003009,9.042245e+08,100mg Capsule,100,mg,1,CAP,PO


In [107]:
prescriptions_df = prescriptions[prescriptions["SUBJECT_ID"].isin(la)].reset_index(drop=True)
prescriptions_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,1943685,46,144073,,2133-02-20 00:00:00,2133-02-20 00:00:00,MAIN,Docusate Sodium,Docusate Sodium,Docusate Sodium,DOCU100,003009,5.107900e+10,100MG CAP,100,mg,1,CAP,PO
1,1943686,46,144073,,2133-02-20 00:00:00,2133-02-20 00:00:00,MAIN,Zolpidem Tartrate,Zolpidem Tartrate,Zolpidem Tartrate,AMBI5,019187,2.554013e+07,5MG TAB,5,mg,1,TAB,PO
2,1943700,46,144073,,2133-02-20 00:00:00,2133-02-21 00:00:00,MAIN,Levofloxacin,,,LEVO500PM,029929,4.500680e+07,500MG PM BAG,500,mg,1,BAG,IV
3,1943703,46,144073,,2133-02-20 00:00:00,2133-02-21 00:00:00,BASE,NS,,,NS100,001210,3.380049e+08,100ML BAG,100,ml,100,ml,IV
4,1943704,46,144073,,2133-02-20 00:00:00,2133-02-21 00:00:00,BASE,NS,,,NS50,001210,3.380049e+08,50ML BAG,50,ml,50,ml,IV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179980,3907361,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-26 00:00:00,MAIN,Ampicillin-Sulbactam,,,UNAS3I,008921,6.332304e+10,3g Vial,3,g,1,VIAL,IV
179981,3907332,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-26 00:00:00,MAIN,Pneumococcal Vac Polyvalent,Pneumococcal Vac Polyvalent,PNEUMOcoccal Vac Polyvalent,PNEU25I,048548,6.473900e+06,25mcg/0.5mL Vial,0.5,mL,1,VIAL,IM
179982,3907350,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-27 00:00:00,BASE,Vial,,,VIAL,,0.000000e+00,Send Vial,1,VIAL,1,VIAL,IV
179983,3907363,99068,164100,201560.0,2193-05-24 00:00:00,2193-05-27 00:00:00,MAIN,Pantoprazole,,,PANT40I,047635,8.092355e+06,40 mg Vial,40,mg,1,VIAL,IV


In [108]:
prescriptions_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTDATE', 'ENDDATE',
       'DRUG_TYPE', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC',
       'FORMULARY_DRUG_CD', 'GSN', 'NDC', 'PROD_STRENGTH', 'DOSE_VAL_RX',
       'DOSE_UNIT_RX', 'FORM_VAL_DISP', 'FORM_UNIT_DISP', 'ROUTE'],
      dtype='object')

In [109]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',  'DOSE_UNIT_RX', 'FORM_UNIT_DISP',}


# admission 반복 처리
for index, row in prescriptions_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in prescriptions_df.columns:
        if col not in exclude_cols:
            value = row[col]

            # Unit 처리 조건
            if col == "DOSE_VAL_RX":
                unit_val = row["DOSE_UNIT_RX"] if "DOSE_UNIT_RX" in prescriptions_df.columns else np.nan
            elif col == "FORM_VAL_DISP":
                unit_val = row["FORM_UNIT_DISP"] if "FORM_UNIT_DISP" in prescriptions_df.columns else np.nan
            else:
                unit_val = np.nan
                
            # Event_date 설정
            if col == "DRUG":
                event_date_val = row["STARTDATE"] if "STARTDATE" in prescriptions_df.columns else np.nan
            else:
                event_date_val = np.nan

            rows.append({
                "Primary_key": index + 1288081,
                "Variable_ID": np.nan,
                "Original_table_name": "PRESCRIPTIONS",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
prescriptions_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
prescriptions_quiq["Variable_type"] = prescriptions_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = prescriptions_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
prescriptions_quiq["Is_categorical"] = prescriptions_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
prescriptions_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1288081,,PRESCRIPTIONS,STARTDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
1,1288081,,PRESCRIPTIONS,ENDDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
2,1288081,,PRESCRIPTIONS,DRUG_TYPE,,MAIN,,string,1,,,,46,144073,,,
3,1288081,,PRESCRIPTIONS,DRUG,2133-02-20 00:00:00,Docusate Sodium,,string,0,,,,46,144073,,,
4,1288081,,PRESCRIPTIONS,DRUG_NAME_POE,,Docusate Sodium,,string,0,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339800,1468065,,PRESCRIPTIONS,NDC,,781305714.0,,numeric,0,,,,99068,164100,,,
2339801,1468065,,PRESCRIPTIONS,PROD_STRENGTH,,2mg/mL-2mL,,string,0,,,,99068,164100,,,
2339802,1468065,,PRESCRIPTIONS,DOSE_VAL_RX,,4,mg,string,0,,,,99068,164100,,,
2339803,1468065,,PRESCRIPTIONS,FORM_VAL_DISP,,1,VIAL,string,0,,,,99068,164100,,,


In [111]:
# 1. Variable_type 수동 지정 (Value가 있는 경우만)
cond_numeric_override = (
    prescriptions_quiq["Variable_name"].isin(["DOSE_VAL_RX", "FORM_VAL_DISP"]) &
    prescriptions_quiq["Value"].notna()
)
prescriptions_quiq.loc[cond_numeric_override, "Variable_type"] = "numeric"

cond_string_override = (
    (prescriptions_quiq["Variable_name"] == "NDC") &
    prescriptions_quiq["Value"].notna()
)
prescriptions_quiq.loc[cond_string_override, "Variable_type"] = "string"

# 2. string인데 Is_categorical이 0인 경우 → 1로 수정
cond_string = (
    (prescriptions_quiq["Variable_type"] == "string") &
    (prescriptions_quiq["Is_categorical"] == 0)
)
prescriptions_quiq.loc[cond_string, "Is_categorical"] = 1

# 3. numeric인데 Is_categorical이 1인 경우 → 0으로 수정
cond_numeric = (
    (prescriptions_quiq["Variable_type"] == "numeric") &
    (prescriptions_quiq["Is_categorical"] == 1)
)
prescriptions_quiq.loc[cond_numeric, "Is_categorical"] = 0
prescriptions_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1288081,,PRESCRIPTIONS,STARTDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
1,1288081,,PRESCRIPTIONS,ENDDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
2,1288081,,PRESCRIPTIONS,DRUG_TYPE,,MAIN,,string,1,,,,46,144073,,,
3,1288081,,PRESCRIPTIONS,DRUG,2133-02-20 00:00:00,Docusate Sodium,,string,1,,,,46,144073,,,
4,1288081,,PRESCRIPTIONS,DRUG_NAME_POE,,Docusate Sodium,,string,1,,,,46,144073,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339800,1468065,,PRESCRIPTIONS,NDC,,781305714.0,,string,1,,,,99068,164100,,,
2339801,1468065,,PRESCRIPTIONS,PROD_STRENGTH,,2mg/mL-2mL,,string,1,,,,99068,164100,,,
2339802,1468065,,PRESCRIPTIONS,DOSE_VAL_RX,,4,mg,numeric,0,,,,99068,164100,,,
2339803,1468065,,PRESCRIPTIONS,FORM_VAL_DISP,,1,VIAL,numeric,0,,,,99068,164100,,,


In [112]:
prescriptions_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1288081,,PRESCRIPTIONS,STARTDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
1,1288081,,PRESCRIPTIONS,ENDDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,,
2,1288081,,PRESCRIPTIONS,DRUG_TYPE,,MAIN,,string,1,,,,46,144073,,,
3,1288081,,PRESCRIPTIONS,DRUG,2133-02-20 00:00:00,Docusate Sodium,,string,1,,,,46,144073,,,
4,1288081,,PRESCRIPTIONS,DRUG_NAME_POE,,Docusate Sodium,,string,1,,,,46,144073,,,
5,1288081,,PRESCRIPTIONS,DRUG_NAME_GENERIC,,Docusate Sodium,,string,1,,,,46,144073,,,
6,1288081,,PRESCRIPTIONS,FORMULARY_DRUG_CD,,DOCU100,,string,1,,,,46,144073,,,
7,1288081,,PRESCRIPTIONS,GSN,,003009,,string,1,,,,46,144073,,,
8,1288081,,PRESCRIPTIONS,NDC,,51079001920.0,,string,1,,,,46,144073,,,
9,1288081,,PRESCRIPTIONS,PROD_STRENGTH,,100MG CAP,,string,1,,,,46,144073,,,


In [113]:
# 1. 매핑 룰 정의
mapping_rules = {'STARTDATE': ("date", np.nan), 
                 'ENDDATE': ("date", np.nan),   
                 'DRUG': ("prescription", "drug"),
                 'DRUG_NAME_POE':  ("prescription", "drug"),
                 "DRUG_NAME_GENERIC": ("prescription", "drug"),
                 'GSN': ("medical_code", np.nan), 
                 'NDC': ("medical_code", np.nan),   
                 'PROD_STRENTH': ("prescription", "prescription_info"),
                 'DOSE_VAL_RX': ("prescription", "prescription_info"), 
                 'FORM_VAL_DISP': ("prescription", "prescription_info")
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
prescriptions_quiq[["Mapping_info_1", "Mapping_info_2"]] = prescriptions_quiq.apply(map_mapping_info, axis=1)
prescriptions_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1288081,,PRESCRIPTIONS,STARTDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,date,
1,1288081,,PRESCRIPTIONS,ENDDATE,,2133-02-20 00:00:00,,timestamp,0,,,,46,144073,,date,
2,1288081,,PRESCRIPTIONS,DRUG_TYPE,,MAIN,,string,1,,,,46,144073,,,
3,1288081,,PRESCRIPTIONS,DRUG,2133-02-20 00:00:00,Docusate Sodium,,string,1,,,,46,144073,,prescription,drug
4,1288081,,PRESCRIPTIONS,DRUG_NAME_POE,,Docusate Sodium,,string,1,,,,46,144073,,prescription,drug
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339800,1468065,,PRESCRIPTIONS,NDC,,781305714.0,,string,1,,,,99068,164100,,medical_code,
2339801,1468065,,PRESCRIPTIONS,PROD_STRENGTH,,2mg/mL-2mL,,string,1,,,,99068,164100,,,
2339802,1468065,,PRESCRIPTIONS,DOSE_VAL_RX,,4,mg,numeric,0,,,,99068,164100,,prescription,prescription_info
2339803,1468065,,PRESCRIPTIONS,FORM_VAL_DISP,,1,VIAL,numeric,0,,,,99068,164100,,prescription,prescription_info


In [114]:
prescriptions_quiq.to_csv('G:/2000/MIMIC_prescriptions_QUIQ.csv', index=False)

In [119]:
# VIA 테이블 생성
via_variable_names = ['STARTDATE', 'ENDDATE',
       'DRUG_TYPE', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC',
       'FORMULARY_DRUG_CD', 'GSN', 'NDC', 'PROD_STRENGTH', 'DOSE_VAL_RX', 'FORM_VAL_DISP', 'ROUTE']

via_descriptions = ['specify the date period for which the prescription was valid - start', 
                    'specify the date period for which the prescription was valid - end',
                    'provides the typeof drug prescribed',
                    'representations of the drug prescribed to the patient',
                    'representations of the drug prescribed to the patient',
                    'representations of the drug prescribed to the patient',
                    'representation of the drug in various coding systems',
                    'representation of the drug in vraious coding systems, Generic Sequence Number',
                    'representation of the drug in vraious coding systems, National Drug Code',
                    "Strength of the drug product (e.g., 500 mg)",
                    "Dose amount prescribed to the patient",
                    "Dispensed amount of the drug formulation",
                    "Route of drug administration (e.g., PO, IV)"
                   ]


via_prescriptions = pd.DataFrame({
    'Original_table_name': 'PRESCRIPTIONS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_prescriptions

Unnamed: 0,Original_table_name,Variable_name,Description
0,PRESCRIPTIONS,STARTDATE,specify the date period for which the prescrip...
1,PRESCRIPTIONS,ENDDATE,specify the date period for which the prescrip...
2,PRESCRIPTIONS,DRUG_TYPE,provides the typeof drug prescribed
3,PRESCRIPTIONS,DRUG,representations of the drug prescribed to the ...
4,PRESCRIPTIONS,DRUG_NAME_POE,representations of the drug prescribed to the ...
5,PRESCRIPTIONS,DRUG_NAME_GENERIC,representations of the drug prescribed to the ...
6,PRESCRIPTIONS,FORMULARY_DRUG_CD,representation of the drug in various coding s...
7,PRESCRIPTIONS,GSN,representation of the drug in vraious coding s...
8,PRESCRIPTIONS,NDC,representation of the drug in vraious coding s...
9,PRESCRIPTIONS,PROD_STRENGTH,"Strength of the drug product (e.g., 500 mg)"


In [120]:
via_prescriptions.to_csv('G:/2000/MIMIC_prescriptions_VIA.csv', index=False)

# ProceduresICD

In [115]:
procedureICD = pd.read_csv('PROCEDURES_ICD.csv.gz', compression='gzip')
procedureICD

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,944,62641,154460,3,3404
1,945,2592,130856,1,9671
2,946,2592,130856,2,3893
3,947,55357,119355,1,9672
4,948,55357,119355,2,331
...,...,...,...,...,...
240090,228330,67415,150871,5,3736
240091,228331,67415,150871,6,3893
240092,228332,67415,150871,7,8872
240093,228333,67415,150871,8,3893


In [116]:
procedureICD_df = procedureICD[procedureICD["SUBJECT_ID"].isin(la)].reset_index(drop=True)
procedureICD_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1045,4803,159789,1,9672
1,1046,4803,159789,2,966
2,1061,20957,113808,1,3323
3,1062,20957,113808,2,9656
4,1063,20957,113808,3,9604
...,...,...,...,...,...
10525,230557,94977,121035,2,9604
10526,225927,76435,187938,1,3721
10527,225928,76435,187938,2,8964
10528,228307,41035,102460,1,3228


In [117]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {"ROW_ID", "SUBJECT_ID", "HADM_ID"}

# admission 반복 처리
for index, row in procedureICD_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in procedureICD_df.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 1468066,
                "Variable_ID": np.nan,
                "Original_table_name": "PROCEDURES_ICD",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
procedureicd_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
procedureicd_quiq["Variable_type"] = procedureicd_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = procedureicd_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
procedureicd_quiq["Is_categorical"] = procedureicd_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
procedureicd_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1468066,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,0,,,,4803,159789,,,
1,1468066,,PROCEDURES_ICD,ICD9_CODE,,9672,,numeric,0,,,,4803,159789,,,
2,1468067,,PROCEDURES_ICD,SEQ_NUM,,2,,numeric,0,,,,4803,159789,,,
3,1468067,,PROCEDURES_ICD,ICD9_CODE,,966,,numeric,0,,,,4803,159789,,,
4,1468068,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,0,,,,20957,113808,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21055,1478593,,PROCEDURES_ICD,ICD9_CODE,,8964,,numeric,0,,,,76435,187938,,,
21056,1478594,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,0,,,,41035,102460,,,
21057,1478594,,PROCEDURES_ICD,ICD9_CODE,,3228,,numeric,0,,,,41035,102460,,,
21058,1478595,,PROCEDURES_ICD,SEQ_NUM,,2,,numeric,0,,,,41035,102460,,,


In [118]:
# 1. Variable_name이 ICD9_CODE이고 Value가 있는 경우 → Variable_type을 string으로 설정
cond_icd9_string = (
    (procedureicd_quiq["Variable_name"] == "ICD9_CODE") &
    (procedureicd_quiq["Value"].notna())
)
procedureicd_quiq.loc[cond_icd9_string, "Variable_type"] = "string"

# 2. Is_categorical이 0인 경우 → 1로 설정
condition = (procedureicd_quiq["Is_categorical"] == 0)
procedureicd_quiq.loc[condition, "Is_categorical"] = 1
procedureicd_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1468066,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,1,,,,4803,159789,,,
1,1468066,,PROCEDURES_ICD,ICD9_CODE,,9672,,string,1,,,,4803,159789,,,
2,1468067,,PROCEDURES_ICD,SEQ_NUM,,2,,numeric,1,,,,4803,159789,,,
3,1468067,,PROCEDURES_ICD,ICD9_CODE,,966,,string,1,,,,4803,159789,,,
4,1468068,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,1,,,,20957,113808,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21055,1478593,,PROCEDURES_ICD,ICD9_CODE,,8964,,string,1,,,,76435,187938,,,
21056,1478594,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,1,,,,41035,102460,,,
21057,1478594,,PROCEDURES_ICD,ICD9_CODE,,3228,,string,1,,,,41035,102460,,,
21058,1478595,,PROCEDURES_ICD,SEQ_NUM,,2,,numeric,1,,,,41035,102460,,,


In [119]:
# Mapping
mapping_rules = {
    'ICD9_CODE':("medical_code", np.nan), 
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
procedureicd_quiq[["Mapping_info_1", "Mapping_info_2"]] = procedureicd_quiq.apply(map_mapping_info, axis=1)

In [120]:
procedureicd_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1468066,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,1,,,,4803,159789,,,
1,1468066,,PROCEDURES_ICD,ICD9_CODE,,9672,,string,1,,,,4803,159789,,medical_code,
2,1468067,,PROCEDURES_ICD,SEQ_NUM,,2,,numeric,1,,,,4803,159789,,,
3,1468067,,PROCEDURES_ICD,ICD9_CODE,,966,,string,1,,,,4803,159789,,medical_code,
4,1468068,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,1,,,,20957,113808,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21055,1478593,,PROCEDURES_ICD,ICD9_CODE,,8964,,string,1,,,,76435,187938,,medical_code,
21056,1478594,,PROCEDURES_ICD,SEQ_NUM,,1,,numeric,1,,,,41035,102460,,,
21057,1478594,,PROCEDURES_ICD,ICD9_CODE,,3228,,string,1,,,,41035,102460,,medical_code,
21058,1478595,,PROCEDURES_ICD,SEQ_NUM,,2,,numeric,1,,,,41035,102460,,,


In [121]:
prescriptions_quiq.to_csv('G:/2000/MIMIC_procedureicd_QUIQ.csv', index=False)

In [127]:
# VIA 테이블 생성
via_variable_names = ['SEQ_NUM','ICD9_CODE']

via_descriptions = ['provides the order in which the ICD procedure relate to the patient. ICD procedures are ordered by priority-and the order does have an impact n the reimbursement for treatment',
                    'contains the actual code corresponding to the procedure assigned to the patient for the given row. Note that all codes, as of MIMIC-III v1.0, are ICD-9 codes'


]
via_procedureicd = pd.DataFrame({
    'Original_table_name': 'PROCEURES_ICD',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_procedureicd

Unnamed: 0,Original_table_name,Variable_name,Description
0,PROCEURES_ICD,SEQ_NUM,provides the order in which the ICD procedure ...
1,PROCEURES_ICD,ICD9_CODE,contains the actual code corresponding to the ...


In [128]:
via_procedureicd.to_csv("G:/2000/MIMIC_procedureicd_VIA.csv", index=False)

# Microbiologyevents

In [122]:
microbiologyevents = pd.read_csv('MICROBIOLOGYEVENTS.csv.gz', compression='gzip')
microbiologyevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,SPEC_ITEMID,SPEC_TYPE_DESC,ORG_ITEMID,ORG_NAME,ISOLATE_NUM,AB_ITEMID,AB_NAME,DILUTION_TEXT,DILUTION_COMPARISON,DILUTION_VALUE,INTERPRETATION
0,744,96,170324,2156-04-13 00:00:00,2156-04-13 14:18:00,70021.0,BRONCHOALVEOLAR LAVAGE,80026.0,PSEUDOMONAS AERUGINOSA,1.0,,,,,,
1,745,96,170324,2156-04-20 00:00:00,2156-04-20 13:10:00,70062.0,SPUTUM,,,,,,,,,
2,746,96,170324,2156-04-20 00:00:00,2156-04-20 16:00:00,70012.0,BLOOD CULTURE,,,,,,,,,
3,747,96,170324,2156-04-20 00:00:00,,70012.0,BLOOD CULTURE,,,,,,,,,
4,748,96,170324,2156-04-20 00:00:00,,70079.0,URINE,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631721,630931,99783,126090,2125-11-05 00:00:00,2125-11-05 13:15:00,70076.0,TISSUE,80066.0,ASPERGILLUS FUMIGATUS,2.0,,,,,,
631722,630932,99783,126090,2125-11-06 00:00:00,2125-11-06 10:24:00,70076.0,TISSUE,80066.0,ASPERGILLUS FUMIGATUS,1.0,,,,,,
631723,630933,99783,126090,2125-11-06 00:00:00,2125-11-06 10:24:00,70076.0,TISSUE,80066.0,ASPERGILLUS FUMIGATUS,2.0,,,,,,
631724,630934,99783,126090,2125-11-07 00:00:00,2125-11-07 12:40:00,70012.0,BLOOD CULTURE,,,,,,,,,


In [123]:
microbiologyevents_df = microbiologyevents[microbiologyevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
microbiologyevents_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,SPEC_ITEMID,SPEC_TYPE_DESC,ORG_ITEMID,ORG_NAME,ISOLATE_NUM,AB_ITEMID,AB_NAME,DILUTION_TEXT,DILUTION_COMPARISON,DILUTION_VALUE,INTERPRETATION
0,763,102,195700,2196-02-27 00:00:00,2196-02-27 12:30:00,70014.0,BLOOD CULTURE - NEONATE,,,,,,,,,
1,1031,110,154943,2110-06-02 00:00:00,2110-06-02 03:10:00,70014.0,BLOOD CULTURE - NEONATE,,,,,,,,,
2,1032,110,154943,2110-06-02 00:00:00,2110-06-02 07:30:00,70026.0,CSF;SPINAL FLUID,,,,,,,,,
3,1033,110,154943,2110-06-03 00:00:00,2110-06-03 11:00:00,70069.0,SWAB,,,,,,,,,
4,1034,110,154943,2110-06-03 00:00:00,2110-06-03 11:00:00,70070.0,SWAB,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26088,629944,99491,125502,2139-01-27 00:00:00,2139-01-27 16:34:00,70012.0,BLOOD CULTURE,,,,,,,,,
26089,629945,99491,125502,2139-02-01 00:00:00,2139-02-01 21:19:00,70012.0,BLOOD CULTURE,,,,,,,,,
26090,629946,99491,125502,2139-02-01 00:00:00,2139-02-01 21:20:00,70012.0,BLOOD CULTURE,,,,,,,,,
26091,629947,99491,125502,2139-02-01 00:00:00,2139-02-01 21:20:00,70079.0,URINE,80075.0,YEAST,1.0,,,,,,


In [124]:
microbiologyevents_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'SPEC_ITEMID', 'SPEC_TYPE_DESC', 'ORG_ITEMID', 'ORG_NAME',
       'ISOLATE_NUM', 'AB_ITEMID', 'AB_NAME', 'DILUTION_TEXT',
       'DILUTION_COMPARISON', 'DILUTION_VALUE', 'INTERPRETATION'],
      dtype='object')

In [139]:
rows = []
exclude_cols = {"ROW_ID", "SUBJECT_ID", "HADM_ID", "CHARTTIME", 
                "ITEMID", "SPEC_ITEMID", "ORG_ITEMID", "AB_ITEMID",
                "SPEC_TYPE_DESC", "ORG_NAME", "AB_NAME"}  # ← 여기에 추가!

for index, row in microbiologyevents_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    for label_val, itemid_col in [
        ("SPEC_TYPE_DESC", "SPEC_ITEMID"),
        ("ORG_NAME", "ORG_ITEMID"),
        ("AB_NAME", "AB_ITEMID")
    ]:
        # variable_id_val 설정
        variable_id_val = row[itemid_col] if itemid_col in microbiologyevents_df.columns else np.nan

        # variable_id가 있을 때만 event_date_val 할당
        if not pd.isna(variable_id_val) and "CHARTTIME" in microbiologyevents_df.columns:
            event_date_val = row["CHARTTIME"]
        else:
            event_date_val = np.nan

        # value 값 설정
        value_val = row[label_val] if label_val in microbiologyevents_df.columns else np.nan

        # append
        rows.append({
            "Primary_key": len(rows) + 1478596,
            "Variable_ID": variable_id_val,
            "Original_table_name": "MICROBIOLOGYEVENTS",
            "Variable_name": label_val,
            "Event_date": event_date_val,
            "Value": value_val,
            "Unit": np.nan,
            "Variable_type": np.nan,
            "Is_categorical": np.nan,
            "Recorder": np.nan,
            "Recorder_position": np.nan,
            "Recorder_affiliation": np.nan,
            "Patient_id": patient_id,
            "Admission_id": admission_id,
            "Ground_truth": np.nan,
            "Mapping_info_1": "event",
            "Mapping_info_2": np.nan
        })
        
        

    # ✅ 2. 나머지 열들에 대해 반복 (컬럼명을 Variable_name으로)
    for col in microbiologyevents_df.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 1478596,  # 고유성 확보용 소수 해시
                "Variable_ID": np.nan,
                "Original_table_name": "MICROBIOLOGYEVENTS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })


# QUIQ 테이블로 변환
micro_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
micro_quiq["Variable_type"] = micro_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = micro_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0, Value가 NaN이면 NaN
micro_quiq["Is_categorical"] = micro_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
micro_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1478596,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2196-02-27 12:30:00,BLOOD CULTURE - NEONATE,,string,0.0,,,,102,195700,,event,
1,1478597,,MICROBIOLOGYEVENTS,ORG_NAME,,,,,,,,,102,195700,,event,
2,1478598,,MICROBIOLOGYEVENTS,AB_NAME,,,,,,,,,102,195700,,event,
3,1478596,,MICROBIOLOGYEVENTS,CHARTDATE,,2196-02-27 00:00:00,,timestamp,0.0,,,,102,195700,,,
4,1478596,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,,,,,,,,102,195700,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234832,1504688,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,1.0,,numeric,1.0,,,,99491,125502,,,
234833,1504688,,MICROBIOLOGYEVENTS,DILUTION_TEXT,,<=1,,string,0.0,,,,99491,125502,,,
234834,1504688,,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,,<=,,string,1.0,,,,99491,125502,,,
234835,1504688,,MICROBIOLOGYEVENTS,DILUTION_VALUE,,1.0,,numeric,0.0,,,,99491,125502,,,


In [140]:
condition = (
    (micro_quiq["Variable_type"] == "string") &
    (micro_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
micro_quiq.loc[condition, "Is_categorical"] = 1
micro_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1478596,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2196-02-27 12:30:00,BLOOD CULTURE - NEONATE,,string,1.0,,,,102,195700,,event,
1,1478597,,MICROBIOLOGYEVENTS,ORG_NAME,,,,,,,,,102,195700,,event,
2,1478598,,MICROBIOLOGYEVENTS,AB_NAME,,,,,,,,,102,195700,,event,
3,1478596,,MICROBIOLOGYEVENTS,CHARTDATE,,2196-02-27 00:00:00,,timestamp,0.0,,,,102,195700,,,
4,1478596,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,,,,,,,,102,195700,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234832,1504688,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,1.0,,numeric,1.0,,,,99491,125502,,,
234833,1504688,,MICROBIOLOGYEVENTS,DILUTION_TEXT,,<=1,,string,1.0,,,,99491,125502,,,
234834,1504688,,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,,<=,,string,1.0,,,,99491,125502,,,
234835,1504688,,MICROBIOLOGYEVENTS,DILUTION_VALUE,,1.0,,numeric,0.0,,,,99491,125502,,,


In [141]:
micro_quiq.loc[micro_quiq["Value"].isna(), ["Mapping_info_1", "Mapping_info_2"]] = np.nan
micro_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1478596,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2196-02-27 12:30:00,BLOOD CULTURE - NEONATE,,string,1.0,,,,102,195700,,event,
1,1478597,,MICROBIOLOGYEVENTS,ORG_NAME,,,,,,,,,102,195700,,,
2,1478598,,MICROBIOLOGYEVENTS,AB_NAME,,,,,,,,,102,195700,,,
3,1478596,,MICROBIOLOGYEVENTS,CHARTDATE,,2196-02-27 00:00:00,,timestamp,0.0,,,,102,195700,,,
4,1478596,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,,,,,,,,102,195700,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234832,1504688,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,1.0,,numeric,1.0,,,,99491,125502,,,
234833,1504688,,MICROBIOLOGYEVENTS,DILUTION_TEXT,,<=1,,string,1.0,,,,99491,125502,,,
234834,1504688,,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,,<=,,string,1.0,,,,99491,125502,,,
234835,1504688,,MICROBIOLOGYEVENTS,DILUTION_VALUE,,1.0,,numeric,0.0,,,,99491,125502,,,


In [138]:
micro_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1478596,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2196-02-27 12:30:00,BLOOD CULTURE - NEONATE,,string,1.0,,,,102,195700,,event,lab_event
1,1478597,,MICROBIOLOGYEVENTS,ORG_NAME,,,,,,,,,102,195700,,,
2,1478598,,MICROBIOLOGYEVENTS,AB_NAME,,,,,,,,,102,195700,,,
3,1478596,,MICROBIOLOGYEVENTS,CHARTDATE,,2196-02-27 00:00:00,,timestamp,0.0,,,,102,195700,,,
4,1478596,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,,,,,,,,102,195700,,,
5,1478596,,MICROBIOLOGYEVENTS,DILUTION_TEXT,,,,,,,,,102,195700,,,
6,1478596,,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,,,,,,,,,102,195700,,,
7,1478596,,MICROBIOLOGYEVENTS,DILUTION_VALUE,,,,,,,,,102,195700,,,
8,1478596,,MICROBIOLOGYEVENTS,INTERPRETATION,,,,,,,,,102,195700,,,
9,1478605,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2110-06-02 03:10:00,BLOOD CULTURE - NEONATE,,string,1.0,,,,110,154943,,event,lab_event


In [142]:
# 매핑 룰 정의
mapping_rules = {
    'CHARTDATE': ("date", np.nan),
}

# 매핑 함수 정의
def map_mapping_info_conditional(row):
    # 기존 매핑 정보가 이미 존재하는 경우 → 그대로 유지
    if not pd.isna(row["Mapping_info_1"]) or not pd.isna(row["Mapping_info_2"]):
        return pd.Series([row["Mapping_info_1"], row["Mapping_info_2"]])
    
    var_name = row["Variable_name"]
    value = row["Value"]

    # Value가 NaN이면 매핑하지 않음
    if pd.isna(value):
        return pd.Series([np.nan, np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 적용
micro_quiq[["Mapping_info_1", "Mapping_info_2"]] = micro_quiq.apply(map_mapping_info_conditional, axis=1)


In [143]:
micro_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1478596,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2196-02-27 12:30:00,BLOOD CULTURE - NEONATE,,string,1.0,,,,102,195700,,event,
1,1478597,,MICROBIOLOGYEVENTS,ORG_NAME,,,,,,,,,102,195700,,,
2,1478598,,MICROBIOLOGYEVENTS,AB_NAME,,,,,,,,,102,195700,,,
3,1478596,,MICROBIOLOGYEVENTS,CHARTDATE,,2196-02-27 00:00:00,,timestamp,0.0,,,,102,195700,,date,
4,1478596,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,,,,,,,,102,195700,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234832,1504688,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,1.0,,numeric,1.0,,,,99491,125502,,,
234833,1504688,,MICROBIOLOGYEVENTS,DILUTION_TEXT,,<=1,,string,1.0,,,,99491,125502,,,
234834,1504688,,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,,<=,,string,1.0,,,,99491,125502,,,
234835,1504688,,MICROBIOLOGYEVENTS,DILUTION_VALUE,,1.0,,numeric,0.0,,,,99491,125502,,,


In [145]:
condition = (
    (micro_quiq["Variable_name"] == "DILUTION_VALUE") &
    (micro_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
micro_quiq.loc[condition, "Is_categorical"] = 1
micro_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1478596,70014.0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,2196-02-27 12:30:00,BLOOD CULTURE - NEONATE,,string,1.0,,,,102,195700,,event,
1,1478597,,MICROBIOLOGYEVENTS,ORG_NAME,,,,,,,,,102,195700,,,
2,1478598,,MICROBIOLOGYEVENTS,AB_NAME,,,,,,,,,102,195700,,,
3,1478596,,MICROBIOLOGYEVENTS,CHARTDATE,,2196-02-27 00:00:00,,timestamp,0.0,,,,102,195700,,date,
4,1478596,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,,,,,,,,102,195700,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234832,1504688,,MICROBIOLOGYEVENTS,ISOLATE_NUM,,1.0,,numeric,1.0,,,,99491,125502,,,
234833,1504688,,MICROBIOLOGYEVENTS,DILUTION_TEXT,,<=1,,string,1.0,,,,99491,125502,,,
234834,1504688,,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,,<=,,string,1.0,,,,99491,125502,,,
234835,1504688,,MICROBIOLOGYEVENTS,DILUTION_VALUE,,1.0,,numeric,1.0,,,,99491,125502,,,


In [146]:
micro_quiq.to_csv("G:/2000/MIMIC_microevents_QUIQ.csv", index=False)

In [165]:
# VIA 테이블 생성
via_variable_names = ['SPEC_TYPE_DESC','ORG_NAME', 'AB_NAME', "CHARTDATE", "ISOLATE_NUM", "DILUTION_TEXT", "DILUTION_COMPARISON", 
                      "DILUTION_VALUE", "INTERPRETATION"]

via_descriptions = ["Specimen type description", "Organism name", "Antibiotic name", "Specimen collection date", "Isolate number", "Dilution text result", "Dilution comparison operator", "Dilution numeric value", "Antibiotic susceptibility interpretation"

]
via_micro = pd.DataFrame({
    'Original_table_name': 'MICROBIOLOGYEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_micro

Unnamed: 0,Original_table_name,Variable_name,Description
0,MICROBIOLOGYEVENTS,SPEC_TYPE_DESC,Specimen type description
1,MICROBIOLOGYEVENTS,ORG_NAME,Organism name
2,MICROBIOLOGYEVENTS,AB_NAME,Antibiotic name
3,MICROBIOLOGYEVENTS,CHARTDATE,Specimen collection date
4,MICROBIOLOGYEVENTS,ISOLATE_NUM,Isolate number
5,MICROBIOLOGYEVENTS,DILUTION_TEXT,Dilution text result
6,MICROBIOLOGYEVENTS,DILUTION_COMPARISON,Dilution comparison operator
7,MICROBIOLOGYEVENTS,DILUTION_VALUE,Dilution numeric value
8,MICROBIOLOGYEVENTS,INTERPRETATION,Antibiotic susceptibility interpretation


In [166]:
via_micro.to_csv("G:/2000/MIMIC_microevents_VIA.csv", index=False)

# Chart events

In [16]:
chunk_size = 100_000

chunks=[]

for chunk in pd.read_csv('CHARTEVENTS.csv.gz', compression='gzip', chunksize=chunk_size):
    chunks.append(chunk)
    
chartevents = pd.concat(chunks, ignore_index=True)

chartevents

  for chunk in pd.read_csv('CHARTEVENTS.csv.gz', compression='gzip', chunksize=chunk_size):


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
0,788,36,165660,241249.0,223834,2134-05-12 12:00:00,2134-05-12 13:56:00,17525.0,15.0,15.00,L/min,0.0,0.0,,
1,789,36,165660,241249.0,223835,2134-05-12 12:00:00,2134-05-12 13:56:00,17525.0,100.0,100.00,,0.0,0.0,,
2,790,36,165660,241249.0,224328,2134-05-12 12:00:00,2134-05-12 12:18:00,20823.0,0.37,0.37,,0.0,0.0,,
3,791,36,165660,241249.0,224329,2134-05-12 12:00:00,2134-05-12 12:19:00,20823.0,6.0,6.00,min,0.0,0.0,,
4,792,36,165660,241249.0,224330,2134-05-12 12:00:00,2134-05-12 12:19:00,20823.0,2.5,2.50,,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330712478,330471885,99781,147562,200664.0,224847,2133-08-02 08:30:00,2133-08-02 08:30:00,14357.0,Moderate,,,0.0,0.0,,
330712479,330471886,99781,147562,200664.0,224889,2133-08-02 08:30:00,2133-08-02 08:31:00,14357.0,Not applicable,,,0.0,0.0,,
330712480,330471887,99781,147562,200664.0,224903,2133-08-02 08:30:00,2133-08-02 08:31:00,14357.0,,,,0.0,0.0,,
330712481,330471888,99781,147562,200664.0,224910,2133-08-02 08:30:00,2133-08-02 08:31:00,14357.0,,,,0.0,0.0,,


In [17]:
chartevents_df = chartevents[chartevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
chartevents_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
0,28145,188,123860,213646.0,225664,2161-01-02 06:00:00,2161-01-02 06:35:00,14657.0,141.0,141.0,,0.0,0.0,,
1,28146,188,123860,213646.0,220045,2161-01-02 07:00:00,2161-01-02 08:35:00,18602.0,53.0,53.0,bpm,0.0,0.0,,
2,28147,188,123860,213646.0,220179,2161-01-02 07:00:00,2161-01-02 08:35:00,18602.0,136.0,136.0,mmHg,0.0,0.0,,
3,28148,188,123860,213646.0,220180,2161-01-02 07:00:00,2161-01-02 08:35:00,18602.0,77.0,77.0,mmHg,0.0,0.0,,
4,28149,188,123860,213646.0,220181,2161-01-02 07:00:00,2161-01-02 08:35:00,18602.0,91.0,91.0,mmHg,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14215473,330440037,99740,161687,269160.0,223934,2135-05-14 00:00:00,2135-05-14 00:43:00,15667.0,Easily Palpable,,,0.0,0.0,,
14215474,330440038,99740,161687,269160.0,223935,2135-05-14 00:00:00,2135-05-14 00:43:00,15667.0,Easily Palpable,,,0.0,0.0,,
14215475,330440039,99740,161687,269160.0,223936,2135-05-14 00:00:00,2135-05-14 00:43:00,15667.0,Easily Palpable,,,0.0,0.0,,
14215476,330440040,99740,161687,269160.0,223943,2135-05-14 00:00:00,2135-05-14 00:43:00,15667.0,Easily Palpable,,,0.0,0.0,,


In [18]:
chartevents_df.to_csv("chartevents_test.csv", index=False)

In [147]:
chartevents_df = pd.read_csv("chartevents_test.csv")

  chartevents_df = pd.read_csv("chartevents_test.csv")


In [148]:
chart_df = chartevents_df.sample(n=1_000_000, random_state=42)
chart_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
415442,10844841,51013,182359,259631.0,225612,2172-04-06 03:00:00,2172-04-06 03:46:00,20889.0,32.0,32.0,IU/L,1.0,0.0,,
14025389,326822904,96232,153969,213668.0,224089,2162-04-11 20:49:00,2162-04-11 20:49:00,18928.0,,,,0.0,0.0,,
12295052,285397204,54922,148483,238943.0,224017,2102-10-30 04:25:00,2102-10-30 05:25:00,20871.0,16 French,,,0.0,0.0,,
12066175,277524681,47271,116612,275725.0,224007,2122-04-09 18:00:00,2122-04-09 18:18:00,20214.0,Right Nare,,,0.0,0.0,,
3224739,84820189,9818,149679,285205.0,1125,2153-11-16 19:00:00,2153-11-16 19:03:00,15225.0,MICU,,,,,,NotStopd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11217222,255788238,10302,119529,276105.0,226169,2151-07-18 22:00:00,2151-07-19 00:10:00,18469.0,1,1.0,,0.0,0.0,,
7130689,164242927,25788,175274,213272.0,496,2142-08-24 01:00:00,2142-08-24 05:33:00,17706.0,1,1.0,,,,,NotStopd
13690570,319709142,89132,198797,297111.0,223902,2151-06-29 23:00:00,2151-06-29 23:59:00,21386.0,,,,0.0,0.0,,
11820974,272149852,41438,131522,278264.0,224080,2193-03-15 14:00:00,2193-03-15 15:03:00,18901.0,30 Degrees,,,0.0,0.0,,


In [149]:
merged_df_chart = pd.merge(
    chart_df,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched_chart = merged_df_chart[merged_df_chart['LABEL'].isna()]

# 결과 확인
unmatched_chart

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,LABEL,CATEGORY


In [150]:
merged_df_chart

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,LABEL,CATEGORY
0,10844841,51013,182359,259631.0,225612,2172-04-06 03:00:00,2172-04-06 03:46:00,20889.0,32.0,32.0,IU/L,1.0,0.0,,,Alkaline Phosphate,Labs
1,326822904,96232,153969,213668.0,224089,2162-04-11 20:49:00,2162-04-11 20:49:00,18928.0,,,,0.0,0.0,,,Anti Embolic Device,Treatments
2,285397204,54922,148483,238943.0,224017,2102-10-30 04:25:00,2102-10-30 05:25:00,20871.0,16 French,,,0.0,0.0,,,GU Catheter Size,GI/GU
3,277524681,47271,116612,275725.0,224007,2122-04-09 18:00:00,2122-04-09 18:18:00,20214.0,Right Nare,,,0.0,0.0,,,GI #1 Intub Site,GI/GU
4,84820189,9818,149679,285205.0,1125,2153-11-16 19:00:00,2153-11-16 19:03:00,15225.0,MICU,,,,,,NotStopd,Service Type,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,255788238,10302,119529,276105.0,226169,2151-07-18 22:00:00,2151-07-19 00:10:00,18469.0,1,1.0,,0.0,0.0,,,Subglottal Suctioning,Treatments
999996,164242927,25788,175274,213272.0,496,2142-08-24 01:00:00,2142-08-24 05:33:00,17706.0,1,1.0,,,,,NotStopd,PCA Dose,
999997,319709142,89132,198797,297111.0,223902,2151-06-29 23:00:00,2151-06-29 23:59:00,21386.0,,,,0.0,0.0,,,Speech,Neurological
999998,272149852,41438,131522,278264.0,224080,2193-03-15 14:00:00,2193-03-15 15:03:00,18901.0,30 Degrees,,,0.0,0.0,,,Head of Bed,Treatments


In [151]:
merged_df_chart['LABEL'].nunique()

2479

In [152]:
merged_df_chart.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ITEMID', 'CHARTTIME',
       'ERROR', 'RESULTSTATUS', 'STOPPED', 'LABEL', 'CATEGORY'],
      dtype='object')

In [153]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {
    'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', "ITEMID", "LABEL",
    "CHARTTIME", "VALUE","VALUENUM", "VALUEUOM", "CATEGORY","CGID"
}

for index, row in merged_df_chart.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    # ✅ 1. LABEL 기반 행 생성
    label_val = row["LABEL"]
    value_val = row["VALUE"]
    unit_val = row["VALUEUOM"] if "VALUEUOM" in merged_df_chart.columns else np.nan
    event_date_val = row["CHARTTIME"] if "CHARTTIME" in merged_df_chart.columns else np.nan
    variable_id_val = row["ITEMID"] if "ITEMID" in merged_df_chart.columns else np.nan

    rows.append({
        "Primary_key": index + 8796219,
        "Variable_ID": variable_id_val,
        "Original_table_name": "CHARTEVENTS",
        "Variable_name": label_val,
        "Event_date": event_date_val,
        "Value": value_val,
        "Unit": unit_val,
        "Variable_type": np.nan,
        "Is_categorical": np.nan,
        "Recorder": np.nan,
        "Recorder_position": np.nan,
        "Recorder_affiliation": np.nan,
        "Patient_id": patient_id,
        "Admission_id": admission_id,
        "Ground_truth": np.nan,
        "Mapping_info_1": "event",
        "Mapping_info_2": "chart_event"
    })

    # ✅ 2. 나머지 열들에 대해 반복 (컬럼명을 Variable_name으로)
    for col in merged_df_chart.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 8796219,  # 고유성 확보용 소수 해시
                "Variable_ID": np.nan,
                "Original_table_name": "CHARTEVENTS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })
            
chartevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        # 먼저 숫자형 가능한지 체크
        try:
            float(val)  # 문자열이 숫자면 float 변환 성공
            return "numeric"
        except ValueError:
            pass

        # 다음으로 timestamp 가능한지 체크
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"


# Variable_type 먼저 계산
chartevents_quiq["Variable_type"] = chartevents_quiq["Value"].apply(infer_variable_type)
# -------------------------------
# CATEGORICAL_THRESHOLD 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산 (NaN 제외)
value_counts = chartevents_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기
chartevents_quiq["Is_categorical"] = chartevents_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
chartevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8796219,225612.0,CHARTEVENTS,Alkaline Phosphate,2172-04-06 03:00:00,32.0,IU/L,numeric,0.0,,,,51013,182359,,event,chart_event
1,8796219,,CHARTEVENTS,STORETIME,,2172-04-06 03:46:00,,timestamp,0.0,,,,51013,182359,,,
2,8796219,,CHARTEVENTS,WARNING,,1.0,,numeric,1.0,,,,51013,182359,,,
3,8796219,,CHARTEVENTS,ERROR,,0.0,,numeric,1.0,,,,51013,182359,,,
4,8796219,,CHARTEVENTS,RESULTSTATUS,,,,,,,,,51013,182359,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,9796218,,CHARTEVENTS,STORETIME,,2102-06-09 06:35:00,,timestamp,0.0,,,,87522,167396,,,
5999996,9796218,,CHARTEVENTS,WARNING,,0.0,,numeric,1.0,,,,87522,167396,,,
5999997,9796218,,CHARTEVENTS,ERROR,,0.0,,numeric,1.0,,,,87522,167396,,,
5999998,9796218,,CHARTEVENTS,RESULTSTATUS,,,,,,,,,87522,167396,,,


In [154]:
condition = (
    (chartevents_quiq["Variable_type"] == "numeric") &
    (chartevents_quiq["Unit"].notna()) &
    (chartevents_quiq["Is_categorical"] == 1)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
chartevents_quiq.loc[condition, "Is_categorical"] = 0
chartevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8796219,225612.0,CHARTEVENTS,Alkaline Phosphate,2172-04-06 03:00:00,32.0,IU/L,numeric,0.0,,,,51013,182359,,event,chart_event
1,8796219,,CHARTEVENTS,STORETIME,,2172-04-06 03:46:00,,timestamp,0.0,,,,51013,182359,,,
2,8796219,,CHARTEVENTS,WARNING,,1.0,,numeric,1.0,,,,51013,182359,,,
3,8796219,,CHARTEVENTS,ERROR,,0.0,,numeric,1.0,,,,51013,182359,,,
4,8796219,,CHARTEVENTS,RESULTSTATUS,,,,,,,,,51013,182359,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,9796218,,CHARTEVENTS,STORETIME,,2102-06-09 06:35:00,,timestamp,0.0,,,,87522,167396,,,
5999996,9796218,,CHARTEVENTS,WARNING,,0.0,,numeric,1.0,,,,87522,167396,,,
5999997,9796218,,CHARTEVENTS,ERROR,,0.0,,numeric,1.0,,,,87522,167396,,,
5999998,9796218,,CHARTEVENTS,RESULTSTATUS,,,,,,,,,87522,167396,,,


In [155]:
# 1. 각 Variable_name의 고유 Value 개수 계산
value_counts = chartevents_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 2. 고유값 개수가 1개인 Variable_name만 추출
single_value_vars = value_counts[value_counts == 1].index

# 3. 해당 Variable_name들의 행 중에서 variable_type이 numeric이고, Unit이 있는 경우만 필터
filtered_rows = chartevents_quiq[
    (chartevents_quiq["Variable_name"].isin(single_value_vars)) &
    (chartevents_quiq["Variable_type"] == "numeric") &
    (chartevents_quiq["Unit"].notna())
]
filtered_rows

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
2640,8796659,742.0,CHARTEVENTS,calprevflg,2178-08-05 15:30:00,1,kg,numeric,0.0,,,,695,177128,,event,chart_event
3804,8796853,742.0,CHARTEVENTS,calprevflg,2116-09-23 09:00:00,1,kg,numeric,0.0,,,,29682,121179,,event,chart_event
4500,8796969,742.0,CHARTEVENTS,calprevflg,2173-09-15 06:00:00,1,kg,numeric,0.0,,,,1333,190520,,event,chart_event
4506,8796970,742.0,CHARTEVENTS,calprevflg,2189-02-06 14:00:00,1,kg,numeric,0.0,,,,30344,196989,,event,chart_event
4530,8796974,742.0,CHARTEVENTS,calprevflg,2188-08-26 06:00:00,1,kg,numeric,0.0,,,,29753,140266,,event,chart_event
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5997300,9795769,742.0,CHARTEVENTS,calprevflg,2122-05-12 21:00:00,1,kg,numeric,0.0,,,,32067,192383,,event,chart_event
5998200,9795919,742.0,CHARTEVENTS,calprevflg,2178-12-19 05:00:00,1,kg,numeric,0.0,,,,12081,129710,,event,chart_event
5999472,9796131,742.0,CHARTEVENTS,calprevflg,2144-08-29 07:00:00,1,kg,numeric,0.0,,,,18815,176775,,event,chart_event
5999856,9796195,742.0,CHARTEVENTS,calprevflg,2113-06-22 07:40:00,1,kg,numeric,0.0,,,,17906,109562,,event,chart_event


In [32]:
filtered_rows['Variable_name'].unique()

array(['calprevflg', 'Volume Out (PD)', 'Epidural Bolus (mL)',
       'Theophylline', 'Orthostatic HR standing',
       'PCA basal rate (mL/hour)', 'Impaired Skin Length #8', 'ATC %',
       'Tobramycin (Trough)', 'PCV Level (Avea)', 'Spont. Tidal Volume',
       'Cerebral Perfusion Pressure Alarm - Low',
       'BP Left Leg [Systolic]', 'Length in Inches',
       'Recovery O2 sat - Aerobic Activity Response', 'Combivent MDI',
       'Orthostatic BP lying [Diastolic]', 'Cardiac Output (CO NICOM)',
       'Volume In (PD)', 'Intra Cranial Pressure Alarm - Low',
       'Total Granulocyte Count (TGC)', 'BP Left Leg [Diastolic]',
       'Thrombin', 'avDO2', 'Impaired Skin Depth #7',
       'Transpulmonary Pressure (Exp. Hold)', 'BP Right Leg [Mean]',
       'Orthostatic BPs lying', 'OrthostatBP standing [Diastolic]',
       'Orthostatic HR lying', 'O2ER', 'BP Right Leg [Diastolic]',
       'P Low (APRV)', 'BIPAP - BPM', 'VO2', 'Recruitment Press',
       'FiO2/O2 Delivered', 'AvDO2', 'Spont

In [33]:
filtered_rows.to_csv("test.csv", index=False)

In [156]:
chartevents_quiq.to_csv("G:/2000/MIMIC_chartevents_QUIQ.csv", index=False)

In [35]:
for_via = chartevents_quiq['Variable_name'].unique()
via_df = pd.DataFrame(for_via, columns = ['Variable_name'])
via_df.to_csv("unique_variable_id.csv", index=False)

In [36]:
via_df

Unnamed: 0,Variable_name
0,Alkaline Phosphate
1,STORETIME
2,WARNING
3,ERROR
4,RESULTSTATUS
...,...
2479,seizure pads
2480,Neosynephrine drops
2481,EF
2482,ImpSkin Treatment #6


In [37]:
# 'Original_table_name' 열 추가 및 'CHARTEVENT'로 채우기
via_df['Original_table_name'] = 'CHARTEVENT'

# 'Description' 열 추가 및 'Variable_name' 열의 값 복사
via_df['Description'] = via_df['Variable_name']

In [38]:
via_df = via_df[['Original_table_name', 'Variable_name', 'Description']]
via_df

Unnamed: 0,Original_table_name,Variable_name,Description
0,CHARTEVENT,Alkaline Phosphate,Alkaline Phosphate
1,CHARTEVENT,STORETIME,STORETIME
2,CHARTEVENT,WARNING,WARNING
3,CHARTEVENT,ERROR,ERROR
4,CHARTEVENT,RESULTSTATUS,RESULTSTATUS
...,...,...,...
2479,CHARTEVENT,seizure pads,seizure pads
2480,CHARTEVENT,Neosynephrine drops,Neosynephrine drops
2481,CHARTEVENT,EF,EF
2482,CHARTEVENT,ImpSkin Treatment #6,ImpSkin Treatment #6


In [39]:
via_df.to_csv("G:/2000/MIMIC_chartevents_VIA.csv", index=False)

# Datetimeevents

In [15]:
datetimeevents = pd.read_csv('DATETIMEEVENTS.csv.gz', compression='gzip')
datetimeevents

  datetimeevents = pd.read_csv('DATETIMEEVENTS.csv.gz', compression='gzip')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
0,711,7657,121183.0,297945.0,3411,2172-03-14 11:00:00,2172-03-14 11:52:00,16446,,Date,,,,NotStopd
1,712,7657,121183.0,297945.0,3411,2172-03-14 13:00:00,2172-03-14 12:36:00,16446,,Date,,,,NotStopd
2,713,7657,121183.0,297945.0,3411,2172-03-14 15:00:00,2172-03-14 15:10:00,14957,,Date,,,,NotStopd
3,714,7657,121183.0,297945.0,3411,2172-03-14 17:00:00,2172-03-14 17:01:00,16446,,Date,,,,NotStopd
4,715,7657,121183.0,297945.0,3411,2172-03-14 19:00:00,2172-03-14 19:29:00,14815,,Date,,,,NotStopd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4485932,4456093,99366,136021.0,218447.0,224279,2197-03-24 16:32:00,2197-03-24 16:32:00,18234,2197-03-24 13:03:00,Date and Time,0.0,0.0,,
4485933,4456094,99366,136021.0,218447.0,224280,2197-03-24 16:32:00,2197-03-24 16:32:00,18234,2197-03-24 00:00:00,Date,0.0,0.0,,
4485934,4456095,99366,136021.0,218447.0,224282,2197-03-24 16:32:00,2197-03-24 16:32:00,18234,2197-03-24 00:00:00,Date,0.0,0.0,,
4485935,4456096,99366,136021.0,218447.0,224284,2197-03-24 16:32:00,2197-03-24 16:32:00,18234,2197-03-24 00:00:00,Date,0.0,0.0,,


In [16]:
datetimeevents_df = datetimeevents[datetimeevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
datetimeevents_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
0,3565,21343,169687.0,283695.0,3411,2183-10-20 11:00:00,2183-10-20 10:48:00,19412,2183-10-27 00:00:00,Date,,,,NotStopd
1,3566,21343,169687.0,283695.0,3411,2183-10-20 13:00:00,2183-10-20 12:44:00,19412,2183-10-27 00:00:00,Date,,,,NotStopd
2,3567,21343,169687.0,283695.0,3411,2183-10-20 14:00:00,2183-10-20 14:57:00,14948,2183-10-27 00:00:00,Date,,,,NotStopd
3,3104,21343,169687.0,283695.0,3411,2183-10-21 20:00:00,2183-10-21 19:56:00,17206,2183-10-27 00:00:00,Date,,,,NotStopd
4,3105,21343,169687.0,283695.0,3411,2183-10-21 21:00:00,2183-10-21 21:43:00,17206,2183-10-27 00:00:00,Date,,,,NotStopd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184538,4465088,99491,125502.0,215100.0,224183,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-09 00:00:00,Date,0.0,0.0,,
184539,4465089,99491,125502.0,215100.0,224186,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-09 05:00:00,Date and Time,0.0,0.0,,
184540,4465090,99491,125502.0,215100.0,224187,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-09 00:00:00,Date,0.0,0.0,,
184541,4465091,99491,125502.0,215100.0,227790,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-05 23:53:00,Date and Time,0.0,0.0,,


In [17]:
num_unique_subjects1 = datetimeevents_df["SUBJECT_ID"].nunique()
num_unique_subjects1

1279

In [18]:
merged_df_date = pd.merge(
    datetimeevents_df,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched_date = merged_df_date[merged_df_date['LABEL'].isna()]

# 결과 확인
unmatched_date

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,LABEL,CATEGORY


In [19]:
merged_df_date

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,LABEL,CATEGORY
0,3565,21343,169687.0,283695.0,3411,2183-10-20 11:00:00,2183-10-20 10:48:00,19412,2183-10-27 00:00:00,Date,,,,NotStopd,Equip Change [MM],
1,3566,21343,169687.0,283695.0,3411,2183-10-20 13:00:00,2183-10-20 12:44:00,19412,2183-10-27 00:00:00,Date,,,,NotStopd,Equip Change [MM],
2,3567,21343,169687.0,283695.0,3411,2183-10-20 14:00:00,2183-10-20 14:57:00,14948,2183-10-27 00:00:00,Date,,,,NotStopd,Equip Change [MM],
3,3104,21343,169687.0,283695.0,3411,2183-10-21 20:00:00,2183-10-21 19:56:00,17206,2183-10-27 00:00:00,Date,,,,NotStopd,Equip Change [MM],
4,3105,21343,169687.0,283695.0,3411,2183-10-21 21:00:00,2183-10-21 21:43:00,17206,2183-10-27 00:00:00,Date,,,,NotStopd,Equip Change [MM],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184538,4465088,99491,125502.0,215100.0,224183,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-09 00:00:00,Date,0.0,0.0,,,PICC Line Cap Change,Access Lines - Invasive
184539,4465089,99491,125502.0,215100.0,224186,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-09 05:00:00,Date and Time,0.0,0.0,,,PICC Line Dressing Change,Access Lines - Invasive
184540,4465090,99491,125502.0,215100.0,224187,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-09 00:00:00,Date,0.0,0.0,,,PICC Line Tubing Change,Access Lines - Invasive
184541,4465091,99491,125502.0,215100.0,227790,2139-02-11 00:31:00,2139-02-11 00:31:00,16037,2139-02-05 23:53:00,Date and Time,0.0,0.0,,,Impaired Skin - Dressing Change #1,Skin - Impairment


In [20]:
merged_df_date = pd.merge(
    merged_df_date,
    caregivers[['CGID','LABEL', 'DESCRIPTION']],
    how='left',
    on='CGID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched_date = merged_df_date[merged_df_date['CGID'].isna()]

# 결과 확인
unmatched_date

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,LABEL_x,CATEGORY,LABEL_y,DESCRIPTION


In [21]:
datetimeevents_df = merged_df_date

In [22]:
datetimeevents_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ITEMID', 'CHARTTIME',
       'RESULTSTATUS', 'STOPPED', 'LABEL_x', 'CATEGORY', 'LABEL_y',
       'DESCRIPTION'],
      dtype='object')

In [24]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {
   'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ITEMID','STORETIME', 'CGID',  "LABEL",
    "CHARTTIME", "VALUE", 'CATEGORY'
}

for index, row in datetimeevents_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    item_id = row["ITEMID"]
    variable_name = row["LABEL_x"]
    event_date = row["CHARTTIME"]
    value_d= row["VALUE"]
    recorder_position = row["LABEL_y"]
    
    # 기존 컬럼 처리
    for col in datetimeevents_df.columns:
        if col not in exclude_cols:
            value = row[col]
    
            rows.append({
                "Primary_key": index + 7405718,
                "Variable_ID": item_id,
                "Original_table_name": "DATETIMEEVENTS",
                "Variable_name": variable_name,
                "Event_date": event_date,
                "Value": value_d,
                "Unit":  np.nan,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": recorder_position,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

    
dateevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 먼저 계산
dateevents_quiq["Variable_type"] = dateevents_quiq["Value"].apply(infer_variable_type)

# CATEGORICAL_THRESHOLD 설정
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산 (NaN 제외)
value_counts = dateevents_quiq.groupby("Variable_name")["Value"].nunique(dropna=True)

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기
dateevents_quiq["Is_categorical"] = dateevents_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)

In [25]:
dateevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,,
1,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,,
2,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,,
3,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,,
4,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476339,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,,
1476340,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,,
1476341,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,,
1476342,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,,


In [26]:
# 1. 매핑 함수 수정
def map_mapping_info(row):
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # 값이 없으면 NaN
    else:
        return pd.Series(["date", np.nan])  # 값이 있으면 "date"로 고정

# 2. 적용
dateevents_quiq[["Mapping_info_1", "Mapping_info_2"]] = dateevents_quiq.apply(map_mapping_info, axis=1)

In [27]:
dateevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,date,
1,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,date,
2,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,date,
3,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,date,
4,7405718,3411,DATETIMEEVENTS,Equip Change [MM],2183-10-20 11:00:00,2183-10-27 00:00:00,,timestamp,0.0,,RRT,,21343,169687.0,,date,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476339,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,date,
1476340,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,date,
1476341,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,date,
1476342,7590260,227792,DATETIMEEVENTS,Impaired Skin - Dressing Change #3,2139-02-11 00:31:00,2139-02-09 09:35:00,,timestamp,0.0,,RN,,99491,125502.0,,date,


In [28]:
dateevents_quiq.to_csv("G:/2000/MIMIC_dateevents_QUIQ.csv", index=False)

In [29]:
# VIA 테이블 생성
via_variable_names = ['Equip Change [MM]', 'INV#1 Dsg Change [MM]',
       'INV#2 Dsg Change [MM]', 'INV#3 Dsg Change [MM]',
       'INV#4 Dsg Change [MM]', 'INV#5 Dsg Change [MM]',
       'INV#1 Cap Change [MM]', 'INV#6 Dsg Change [MM]',
       'INV#8 Dsg Change [MM]', 'INV#7 Dsg Change [MM]',
       'INV#1 Tubing Change [MM]', 'INV#2 Cap Change [MM]',
       'INV#2 Tubing Change [MM]', 'INV#3 Cap Change [MM]',
       'INV#3 Tubing Change [MM]', 'INV#4 Tubing Change [MM]',
       'INV#5 Tubing Change [MM]', 'INV#5 Cap Change [MM]',
       'INV#4 Cap Change [MM]', 'In-Line Sx Change [MM]',
       '22 Gauge Insertion Date', 'Hospital Admit Date',
       'INV#6 Cap Change [MM]', 'INV#6 Tubing Change [MM]',
       '20 Gauge Insertion Date', 'Arterial Line Dressing Change',
       'Arterial line Insertion Date', 'Arterial line Tubing Change',
       'Multi Lumen Cap Change', 'Multi Lumen Dressing Change',
       'Multi Lumen Insertion Date', 'Multi Lumen Tubing Change',
       'Arterial line Cap Change', 'Arterial line Change over Wire Date',
       'Midline Cap Change', 'Midline Dressing Change',
       'Midline Insertion Date', 'Date of Birth',
       '18 Gauge Insertion Date', 'INV#7 Cap Change [MM]',
       'Cordis/Introducer Insertion Date',
       'Cordis/Introducer Tubing Change', 'INV#7 Tubing Change [MM]',
       'Trauma Line Cap Change', 'Trauma Line Change over Wire Date',
       'Trauma Line Dressing Change', 'Sheath Dressing Change',
       'Sheath Insertion Date', 'Sheath Cap Change',
       'Sheath Tubing Change', 'GU Catheter Insertion Date',
       'Multi Lumen Change over Wire Date',
       'Cordis/Introducer Cap Change',
       'Cordis/Introducer Dressing Change', 'CCO PAC Cap Change',
       'PA Catheter Tubing Change', 'PA Catheter Cap Change',
       'PA Catheter Dressing Change', 'PA Catheter Insertion Date',
       'Cordis/Introducer Change over Wire Date', 'Last dialysis',
       'ScvO2 (Presep) Calibrated', 'AVA Insertion Date',
       'AVA Tubing Change', 'AVA Cap Change', 'AVA Dressing Change',
       'CCO PAC Dressing Change', 'CCO PAC Tubing Change',
       'CCO PAC Insertion Date', 'Dialysis Catheter Cap Change',
       'Dialysis Catheter Dressing Change',
       'Dialysis Catheter Insertion Date',
       'Dialysis CatheterTubing Change',
       'Impaired Skin  - Dressing Change #1',
       'Impaired Skin  - Dressing Change #2', 'Trauma Line Tubing Change',
       'Trauma Line Insertion Date', '16 Gauge Insertion Date',
       'Last menses', 'Impaired Skin  - Dressing Change #3',
       'SvO2 Calibrated', 'Presep Catheter Tubing Change',
       'Presep Catheter Cap Change', 'Presep Catheter Dressing Change',
       'Presep Catheter Insertion Date', 'PICC Line Cap Change',
       'PICC Line Insertion Date', 'PICC Line Change over Wire Date',
       'PICC Line Dressing Change', 'PICC Line Tubing Change',
       'Impaired Skin  - Dressing Change #4',
       'Impaired Skin  - Dressing Change #5', 'Date - Therapist',
       'RIC Insertion Date', 'Indwelling Port (PortaCath)Tubing Change',
       'Indwelling Port (PortaCath) Cap Change',
       'Indwelling (PortaCath) Dressing Change', 'Midline Tubing Change',
       'Impaired Skin  - Dressing Change #6',
       'Tunneled (Hickman) Dressing Change', 'Referral Date',
       'Tunneled (Hickman) Cap Change',
       'Tunneled (Hickman) Insertion Date', 'GU Catheter D/C Date',
       'Tunneled (Hickman) Tubing Change', 'Pheresis Catheter Cap Change',
       'Pheresis Catheter Dressing Change',
       'Pheresis Catheter Tubing Change',
       'Indwelling (PortaCath) Port #1 Date Accessed',
       'Dialysis Catheter Change over Wire Date',
       'Impaired Skin  - Dressing Change #9',
       'Pheresis Catheter Insertion Date',
       'Impaired Skin  - Dressing Change #7',
       'Indwelling (PortaCath) Port #2 Date Accessed',
       'IABP Dressing Change', 'IABP Insertion Date',
       'IABP Tubing Change', 'Impaired Skin  - Dressing Change #8',
       'Impaired Skin  - Dressing Change #10', 'ICP Line Dressing Change',
       'ICP Line Insertion Date', 'ICP Line Tubing Change',
       'Organ Bank Notified',
       'Indwelling Port (PortaCath) Insertion Date', 'IO Insertion Date',
       'Indwelling (PortaCath) Port #1 Date De-accessed',
       'Date - Student', 'Triple Introducer Cap Change',
       'Triple Introducer Dressing Change',
       'Triple Introducer Tubing Change',
       'Triple Introducer Insertion Date', '14 Gauge Insertion Date',
       'AVA Change over Wire Date', 'PA Catheter Change over Wire Date',
       'Presep Catheter Change over Wire Date',
       'Impella Aortic Pressure Tubing Change',
       'Impella Daily Tubing Change', 'Impella Dressing Change',
       'Impella Insertion Date',
       'Pheresis Catheter Change over Wire Date',
       'CCO PAC Change over Wire Date', 'Sheath Change over Wire Date',
       'Midline Change over Wire Date',
       'Triple Introducer Change over Wire Date',
       'Tandem Heart Return Cannula Dressing Change',
       'Tandem Heart Return Cannula Insertion Date',
       'Tandem Heart Access Line Insertion Date',
       'Tandem Heart Access Line Dressing Change',
       'Tandem Heart Access Line Tubing Change',
       'Indwelling (PortaCath) Port #2 Date De-accessed'
    
]

via_descriptions = ['Equip Change [MM]', 'INV#1 Dsg Change [MM]',
       'INV#2 Dsg Change [MM]', 'INV#3 Dsg Change [MM]',
       'INV#4 Dsg Change [MM]', 'INV#5 Dsg Change [MM]',
       'INV#1 Cap Change [MM]', 'INV#6 Dsg Change [MM]',
       'INV#8 Dsg Change [MM]', 'INV#7 Dsg Change [MM]',
       'INV#1 Tubing Change [MM]', 'INV#2 Cap Change [MM]',
       'INV#2 Tubing Change [MM]', 'INV#3 Cap Change [MM]',
       'INV#3 Tubing Change [MM]', 'INV#4 Tubing Change [MM]',
       'INV#5 Tubing Change [MM]', 'INV#5 Cap Change [MM]',
       'INV#4 Cap Change [MM]', 'In-Line Sx Change [MM]',
       '22 Gauge Insertion Date', 'Hospital Admit Date',
       'INV#6 Cap Change [MM]', 'INV#6 Tubing Change [MM]',
       '20 Gauge Insertion Date', 'Arterial Line Dressing Change',
       'Arterial line Insertion Date', 'Arterial line Tubing Change',
       'Multi Lumen Cap Change', 'Multi Lumen Dressing Change',
       'Multi Lumen Insertion Date', 'Multi Lumen Tubing Change',
       'Arterial line Cap Change', 'Arterial line Change over Wire Date',
       'Midline Cap Change', 'Midline Dressing Change',
       'Midline Insertion Date', 'Date of Birth',
       '18 Gauge Insertion Date', 'INV#7 Cap Change [MM]',
       'Cordis/Introducer Insertion Date',
       'Cordis/Introducer Tubing Change', 'INV#7 Tubing Change [MM]',
       'Trauma Line Cap Change', 'Trauma Line Change over Wire Date',
       'Trauma Line Dressing Change', 'Sheath Dressing Change',
       'Sheath Insertion Date', 'Sheath Cap Change',
       'Sheath Tubing Change', 'GU Catheter Insertion Date',
       'Multi Lumen Change over Wire Date',
       'Cordis/Introducer Cap Change',
       'Cordis/Introducer Dressing Change', 'CCO PAC Cap Change',
       'PA Catheter Tubing Change', 'PA Catheter Cap Change',
       'PA Catheter Dressing Change', 'PA Catheter Insertion Date',
       'Cordis/Introducer Change over Wire Date', 'Last dialysis',
       'ScvO2 (Presep) Calibrated', 'AVA Insertion Date',
       'AVA Tubing Change', 'AVA Cap Change', 'AVA Dressing Change',
       'CCO PAC Dressing Change', 'CCO PAC Tubing Change',
       'CCO PAC Insertion Date', 'Dialysis Catheter Cap Change',
       'Dialysis Catheter Dressing Change',
       'Dialysis Catheter Insertion Date',
       'Dialysis CatheterTubing Change',
       'Impaired Skin  - Dressing Change #1',
       'Impaired Skin  - Dressing Change #2', 'Trauma Line Tubing Change',
       'Trauma Line Insertion Date', '16 Gauge Insertion Date',
       'Last menses', 'Impaired Skin  - Dressing Change #3',
       'SvO2 Calibrated', 'Presep Catheter Tubing Change',
       'Presep Catheter Cap Change', 'Presep Catheter Dressing Change',
       'Presep Catheter Insertion Date', 'PICC Line Cap Change',
       'PICC Line Insertion Date', 'PICC Line Change over Wire Date',
       'PICC Line Dressing Change', 'PICC Line Tubing Change',
       'Impaired Skin  - Dressing Change #4',
       'Impaired Skin  - Dressing Change #5', 'Date - Therapist',
       'RIC Insertion Date', 'Indwelling Port (PortaCath)Tubing Change',
       'Indwelling Port (PortaCath) Cap Change',
       'Indwelling (PortaCath) Dressing Change', 'Midline Tubing Change',
       'Impaired Skin  - Dressing Change #6',
       'Tunneled (Hickman) Dressing Change', 'Referral Date',
       'Tunneled (Hickman) Cap Change',
       'Tunneled (Hickman) Insertion Date', 'GU Catheter D/C Date',
       'Tunneled (Hickman) Tubing Change', 'Pheresis Catheter Cap Change',
       'Pheresis Catheter Dressing Change',
       'Pheresis Catheter Tubing Change',
       'Indwelling (PortaCath) Port #1 Date Accessed',
       'Dialysis Catheter Change over Wire Date',
       'Impaired Skin  - Dressing Change #9',
       'Pheresis Catheter Insertion Date',
       'Impaired Skin  - Dressing Change #7',
       'Indwelling (PortaCath) Port #2 Date Accessed',
       'IABP Dressing Change', 'IABP Insertion Date',
       'IABP Tubing Change', 'Impaired Skin  - Dressing Change #8',
       'Impaired Skin  - Dressing Change #10', 'ICP Line Dressing Change',
       'ICP Line Insertion Date', 'ICP Line Tubing Change',
       'Organ Bank Notified',
       'Indwelling Port (PortaCath) Insertion Date', 'IO Insertion Date',
       'Indwelling (PortaCath) Port #1 Date De-accessed',
       'Date - Student', 'Triple Introducer Cap Change',
       'Triple Introducer Dressing Change',
       'Triple Introducer Tubing Change',
       'Triple Introducer Insertion Date', '14 Gauge Insertion Date',
       'AVA Change over Wire Date', 'PA Catheter Change over Wire Date',
       'Presep Catheter Change over Wire Date',
       'Impella Aortic Pressure Tubing Change',
       'Impella Daily Tubing Change', 'Impella Dressing Change',
       'Impella Insertion Date',
       'Pheresis Catheter Change over Wire Date',
       'CCO PAC Change over Wire Date', 'Sheath Change over Wire Date',
       'Midline Change over Wire Date',
       'Triple Introducer Change over Wire Date',
       'Tandem Heart Return Cannula Dressing Change',
       'Tandem Heart Return Cannula Insertion Date',
       'Tandem Heart Access Line Insertion Date',
       'Tandem Heart Access Line Dressing Change',
       'Tandem Heart Access Line Tubing Change',
       'Indwelling (PortaCath) Port #2 Date De-accessed'

]

via_datetimeevents = pd.DataFrame({
    'Original_table_name': 'DATETIMEEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_datetimeevents

Unnamed: 0,Original_table_name,Variable_name,Description
0,DATETIMEEVENTS,Equip Change [MM],Equip Change [MM]
1,DATETIMEEVENTS,INV#1 Dsg Change [MM],INV#1 Dsg Change [MM]
2,DATETIMEEVENTS,INV#2 Dsg Change [MM],INV#2 Dsg Change [MM]
3,DATETIMEEVENTS,INV#3 Dsg Change [MM],INV#3 Dsg Change [MM]
4,DATETIMEEVENTS,INV#4 Dsg Change [MM],INV#4 Dsg Change [MM]
...,...,...,...
145,DATETIMEEVENTS,Tandem Heart Return Cannula Insertion Date,Tandem Heart Return Cannula Insertion Date
146,DATETIMEEVENTS,Tandem Heart Access Line Insertion Date,Tandem Heart Access Line Insertion Date
147,DATETIMEEVENTS,Tandem Heart Access Line Dressing Change,Tandem Heart Access Line Dressing Change
148,DATETIMEEVENTS,Tandem Heart Access Line Tubing Change,Tandem Heart Access Line Tubing Change


In [30]:
via_datetimeevents.to_csv("G:/2000/MIMIC_datetimeevents_QUIQ.csv", index=False)

# Inputevents_CV - procedure같음

In [157]:
inputevents_cv = pd.read_csv('INPUTEVENTS_CV.csv.gz', compression='gzip')
inputevents_cv

  inputevents_cv = pd.read_csv('INPUTEVENTS_CV.csv.gz', compression='gzip')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,RATEUOM,...,ORDERID,LINKORDERID,STOPPED,NEWBOTTLE,ORIGINALAMOUNT,ORIGINALAMOUNTUOM,ORIGINALROUTE,ORIGINALRATE,ORIGINALRATEUOM,ORIGINALSITE
0,592,24457,184834.0,205776.0,2193-09-11 09:00:00,30056,100.0,ml,,,...,756654,9359133,,,,ml,Oral,,,
1,593,24457,184834.0,205776.0,2193-09-11 12:00:00,30056,200.0,ml,,,...,3564075,9359133,,,,ml,Oral,,,
2,594,24457,184834.0,205776.0,2193-09-11 16:00:00,30056,160.0,ml,,,...,422646,9359133,,,,ml,Oral,,,
3,595,24457,184834.0,205776.0,2193-09-11 19:00:00,30056,240.0,ml,,,...,5137889,9359133,,,,ml,Oral,,,
4,596,24457,184834.0,205776.0,2193-09-11 21:00:00,30056,50.0,ml,,,...,8343792,9359133,,,,ml,Oral,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17527930,17422687,27946,169481.0,212466.0,2154-07-16 22:00:00,40864,82.0,ml,,,...,10296124,4180996,,,,,,,,
17527931,17422688,27946,169481.0,212466.0,2154-07-16 23:00:00,40864,82.0,ml,,,...,10248887,4180996,,,,,,,,
17527932,17422689,27946,169481.0,212466.0,2154-07-17 00:00:00,40864,,ml,,,...,7418343,4180996,,,,,,,,
17527933,17422690,30516,132490.0,260091.0,2105-08-25 06:30:00,46046,31.0,ml,,,...,1753414,4180996,,,,,,,,


In [160]:
merged_df_input = pd.merge(
    inputevents_cv,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched = merged_df_input[merged_df_input['LABEL'].isna()]

# 결과 확인
unmatched

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,RATEUOM,...,STOPPED,NEWBOTTLE,ORIGINALAMOUNT,ORIGINALAMOUNTUOM,ORIGINALROUTE,ORIGINALRATE,ORIGINALRATEUOM,ORIGINALSITE,LABEL,CATEGORY
2878700,2944395,12581,107814.0,229220.0,2106-02-28 12:00:00,30140,6.900000,ml,,,...,,,50.0,vl,Intravenous Push,,,,,
2878701,2944396,12581,107814.0,229220.0,2106-02-28 14:00:00,30140,6.900000,ml,,,...,,,50.0,vl,Intravenous Push,,,,,
2878702,2944397,12581,107814.0,229220.0,2106-02-28 16:00:00,30140,6.900000,ml,,,...,,,50.0,vl,Intravenous Push,,,,,
2878703,2944398,12581,107814.0,229220.0,2106-02-28 18:00:00,30140,10.350000,ml,,,...,,,50.0,vl,Intravenous Push,,,,,
2878704,2944399,12581,107814.0,229220.0,2106-02-28 20:00:00,30140,10.350000,ml,,,...,,,50.0,vl,Intravenous Push,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9017633,8908891,31005,120863.0,289192.0,2106-01-28 18:00:00,30140,21.799999,ml,,,...,,,100.0,vl,Intravenous Push,1962.899902,ml/hr,,,
9017634,8908892,31005,120863.0,289192.0,2106-01-28 20:00:00,30140,40.348495,ml,,,...,,,100.0,vl,Intravenous Push,1962.899902,ml/hr,,,
9017635,8908893,31005,120863.0,289192.0,2106-01-28 21:00:00,30140,19.628998,ml,,,...,,,100.0,vl,Intravenous Push,1962.899902,ml/hr,,,
9017636,8908894,31005,120863.0,289192.0,2106-01-28 22:00:00,30140,19.600000,ml,,,...,,,100.0,vl,Intravenous Push,1962.899902,ml/hr,,,


- D_ITEMS에서 직접 확인한 결과 30140 은 LABEL, CATEGORY가 없음

In [161]:
inputevents_cv_df = merged_df_input

In [162]:
inputevents_cv_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID',
       'AMOUNT', 'AMOUNTUOM', 'RATE', 'RATEUOM', 'STORETIME', 'CGID',
       'ORDERID', 'LINKORDERID', 'STOPPED', 'NEWBOTTLE', 'ORIGINALAMOUNT',
       'ORIGINALAMOUNTUOM', 'ORIGINALROUTE', 'ORIGINALRATE', 'ORIGINALRATEUOM',
       'ORIGINALSITE', 'LABEL', 'CATEGORY'],
      dtype='object')

In [163]:
inputevents_cv_df = inputevents_cv_df[inputevents_cv_df["SUBJECT_ID"].isin(la)].reset_index(drop=True)
inputevents_cv_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,RATEUOM,...,STOPPED,NEWBOTTLE,ORIGINALAMOUNT,ORIGINALAMOUNTUOM,ORIGINALROUTE,ORIGINALRATE,ORIGINALRATEUOM,ORIGINALSITE,LABEL,CATEGORY
0,189,14469,155925.0,204432.0,2111-06-20 18:00:00,30056,120.0,ml,,,...,,,,ml,Oral,,,,Po Intake,
1,190,14469,155925.0,204432.0,2111-06-20 19:00:00,30056,60.0,ml,,,...,,,,ml,Oral,,,,Po Intake,
2,191,14469,155925.0,204432.0,2111-06-21 08:00:00,30056,90.0,ml,,,...,,,,ml,Oral,,,,Po Intake,
3,192,14469,155925.0,204432.0,2111-06-21 10:00:00,30056,120.0,ml,,,...,,,,ml,Oral,,,,Po Intake,
4,193,14469,155925.0,204432.0,2111-06-21 12:00:00,30056,120.0,ml,,,...,,,,ml,Oral,,,,Po Intake,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754730,17449048,7676,197621.0,218811.0,2117-01-22 08:00:00,44994,2500.0,ml,,,...,,,,,,,,,PD Dwell,Free Form Intake
754731,17449049,7676,197621.0,218811.0,2117-01-22 10:00:00,44994,,ml,,,...,,,,,,,,,PD Dwell,Free Form Intake
754732,17449050,7676,197621.0,218811.0,2117-01-22 13:00:00,44994,2500.0,ml,,,...,,,,,,,,,PD Dwell,Free Form Intake
754733,17449051,7676,197621.0,218811.0,2117-01-22 18:00:00,44994,2500.0,ml,,,...,,,,,,,,,PD Dwell,Free Form Intake


In [164]:
inputevents_cv_df['LABEL'].unique()

array(['Po Intake', 'D5W', 'Lactated Ringers', 'IV Piggyback',
       '.9% Normal Saline', 'Carrier', 'OR Crystalloid',
       'PACU Crystalloids', 'TF Residual', 'OR Colloid',
       'OR Autologous Blood', 'D5/.45NS', 'Vivonex', 'Sterile Water', nan,
       "Packed RBC's", 'Dextrose 10%', '.45% Normal Saline', 'D5NS',
       'Gastric Meds', 'PACU Colloids', 'Pre-Admission Intake', 'OR FFP',
       'Fresh Frozen Plasma', "OR Packed RBC's", 'Nepro', 'Platelets',
       'TPN', 'PPN', 'Replete w/fiber', 'Cath Lab Intake', 'TPN w/Lipids',
       'D5 Ringers Lact.', 'D5 Normal Saline', 'D5RL', 'Impact w/fiber',
       'OR Platelets', 'Albumin 5%', 'Dextran 40', 'Albumin 25%',
       'Deliver 2.0', 'Lipids', 'Hespan', 'Ultracal', 'Free Water Bolus',
       'Peptamen', 'Tube Feeding', 'GT Flush', 'Cell Saver',
       'Cryoprecipitate', 'Fentanyl Base', 'Isocal HN', "Washed PRBC's",
       'Respalor', 'Promote w/fiber', 'Criticare HN', '3% Normal Saline',
       'Promote', 'Other Blood Product

In [158]:
inputevents_cv_quiq = pd.read_csv("G:/2000/MIMIC_inputeventscv_QUIQ.csv")
inputevents_cv_quiq

  inputevents_cv_quiq = pd.read_csv("G:/2000/MIMIC_inputeventscv_QUIQ.csv")


Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7590261,,INPUTEVENTS_CV,CHARTTIME,,2111-06-20 18:00:00,,timestamp,0,,,,14469,155925.0,,date,
1,7590261,,INPUTEVENTS_CV,AMOUNT,,120.0,ml,numeric,0,,,,14469,155925.0,,prescription,prescription_info
2,7590261,,INPUTEVENTS_CV,RATE,,,,,0,,,,14469,155925.0,,,
3,7590261,,INPUTEVENTS_CV,CGID,,18424.0,,numeric,0,,,,14469,155925.0,,,
4,7590261,,INPUTEVENTS_CV,ORDERID,,3468909,,numeric,0,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11321020,8344995,,INPUTEVENTS_CV,ORIGINALROUTE,,,,,0,,,,7676,197621.0,,,
11321021,8344995,,INPUTEVENTS_CV,ORIGINALRATE,,,,,0,,,,7676,197621.0,,,
11321022,8344995,,INPUTEVENTS_CV,ORIGINALRATEUOM,,,,,1,,,,7676,197621.0,,,
11321023,8344995,,INPUTEVENTS_CV,ORIGINALSITE,,,,,0,,,,7676,197621.0,,,


In [170]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',  'AMOUNTUOM', 'RATEUOM', "STORETIME", "ORIGINALRATEUOM",
                "ORIGINALAMOUNTUOM","ITEMID", "CATEGORY"}


# admission 반복 처리
for index, row in inputevents_cv_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in inputevents_cv_df.columns:
        if col not in exclude_cols:
            value = row[col]

            # Unit 처리 조건
            if col == "AMOUNT":
                unit_val = row["AMOUNTUOM"] if "AMOUNTUOM" in inputevents_cv_df.columns else np.nan
            elif col == "RATE":
                unit_val = row["RATEUOM"] if "RATEUOM" in inputevents_cv_df.columns else np.nan
            elif col == "ORIGINALRATE":
                unit_val = row['ORIGINALRATEUOM'] if "ORIGINALRATEUOM"in inputevents_cv_df.columns else np.nan
            elif col == "ORIGINALAMOUNT":
                unit_val = row["ORIGINALAMOUNTUOM"] if "ORIGINALAMOUNTUOM" in inputevents_cv_df.columns else np.nan
            else:
                unit_val = np.nan
                
            # Event_date 설정
            if col == "LABEL":
                event_date_val = row["STORETIME"] if "STORETIME" in inputevents_cv_df.columns else np.nan
            else:
                event_date_val = np.nan
                
            # Variable_ID 설정: LABEL일 때만 ITEMID 사용
            variable_id_val = row["ITEMID"] if col == "LABEL" and "ITEMID" in inputevents_cv_df.columns else np.nan

            rows.append({
                "Primary_key": index + 7590261,
                "Variable_ID": variable_id_val,
                "Original_table_name": "INPUTEVENTS_CV",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
inputevents_cv_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
inputevents_cv_quiq["Variable_type"] = inputevents_cv_quiq["Value"].apply(infer_variable_type)

# -------------------------------
CATEGORICAL_THRESHOLD = 10

# 1. 각 Variable_name 별 고유값 수 계산
value_counts = inputevents_cv_quiq.groupby("Variable_name")["Value"].nunique()

# 2. 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# 3. Is_categorical 컬럼 채우기 (Value가 NaN이면 NaN)
inputevents_cv_quiq["Is_categorical"] = inputevents_cv_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
inputevents_cv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7590261,,INPUTEVENTS_CV,CHARTTIME,,2111-06-20 18:00:00,,timestamp,0.0,,,,14469,155925.0,,,
1,7590261,,INPUTEVENTS_CV,AMOUNT,,120.0,ml,numeric,0.0,,,,14469,155925.0,,,
2,7590261,,INPUTEVENTS_CV,RATE,,,,,,,,,14469,155925.0,,,
3,7590261,,INPUTEVENTS_CV,CGID,,18424.0,,numeric,0.0,,,,14469,155925.0,,,
4,7590261,,INPUTEVENTS_CV,ORDERID,,3468909,,numeric,0.0,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9811550,8344995,,INPUTEVENTS_CV,ORIGINALAMOUNT,,,,,,,,,7676,197621.0,,,
9811551,8344995,,INPUTEVENTS_CV,ORIGINALROUTE,,,,,,,,,7676,197621.0,,,
9811552,8344995,,INPUTEVENTS_CV,ORIGINALRATE,,,,,,,,,7676,197621.0,,,
9811553,8344995,,INPUTEVENTS_CV,ORIGINALSITE,,,,,,,,,7676,197621.0,,,


In [171]:
# 2. LINKORDERID, ORDERID, LABEL, ORIGINAL → Is_categorical = 1 or NaN (if Value is NaN)
target_vars = ["LINKORDERID", "ORDERID", "CGID","LABEL", "ORIGINAL"]
condition_targets = inputevents_cv_quiq["Variable_name"].isin(target_vars)

inputevents_cv_quiq.loc[condition_targets, "Is_categorical"] = inputevents_cv_quiq.loc[condition_targets, "Value"].apply(
    lambda v: 1 if pd.notna(v) else np.nan
)

# 3. LINKORDERID, ORDERID → Variable_type = "string" (only if Value is not null)
cond_linkorderid = (
    (inputevents_cv_quiq["Variable_name"] == "LINKORDERID") &
    (inputevents_cv_quiq["Value"].notna())
)
inputevents_cv_quiq.loc[cond_linkorderid, "Variable_type"] = "string"

cond_orderid = (
    (inputevents_cv_quiq["Variable_name"] == "ORDERID") &
    (inputevents_cv_quiq["Value"].notna())
)
inputevents_cv_quiq.loc[cond_orderid, "Variable_type"] = "string"

cond_cgid = (
    (inputevents_cv_quiq["Variable_name"] == "CGID") &
    (inputevents_cv_quiq["Value"].notna())
)
inputevents_cv_quiq.loc[cond_cgid, "Variable_type"] = "string"

inputevents_cv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7590261,,INPUTEVENTS_CV,CHARTTIME,,2111-06-20 18:00:00,,timestamp,0.0,,,,14469,155925.0,,,
1,7590261,,INPUTEVENTS_CV,AMOUNT,,120.0,ml,numeric,0.0,,,,14469,155925.0,,,
2,7590261,,INPUTEVENTS_CV,RATE,,,,,,,,,14469,155925.0,,,
3,7590261,,INPUTEVENTS_CV,CGID,,18424.0,,string,1.0,,,,14469,155925.0,,,
4,7590261,,INPUTEVENTS_CV,ORDERID,,3468909,,string,1.0,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9811550,8344995,,INPUTEVENTS_CV,ORIGINALAMOUNT,,,,,,,,,7676,197621.0,,,
9811551,8344995,,INPUTEVENTS_CV,ORIGINALROUTE,,,,,,,,,7676,197621.0,,,
9811552,8344995,,INPUTEVENTS_CV,ORIGINALRATE,,,,,,,,,7676,197621.0,,,
9811553,8344995,,INPUTEVENTS_CV,ORIGINALSITE,,,,,,,,,7676,197621.0,,,


In [172]:
# Mapping
mapping_rules = {
    'CHARTTIME':("date", np.nan), 
    'AMOUNT': ("prescription", "prescription_info"),
    'ORIGINALRATE': ("prescription", "prescription_info"),
    'ORIGINALAMOUNT': ("prescription", "prescription_info")
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "LABEL":
        return pd.Series(["prescription", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
inputevents_cv_quiq[["Mapping_info_1", "Mapping_info_2"]] = inputevents_cv_quiq.apply(map_mapping_info, axis=1)
inputevents_cv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7590261,,INPUTEVENTS_CV,CHARTTIME,,2111-06-20 18:00:00,,timestamp,0.0,,,,14469,155925.0,,date,
1,7590261,,INPUTEVENTS_CV,AMOUNT,,120.0,ml,numeric,0.0,,,,14469,155925.0,,prescription,prescription_info
2,7590261,,INPUTEVENTS_CV,RATE,,,,,,,,,14469,155925.0,,,
3,7590261,,INPUTEVENTS_CV,CGID,,18424.0,,string,1.0,,,,14469,155925.0,,,
4,7590261,,INPUTEVENTS_CV,ORDERID,,3468909,,string,1.0,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9811550,8344995,,INPUTEVENTS_CV,ORIGINALAMOUNT,,,,,,,,,7676,197621.0,,,
9811551,8344995,,INPUTEVENTS_CV,ORIGINALROUTE,,,,,,,,,7676,197621.0,,,
9811552,8344995,,INPUTEVENTS_CV,ORIGINALRATE,,,,,,,,,7676,197621.0,,,
9811553,8344995,,INPUTEVENTS_CV,ORIGINALSITE,,,,,,,,,7676,197621.0,,,


In [175]:
condition = (
    (inputevents_cv_quiq["Variable_name"] == "ORIGINALROUTE") &
    (inputevents_cv_quiq["Is_categorical"] == 0)
)

# 해당 조건에 맞는 행들의 Is_categorical 값을 0으로 변경
inputevents_cv_quiq.loc[condition, "Is_categorical"] = 1
inputevents_cv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7590261,,INPUTEVENTS_CV,CHARTTIME,,2111-06-20 18:00:00,,timestamp,0.0,,,,14469,155925.0,,date,
1,7590261,,INPUTEVENTS_CV,AMOUNT,,120.0,ml,numeric,0.0,,,,14469,155925.0,,prescription,prescription_info
2,7590261,,INPUTEVENTS_CV,RATE,,,,,,,,,14469,155925.0,,,
3,7590261,,INPUTEVENTS_CV,CGID,,18424.0,,string,1.0,,,,14469,155925.0,,,
4,7590261,,INPUTEVENTS_CV,ORDERID,,3468909,,string,1.0,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9811550,8344995,,INPUTEVENTS_CV,ORIGINALAMOUNT,,,,,,,,,7676,197621.0,,,
9811551,8344995,,INPUTEVENTS_CV,ORIGINALROUTE,,,,,,,,,7676,197621.0,,,
9811552,8344995,,INPUTEVENTS_CV,ORIGINALRATE,,,,,,,,,7676,197621.0,,,
9811553,8344995,,INPUTEVENTS_CV,ORIGINALSITE,,,,,,,,,7676,197621.0,,,


In [176]:
inputevents_cv_quiq.head(30)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,7590261,,INPUTEVENTS_CV,CHARTTIME,,2111-06-20 18:00:00,,timestamp,0.0,,,,14469,155925.0,,date,
1,7590261,,INPUTEVENTS_CV,AMOUNT,,120.0,ml,numeric,0.0,,,,14469,155925.0,,prescription,prescription_info
2,7590261,,INPUTEVENTS_CV,RATE,,,,,,,,,14469,155925.0,,,
3,7590261,,INPUTEVENTS_CV,CGID,,18424.0,,string,1.0,,,,14469,155925.0,,,
4,7590261,,INPUTEVENTS_CV,ORDERID,,3468909,,string,1.0,,,,14469,155925.0,,,
5,7590261,,INPUTEVENTS_CV,LINKORDERID,,3252849,,string,1.0,,,,14469,155925.0,,,
6,7590261,,INPUTEVENTS_CV,STOPPED,,,,,,,,,14469,155925.0,,,
7,7590261,,INPUTEVENTS_CV,NEWBOTTLE,,,,,,,,,14469,155925.0,,,
8,7590261,,INPUTEVENTS_CV,ORIGINALAMOUNT,,,ml,,,,,,14469,155925.0,,,
9,7590261,,INPUTEVENTS_CV,ORIGINALROUTE,,Oral,,string,1.0,,,,14469,155925.0,,,


In [177]:
inputevents_cv_quiq.to_csv("G:/2000/MIMIC_inputeventscv_QUIQ.csv", index=False)

In [41]:
inputevents_cv_quiq['Variable_name'].unique()

array(['CHARTTIME', 'AMOUNT', 'RATE', 'CGID', 'ORDERID', 'LINKORDERID',
       'STOPPED', 'NEWBOTTLE', 'ORIGINALAMOUNT', 'ORIGINALAMOUNTUOM',
       'ORIGINALROUTE', 'ORIGINALRATE', 'ORIGINALRATEUOM', 'ORIGINALSITE',
       'LABEL'], dtype=object)

In [43]:
# VIA 테이블 생성
via_variable_names = ['CHARTTIME', 'AMOUNT', 'RATE', 'CGID', 'ORDERID', 'LINKORDERID',
       'STOPPED', 'NEWBOTTLE', 'ORIGINALAMOUNT','ORIGINALROUTE', 'ORIGINALRATE', 'ORIGINALSITE',
       'LABEL']

via_descriptions = ['the time at which the measurement was chrted',
                    'amount of a drug or substance administered to the patient either between the starttime and endtime',
                    'the rate at which the drug or substance was administered to the patient either between the starttime and endtime',
                    'identifier for the caregiver who validated the given measurement',
                    "links multiple items contatined in the same solution together",
                    "links the same order across multiple instantiations",
                    "indicates whether the infusion has been disconnected or continued",
                    "indicates if a new preparation of the solution was hung at the bedside",
                    "provide information about the solution the medication was a part of when it was first entered into the information system",
                    "provide information about the solution the medication was a part of when it was first entered into the information system",
                    "provide information about the solution the medication was a part of when it was first entered into the information system",
                    "provide information about the solution the medication was a part of when it was first entered into the information system" ,
                    "LABEL"
                   ]


via_inputevents_cv = pd.DataFrame({
    'Original_table_name': 'INPUTEVENTS_CV',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_inputevents_cv

Unnamed: 0,Original_table_name,Variable_name,Description
0,INPUTEVENTS_CV,CHARTTIME,the time at which the measurement was chrted
1,INPUTEVENTS_CV,AMOUNT,amount of a drug or substance administered to ...
2,INPUTEVENTS_CV,RATE,the rate at which the drug or substance was ad...
3,INPUTEVENTS_CV,CGID,identifier for the caregiver who validated the...
4,INPUTEVENTS_CV,ORDERID,links multiple items contatined in the same so...
5,INPUTEVENTS_CV,LINKORDERID,links the same order across multiple instantia...
6,INPUTEVENTS_CV,STOPPED,indicates whether the infusion has been discon...
7,INPUTEVENTS_CV,NEWBOTTLE,indicates if a new preparation of the solution...
8,INPUTEVENTS_CV,ORIGINALAMOUNT,provide information about the solution the med...
9,INPUTEVENTS_CV,ORIGINALAMOUNTUOM,provide information about the solution the med...


In [44]:
via_inputevents_cv.to_csv("G:/2000/MIMIC_inputeventscv_VIA.csv", index=False)

# Inputevents_MV - procedure같음

In [178]:
inputevents_mv = pd.read_csv('INPUTEVENTS_MV.csv.gz', compression='gzip')
inputevents_mv

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,...,TOTALAMOUNTUOM,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,ORIGINALAMOUNT,ORIGINALRATE
0,241,27063,139787,223259.0,2133-02-05 06:29:00,2133-02-05 08:45:00,225166,6.774532,mEq,,...,ml,0,0,1,Rewritten,,RN,2133-02-05 12:52:00,10.000000,0.050000
1,242,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225944,28.132997,ml,30.142497,...,ml,0,0,0,FinishedRunning,,,,28.132998,30.255817
2,243,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225166,2.813300,mEq,,...,ml,0,0,0,FinishedRunning,,,,2.813300,0.050426
3,244,27063,139787,223259.0,2133-02-03 12:00:00,2133-02-03 12:01:00,225893,1.000000,dose,,...,ml,0,0,2,Rewritten,RN,,2133-02-03 17:06:00,1.000000,1.000000
4,245,27063,139787,223259.0,2133-02-03 12:00:00,2133-02-03 12:01:00,220949,100.000000,ml,,...,ml,0,0,2,Rewritten,RN,,2133-02-03 17:06:00,100.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3618986,3568968,90959,136680,240741.0,2147-08-28 12:00:00,2147-08-28 12:01:00,221744,99.999997,mcg,,...,,0,0,0,FinishedRunning,,,,100.000000,100.000000
3618987,3568969,90959,136680,240741.0,2147-08-29 12:16:00,2147-08-29 15:04:00,225942,0.842267,mg,300.809532,...,ml,0,0,0,Paused,,,,2.500000,300.000000
3618988,3568970,90959,136680,240741.0,2147-08-29 12:16:00,2147-08-29 15:04:00,225943,16.845331,ml,6.016190,...,ml,0,0,0,Paused,,,,50.000000,6.000000
3618989,3568971,90959,136680,240741.0,2147-08-29 02:30:00,2147-08-29 02:31:00,221744,99.999997,mcg,,...,,0,0,0,FinishedRunning,,,,100.000000,100.000000


In [179]:
merged_df_input = pd.merge(
    inputevents_mv,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched = merged_df_input[merged_df_input['LABEL'].isna()]

# 결과 확인
unmatched

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,...,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,ORIGINALAMOUNT,ORIGINALRATE,LABEL,CATEGORY


In [180]:
inputevents_mv_df = merged_df_input

In [181]:
inputevents_mv_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTTIME', 'ENDTIME',
       'ITEMID', 'AMOUNT', 'AMOUNTUOM', 'RATE', 'RATEUOM', 'STORETIME', 'CGID',
       'ORDERID', 'LINKORDERID', 'ORDERCATEGORYNAME',
       'SECONDARYORDERCATEGORYNAME', 'ORDERCOMPONENTTYPEDESCRIPTION',
       'ORDERCATEGORYDESCRIPTION', 'PATIENTWEIGHT', 'TOTALAMOUNT',
       'TOTALAMOUNTUOM', 'ISOPENBAG', 'CONTINUEINNEXTDEPT', 'CANCELREASON',
       'STATUSDESCRIPTION', 'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY',
       'COMMENTS_DATE', 'ORIGINALAMOUNT', 'ORIGINALRATE', 'LABEL', 'CATEGORY'],
      dtype='object')

In [182]:
inputevents_mv_df = inputevents_mv_df[inputevents_mv_df["SUBJECT_ID"].isin(la)].reset_index(drop=True)
inputevents_mv_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,...,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,ORIGINALAMOUNT,ORIGINALRATE,LABEL,CATEGORY
0,12147,27366,174736,248077.0,2143-03-02 20:00:00,2143-03-02 20:01:00,226452,120.000000,ml,,...,0,0,FinishedRunning,,,,120.000000,120.000000,PO Intake,Fluids/Intake
1,12148,27366,174736,248077.0,2143-03-04 20:00:00,2143-03-04 20:01:00,225893,1.000000,dose,,...,0,0,FinishedRunning,,,,1.000000,1.000000,Piperacillin/Tazobactam (Zosyn),Antibiotics
2,12149,27366,174736,248077.0,2143-03-04 20:00:00,2143-03-04 20:01:00,220949,100.000000,ml,,...,0,0,FinishedRunning,,,,100.000000,0.000000,Dextrose 5%,Fluids/Intake
3,12150,27366,174736,248077.0,2143-03-01 15:12:00,2143-03-02 10:25:00,222168,1000.000013,mg,10.003491,...,0,1,Rewritten,,RN,2143-03-01 16:16:00,1000.000000,10.000001,Propofol,Medications
4,12151,27366,174736,248077.0,2143-03-01 15:12:00,2143-03-02 10:25:00,225943,100.000004,ml,5.203816,...,0,1,Rewritten,,RN,2143-03-01 16:16:00,100.000000,5.203800,Solution,Fluids/Intake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146149,3572771,51343,160605,261872.0,2199-08-30 06:57:00,2199-08-30 09:01:00,220949,16.821499,ml,8.139435,...,0,0,FinishedRunning,,,,16.821501,8.139427,Dextrose 5%,Fluids/Intake
146150,3572772,51343,160605,261872.0,2199-08-30 18:15:00,2199-08-30 19:15:00,227522,49.999999,ml,49.999999,...,0,0,FinishedRunning,,,,50.000000,50.000000,KCL (Bolus),Medications
146151,3572773,51343,160605,261872.0,2199-08-30 18:15:00,2199-08-30 19:15:00,225166,20.000000,mEq,,...,0,0,FinishedRunning,,,,20.000000,0.333333,Potassium Chloride,Medications
146152,3572774,51343,160605,261872.0,2199-08-29 19:30:00,2199-08-29 20:30:00,225168,350.000010,ml,350.000010,...,0,0,FinishedRunning,,,,350.000000,350.000000,Packed Red Blood Cells,Blood Products/Colloids


In [183]:
inputevents_mv_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTTIME', 'ENDTIME',
       'ITEMID', 'AMOUNT', 'AMOUNTUOM', 'RATE', 'RATEUOM', 'STORETIME', 'CGID',
       'ORDERID', 'LINKORDERID', 'ORDERCATEGORYNAME',
       'SECONDARYORDERCATEGORYNAME', 'ORDERCOMPONENTTYPEDESCRIPTION',
       'ORDERCATEGORYDESCRIPTION', 'PATIENTWEIGHT', 'TOTALAMOUNT',
       'TOTALAMOUNTUOM', 'ISOPENBAG', 'CONTINUEINNEXTDEPT', 'CANCELREASON',
       'STATUSDESCRIPTION', 'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY',
       'COMMENTS_DATE', 'ORIGINALAMOUNT', 'ORIGINALRATE', 'LABEL', 'CATEGORY'],
      dtype='object')

In [188]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',  'AMOUNTUOM', 
                'RATEUOM',"ORIGINALAMOUNTUOM","TOTALAMOUNTUOM", "STORETIME", "ITEMID", "CATEGORY"}


# admission 반복 처리
for index, row in inputevents_mv_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in inputevents_mv_df.columns:
        if col not in exclude_cols:
            value = row[col]

            # Unit 처리 조건
            if col == "AMOUNT":
                unit_val = row["AMOUNTUOM"] if "AMOUNTUOM" in inputevents_mv_df.columns else np.nan
            elif col == "RATE":
                unit_val = row["RATEUOM"] if "RATEUOM" in inputevents_mv_df.columns else np.nan
            elif col == "ORIGINALAMOUNT":
                unit_val = row["ORIGINALAMOUNTUOM"] if "ORIGINALAMOUNTUOM" in inputevents_mv_df.columns else np.nan
            elif col == "TOTALAMOUNT":
                unit_val = row["TOTALAMOUNTUOM"] if "TOTALAMOUNTUOM" in inputevents_mv_df.columns else np.nan
            else:
                unit_val = np.nan
                
            # Event_date 설정
            if col == "LABEL":
                event_date_val = row["STORETIME"] if "STORETIME" in inputevents_mv_df.columns else np.nan
            else:
                event_date_val = np.nan
                
            # Variable_ID 설정: LABEL일 때만 ITEMID 사용
            variable_id_val = row["ITEMID"] if col == "LABEL" and "ITEMID" in inputevents_mv_df.columns else np.nan

            rows.append({
                "Primary_key": index + 8344996 ,
                "Variable_ID": variable_id_val,
                "Original_table_name": "INPUTEVENTS_MV",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
inputevents_mv_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
inputevents_mv_quiq["Variable_type"] = inputevents_mv_quiq["Value"].apply(infer_variable_type)

# --------------------------------
CATEGORICAL_THRESHOLD = 10

# 1. Variable_name 별 고유값 수 계산
value_counts = inputevents_mv_quiq.groupby("Variable_name")["Value"].nunique()

# 2. 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# 3. Is_categorical 채우기 (Value가 NaN 또는 0이면 NaN)
inputevents_mv_quiq["Is_categorical"] = inputevents_mv_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"]) or row["Value"] == 0
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)

inputevents_mv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8344996,,INPUTEVENTS_MV,STARTTIME,,2143-03-02 20:00:00,,timestamp,0.0,,,,27366,174736,,,
1,8344996,,INPUTEVENTS_MV,ENDTIME,,2143-03-02 20:01:00,,timestamp,0.0,,,,27366,174736,,,
2,8344996,,INPUTEVENTS_MV,AMOUNT,,120.0,ml,numeric,0.0,,,,27366,174736,,,
3,8344996,,INPUTEVENTS_MV,RATE,,,,,,,,,27366,174736,,,
4,8344996,,INPUTEVENTS_MV,CGID,,14411,,numeric,0.0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3361537,8491149,,INPUTEVENTS_MV,COMMENTS_CANCELEDBY,,,,,,,,,51343,160605,,,
3361538,8491149,,INPUTEVENTS_MV,COMMENTS_DATE,,,,,,,,,51343,160605,,,
3361539,8491149,,INPUTEVENTS_MV,ORIGINALAMOUNT,,5.0,,numeric,0.0,,,,51343,160605,,,
3361540,8491149,,INPUTEVENTS_MV,ORIGINALRATE,,5.0,,numeric,0.0,,,,51343,160605,,,


In [5]:
inputevents_mv_quiq = pd.read_csv("G:/2000/QUIQ/MIMIC_inputeventsmv_QUIQ.csv")
inputevents_mv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8344996,,INPUTEVENTS_MV,STARTTIME,,2143-03-02 20:00:00,,timestamp,0,,,,27366,174736,,date,
1,8344996,,INPUTEVENTS_MV,ENDTIME,,2143-03-02 20:01:00,,timestamp,0,,,,27366,174736,,date,
2,8344996,,INPUTEVENTS_MV,AMOUNT,,120.0,ml,numeric,0,,,,27366,174736,,prescription,prescription_info
3,8344996,,INPUTEVENTS_MV,RATE,,,,,0,,,,27366,174736,,,
4,8344996,,INPUTEVENTS_MV,CGID,,14411,,numeric,0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3507691,8491149,,INPUTEVENTS_MV,COMMENTS_CANCELEDBY,,,,,1,,,,51343,160605,,,
3507692,8491149,,INPUTEVENTS_MV,COMMENTS_DATE,,,,,0,,,,51343,160605,,,
3507693,8491149,,INPUTEVENTS_MV,ORIGINALAMOUNT,,5.0,,numeric,0,,,,51343,160605,,,
3507694,8491149,,INPUTEVENTS_MV,ORIGINALRATE,,5.0,,numeric,0,,,,51343,160605,,,


In [6]:
# 2. LINKORDERID, ORDERID, LABEL, ORIGINAL → Is_categorical = 1 or NaN (if Value is NaN)
target_vars = ["LINKORDERID", "ORDERID", "CGID","LABEL", "ORIGINAL", "ORDERCATEGORYNAME", "SECONDARYORDERCATEGORYNAME"]
condition_targets = inputevents_mv_quiq["Variable_name"].isin(target_vars)

inputevents_mv_quiq.loc[condition_targets, "Is_categorical"] = inputevents_mv_quiq.loc[condition_targets, "Value"].apply(
    lambda v: 1 if pd.notna(v) else np.nan
)

# 3. LINKORDERID, ORDERID → Variable_type = "string" (only if Value is not null)
cond_linkorderid = (
    (inputevents_mv_quiq["Variable_name"] == "LINKORDERID") &
    (inputevents_mv_quiq["Value"].notna())
)
inputevents_mv_quiq.loc[cond_linkorderid, "Variable_type"] = "string"

cond_orderid = (
    (inputevents_mv_quiq["Variable_name"] == "ORDERID") &
    (inputevents_mv_quiq["Value"].notna())
)
inputevents_mv_quiq.loc[cond_orderid, "Variable_type"] = "string"

cond_cgid = (
    (inputevents_mv_quiq["Variable_name"] == "CGID") &
    (inputevents_mv_quiq["Value"].notna())
)
inputevents_mv_quiq.loc[cond_cgid, "Variable_type"] = "string"


inputevents_mv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8344996,,INPUTEVENTS_MV,STARTTIME,,2143-03-02 20:00:00,,timestamp,0.0,,,,27366,174736,,date,
1,8344996,,INPUTEVENTS_MV,ENDTIME,,2143-03-02 20:01:00,,timestamp,0.0,,,,27366,174736,,date,
2,8344996,,INPUTEVENTS_MV,AMOUNT,,120.0,ml,numeric,0.0,,,,27366,174736,,prescription,prescription_info
3,8344996,,INPUTEVENTS_MV,RATE,,,,,0.0,,,,27366,174736,,,
4,8344996,,INPUTEVENTS_MV,CGID,,14411,,string,1.0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3507691,8491149,,INPUTEVENTS_MV,COMMENTS_CANCELEDBY,,,,,1.0,,,,51343,160605,,,
3507692,8491149,,INPUTEVENTS_MV,COMMENTS_DATE,,,,,0.0,,,,51343,160605,,,
3507693,8491149,,INPUTEVENTS_MV,ORIGINALAMOUNT,,5.0,,numeric,0.0,,,,51343,160605,,,
3507694,8491149,,INPUTEVENTS_MV,ORIGINALRATE,,5.0,,numeric,0.0,,,,51343,160605,,,


In [7]:
inputevents_mv_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8344996,,INPUTEVENTS_MV,STARTTIME,,2143-03-02 20:00:00,,timestamp,0.0,,,,27366,174736,,date,
1,8344996,,INPUTEVENTS_MV,ENDTIME,,2143-03-02 20:01:00,,timestamp,0.0,,,,27366,174736,,date,
2,8344996,,INPUTEVENTS_MV,AMOUNT,,120.0,ml,numeric,0.0,,,,27366,174736,,prescription,prescription_info
3,8344996,,INPUTEVENTS_MV,RATE,,,,,0.0,,,,27366,174736,,,
4,8344996,,INPUTEVENTS_MV,CGID,,14411,,string,1.0,,,,27366,174736,,,
5,8344996,,INPUTEVENTS_MV,ORDERID,,5174791,,string,1.0,,,,27366,174736,,,
6,8344996,,INPUTEVENTS_MV,LINKORDERID,,5174791,,string,1.0,,,,27366,174736,,,
7,8344996,,INPUTEVENTS_MV,ORDERCATEGORYNAME,,14-Oral/Gastric Intake,,string,1.0,,,,27366,174736,,,
8,8344996,,INPUTEVENTS_MV,SECONDARYORDERCATEGORYNAME,,,,,,,,,27366,174736,,,
9,8344996,,INPUTEVENTS_MV,ORDERCOMPONENTTYPEDESCRIPTION,,Main order parameter,,string,1.0,,,,27366,174736,,,


In [198]:
# Mapping
mapping_rules = {
    'CHARTTIME':("date", np.nan), 
    'AMOUNT': ("prescription", "prescription_info"),
    'TOTALAMOUNT': ("prescription", "prescription_info"),
    'STARTTIME': ("date", np.nan),
    'ENDTIME': ("date", np.nan),
    "TOTALAMOUNT": ("prescription", "prescription_info"),
    'PATIENTWEIGHT': ("event", "chart_event"),
    'COMMENTS_DATE' : ("date", np.nan)
}

# 2. 매핑 함수
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "LABEL":
        return pd.Series(["prescription", np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
inputevents_mv_quiq[["Mapping_info_1", "Mapping_info_2"]] = inputevents_mv_quiq.apply(map_mapping_info, axis=1)
inputevents_mv_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8344996,,INPUTEVENTS_MV,STARTTIME,,2143-03-02 20:00:00,,timestamp,0.0,,,,27366,174736,,date,
1,8344996,,INPUTEVENTS_MV,ENDTIME,,2143-03-02 20:01:00,,timestamp,0.0,,,,27366,174736,,date,
2,8344996,,INPUTEVENTS_MV,AMOUNT,,120.0,ml,numeric,0.0,,,,27366,174736,,prescription,prescription_info
3,8344996,,INPUTEVENTS_MV,RATE,,,,,,,,,27366,174736,,,
4,8344996,,INPUTEVENTS_MV,CGID,,14411,,string,1.0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3361537,8491149,,INPUTEVENTS_MV,COMMENTS_CANCELEDBY,,,,,,,,,51343,160605,,,
3361538,8491149,,INPUTEVENTS_MV,COMMENTS_DATE,,,,,,,,,51343,160605,,,
3361539,8491149,,INPUTEVENTS_MV,ORIGINALAMOUNT,,5.0,,numeric,0.0,,,,51343,160605,,,
3361540,8491149,,INPUTEVENTS_MV,ORIGINALRATE,,5.0,,numeric,0.0,,,,51343,160605,,,


In [8]:
inputevents_mv_quiq.to_csv("G:/2000/MIMIC_inputeventsmv_QUIQ.csv", index=False)

In [60]:
inputevents_mv_quiq['Variable_name'].unique()

array(['STARTTIME', 'ENDTIME', 'AMOUNT', 'RATE', 'CGID', 'ORDERID',
       'LINKORDERID', 'ORDERCATEGORYNAME', 'SECONDARYORDERCATEGORYNAME',
       'ORDERCOMPONENTTYPEDESCRIPTION', 'ORDERCATEGORYDESCRIPTION',
       'PATIENTWEIGHT', 'TOTALAMOUNT', 'TOTALAMOUNTUOM', 'ISOPENBAG',
       'CONTINUEINNEXTDEPT', 'CANCELREASON', 'STATUSDESCRIPTION',
       'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY', 'COMMENTS_DATE',
       'ORIGINALAMOUNT', 'ORIGINALRATE', 'LABEL'], dtype=object)

In [61]:
# VIA 테이블 생성
via_variable_names = ['STARTTIME', 'ENDTIME', 'AMOUNT', 'RATE', 'CGID',
       'ORDERID', 'LINKORDERID', 'ORDERCATEGORYNAME',
       'SECONDARYORDERCATEGORYNAME', 'ORDERCOMPONENTTYPEDESCRIPTION',
       'ORDERCATEGORYDESCRIPTION', 'PATIENTWEIGHT', 'TOTALAMOUNT', 'STATUSDESCRIPTION',
        'ISOPENBAG', 'CONTINUEINNEXTDEPT', 'CANCELREASON',
       'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY',
       'COMMENTS_DATE', 'ORIGINALAMOUNT', 'ORIGINALRATE', 'LABEL',]

via_descriptions = ['record the start time of an input/output event',
                    'record the end time of an input/ouput event',
                    'amount of a drug or substance administered to the patient either between the starttime and endtime',
                    'the rate at which the drug or substance was administered to the patient either between the starttime and endtime',
                    'identifier for the caregiver who validated the given measurement',
                    "links multiple items contatined in the same solution together",
                    "links the same order across multiple instantiations",
                    'provide higher level information about the order the medication/solution is a part of. Categories represent the type of administration.',
                    'provide higher level information about the order the medication/solution is a part of. Categories represent the type of administration.',
                    'provide higher level information about the order the medication/solution is a part of. Describes the role of the substance in the solution',
                    'provide higher level information about the order the medication/solution is a part of. Describes the role of the substance in the solution',
                    'the patient weight in kilograms',
                    'the total amount of the fluid in the bag containing the solution',
                    'the ultimate status of the item, or more specifically, row. It is used to indicate why the delivery of the compound has ended. there are only six possible statuses (Changed, Paused, Fishished Running, Stopped, Rewritten, Flushed)',
                    'whether the order was from an open bag',
                    'if the order ended on patient transfer, this field indicates if it continued into the next department(e.g. a floor)',
                    'if the order was canceled, this provides some explanation',
                    'specifies if the order was edited or canceled, and if so, the date and job title of the care giver who canceled or edited it',
                    'specifies if the order was edited or canceled, and if so, the date and job title of the care giver who canceled or edited it',
                    'specifies if the order was edited or canceled, and if so, the date and job title of the care giver who canceled or edited it',
                    'the amount of the drug contatined in the bag at STARTTIME',
                    'the rate that was input by the care provider',        
                    "LABEL"
                   ]


via_inputevents_mv = pd.DataFrame({
    'Original_table_name': 'INPUTEVENTS_MV',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_inputevents_mv

Unnamed: 0,Original_table_name,Variable_name,Description
0,INPUTEVENTS_MV,STARTTIME,record the start time of an input/output event
1,INPUTEVENTS_MV,ENDTIME,record the end time of an input/ouput event
2,INPUTEVENTS_MV,AMOUNT,amount of a drug or substance administered to ...
3,INPUTEVENTS_MV,RATE,the rate at which the drug or substance was ad...
4,INPUTEVENTS_MV,CGID,identifier for the caregiver who validated the...
5,INPUTEVENTS_MV,ORDERID,links multiple items contatined in the same so...
6,INPUTEVENTS_MV,LINKORDERID,links the same order across multiple instantia...
7,INPUTEVENTS_MV,ORDERCATEGORYNAME,provide higher level information about the ord...
8,INPUTEVENTS_MV,SECONDARYORDERCATEGORYNAME,provide higher level information about the ord...
9,INPUTEVENTS_MV,ORDERCOMPONENTTYPEDESCRIPTION,provide higher level information about the ord...


In [62]:
 via_inputevents_mv.to_csv("G:/2000/MIMIC_inputeventsmv_VIA.csv", index=False)

# Noteevents

In [63]:
noteevents = pd.read_csv('NOTEEVENTS.csv.gz', compression='gzip')
noteevents

  noteevents = pd.read_csv('NOTEEVENTS.csv.gz', compression='gzip')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...
...,...,...,...,...,...,...,...,...,...,...,...
2083175,2070657,31097,115637.0,2132-01-21,2132-01-21 03:27:00,2132-01-21 03:38:00,Nursing/other,Report,17581.0,,NPN\n\n\n#1 Infant remains in RA with O2 sats...
2083176,2070658,31097,115637.0,2132-01-21,2132-01-21 09:50:00,2132-01-21 09:53:00,Nursing/other,Report,19211.0,,"Neonatology\nDOL #5, CGA 36 weeks.\n\nCVR: Con..."
2083177,2070659,31097,115637.0,2132-01-21,2132-01-21 16:42:00,2132-01-21 16:44:00,Nursing/other,Report,20104.0,,Family Meeting Note\nFamily meeting held with ...
2083178,2070660,31097,115637.0,2132-01-21,2132-01-21 18:05:00,2132-01-21 18:16:00,Nursing/other,Report,16023.0,,NPN 1800\n\n\n#1 Resp: [**Known lastname 2243*...


In [64]:
noteevents_df = noteevents[noteevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
noteevents_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,191,26175,156154.0,2112-05-05,,,Discharge summary,Report,,,Admission Date: [**2112-4-22**] ...
1,212,76874,113329.0,2101-10-28,,,Discharge summary,Report,,,Admission Date: [**2101-10-25**] ...
2,220,66479,134640.0,2148-02-07,,,Discharge summary,Report,,,Admission Date: [**2148-2-3**] D...
3,30,25995,152664.0,2128-05-07,,,Discharge summary,Report,,,Admission Date: [**2128-5-5**] Discharg...
4,31,25995,152664.0,2128-05-07,,,Discharge summary,Report,,,Admission Date: [**2128-5-5**] Discharg...
...,...,...,...,...,...,...,...,...,...,...,...
90938,2078293,31940,169563.0,2121-06-29,2121-06-29 15:39:00,2121-06-29 15:41:00,Nursing/other,Report,20104.0,,NNP On-Call\nPlease see Dr.[**Name (NI) 194**]...
90939,2079479,32187,178887.0,2186-01-27,2186-01-27 16:56:00,2186-01-27 17:11:00,Nursing/other,Report,16660.0,,NPN 0700-1700\n\nNICU Transfer Note\n\nO: Baby...
90940,2079480,32187,178887.0,2186-01-27,2186-01-27 16:59:00,2186-01-27 17:01:00,Nursing/other,Report,16888.0,,Neonatology\nComfortable RA. No spells.\n\nWt ...
90941,2075452,31655,175166.0,2148-07-25,2148-07-25 21:51:00,2148-07-25 21:58:00,Nursing/other,Report,19211.0,,Neonatology\nBaby Girl [**Known lastname 1672*...


In [65]:
noteevents_df['CATEGORY'].unique()

array(['Discharge summary', 'Echo', 'ECG', 'Nursing', 'Respiratory ',
       'Physician ', 'Case Management ', 'Rehab Services', 'Nutrition',
       'General', 'Social Work', 'Pharmacy', 'Consult', 'Radiology',
       'Nursing/other'], dtype=object)

In [66]:
print(noteevents_df["CATEGORY"].value_counts())

CATEGORY
Nursing/other        38843
Radiology            21827
ECG                   9080
Nursing               8602
Physician             5617
Discharge summary     2622
Echo                  1986
Respiratory           1267
General                367
Nutrition              361
Rehab Services         212
Social Work            102
Case Management         41
Consult                  9
Pharmacy                 7
Name: count, dtype: int64


In [67]:
filtered_note = noteevents_df[noteevents_df["CATEGORY"].isin(["Discharge summary", "General", "Radiology"])]
filtered_note

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,191,26175,156154.0,2112-05-05,,,Discharge summary,Report,,,Admission Date: [**2112-4-22**] ...
1,212,76874,113329.0,2101-10-28,,,Discharge summary,Report,,,Admission Date: [**2101-10-25**] ...
2,220,66479,134640.0,2148-02-07,,,Discharge summary,Report,,,Admission Date: [**2148-2-3**] D...
3,30,25995,152664.0,2128-05-07,,,Discharge summary,Report,,,Admission Date: [**2128-5-5**] Discharg...
4,31,25995,152664.0,2128-05-07,,,Discharge summary,Report,,,Admission Date: [**2128-5-5**] Discharg...
...,...,...,...,...,...,...,...,...,...,...,...
52912,1259411,47460,,2173-02-04,2173-02-04 09:52:00,,Radiology,L WRIST(3 + VIEWS) LEFT,,,[**2173-2-4**] 9:52 AM\n WRIST(3 + VIEWS) LEFT...
52913,1260241,45910,,2198-04-06,2198-04-06 12:12:00,,Radiology,L-SPINE (AP & LAT),,,[**2198-4-6**] 12:12 PM\n T-SPINE; L-SPINE (AP...
52914,1260677,72279,114580.0,2157-08-16,2157-08-16 09:51:00,,Radiology,L UNILAT LOWER EXT VEINS LEFT,,,[**2157-8-16**] 9:51 AM\n UNILAT LOWER EXT VEI...
52915,1259506,71244,,2176-12-16,2176-12-16 04:24:00,,Radiology,CHEST (PORTABLE AP),,,[**2176-12-16**] 4:24 AM\n CHEST (PORTABLE AP)...


In [68]:
print(filtered_note["CATEGORY"].value_counts())

CATEGORY
Radiology            21827
Discharge summary     2622
General                367
Name: count, dtype: int64


In [69]:
# Radiology에서 랜덤 1000개 추출
radiology_sample = noteevents_df[noteevents_df["CATEGORY"] == "Radiology"].sample(n=1000, random_state=42)

# Discharge summary에서 랜덤 1000개 추출
discharge_sample = noteevents_df[noteevents_df["CATEGORY"] == "Discharge summary"].sample(n=1000, random_state=42)

# General은 모두 포함
general_all = noteevents_df[noteevents_df["CATEGORY"] == "General"]

# 세 개의 데이터프레임 합치기
filtered_df = pd.concat([radiology_sample, discharge_sample, general_all]).reset_index(drop=True)
filtered_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,1013519,27077,162888.0,2128-06-17,2128-06-17 15:26:00,,Radiology,PICC W/O PORT,,,[**2128-6-17**] 3:26 PM\n PICC LINE PLACMENT S...
1,1092778,59761,192397.0,2116-09-14,2116-09-14 23:08:00,,Radiology,CHEST (PA & LAT),,,[**2116-9-14**] 11:08 PM\n CHEST (PA & LAT) ...
2,917614,15219,,2176-05-13,2176-05-13 16:34:00,,Radiology,DUPLEX DOPP ABD/PEL,,,"[**2176-5-13**] 4:34 PM\n US ABD LIMIT, SINGLE..."
3,784727,15255,,2120-02-18,2120-02-18 14:49:00,,Radiology,CHEST (PA & LAT),,,[**2120-2-18**] 2:49 PM\n CHEST (PA & LAT) ...
4,1228188,99562,101705.0,2183-03-13,2183-03-13 07:29:00,,Radiology,CHEST (PORTABLE AP),,,[**2183-3-13**] 7:29 AM\n CHEST (PORTABLE AP) ...
...,...,...,...,...,...,...,...,...,...,...,...
2362,722356,75798,149915.0,2186-03-20,2186-03-20 12:02:00,2186-03-20 12:02:47,General,Generic Note,21284.0,,TITLE: CRITICAL CARE\n Present for the key p...
2363,721694,75798,149915.0,2186-03-17,2186-03-17 09:38:00,2186-03-17 09:39:12,General,Generic Note,21284.0,,TITLE: CRITICAL CARE\n Present for the key ...
2364,732807,81551,119989.0,2131-04-03,2131-04-03 14:48:00,2131-04-03 14:48:24,General,Generic Note,16223.0,,TITLE:\n Clinical Nutrition:\n Diet: Regul...
2365,732634,44413,106426.0,2195-04-06,2195-04-06 19:15:00,2195-04-06 19:15:44,General,ICU Event Note,21284.0,,Clinician: Attending\n CRITICAL CARE\n [*...


In [70]:
noteevents_test = filtered_df

In [71]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CATEGORY'}


# admission 반복 처리
for index, row in noteevents_test.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in noteevents_test.columns:
        if col not in exclude_cols:
            value = row[col]
                
            # Event_date 설정
            if col == "TEXT":
                event_date_val = row["CHARTDATE"] if "CHARTDATE" in noteevents_test.columns else np.nan
            else:
                event_date_val = np.nan
                
            if col == "TEXT":
                mapping_note = row["CATEGORY"] if "CATEGORY" in noteevents_test.columns else np.nan
            else:
                mapping_note = np.nan
                

            rows.append({
                "Primary_key": index + 8491150,
                "Variable_ID": variable_id_val,
                "Original_table_name": "NOTEEVENTS",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": mapping_note,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
noteevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
noteevents_quiq["Variable_type"] = noteevents_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단: 고유값 수가 적은 변수는 범주형으로 간주
CATEGORICAL_THRESHOLD = 10

# 각 Variable_name 별 고유값 수 계산
value_counts = noteevents_quiq.groupby("Variable_name")["Value"].nunique()

# 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 컬럼 채우기: 범주형이면 1, 아니면 0
noteevents_quiq["Is_categorical"] = noteevents_quiq["Variable_name"].apply(
    lambda var: 1 if var in categorical_vars else 0
)
noteevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,0,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,0,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,1,,,,27077,162888.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14197,8493516,225154,NOTEEVENTS,STORETIME,,2107-11-18 07:14:21,,timestamp,0,,,,17190,106415.0,,,
14198,8493516,225154,NOTEEVENTS,DESCRIPTION,,Generic Note,,string,0,,,,17190,106415.0,,,
14199,8493516,225154,NOTEEVENTS,CGID,,14775.0,,numeric,0,,,,17190,106415.0,,,
14200,8493516,225154,NOTEEVENTS,ISERROR,,,,,1,,,,17190,106415.0,,,


## 안돌려도 됨

In [222]:
filtered_df = noteevents_quiq[
    (noteevents_quiq["Variable_name"] == "TEXT") &
    (noteevents_quiq["Mapping_info_1"] == "Radiology")
]

filtered_df

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
5,8585917,224275,NOTEEVENTS,TEXT,2128-06-17,[**2128-6-17**] 3:26 PM\n PICC LINE PLACMENT S...,,string,0,,,,27077,162888.0,,Radiology,
11,8585918,224275,NOTEEVENTS,TEXT,2116-09-14,[**2116-9-14**] 11:08 PM\n CHEST (PA & LAT) ...,,string,0,,,,59761,192397.0,,Radiology,
17,8585919,224275,NOTEEVENTS,TEXT,2176-05-13,"[**2176-5-13**] 4:34 PM\n US ABD LIMIT, SINGLE...",,string,0,,,,15219,,,Radiology,
23,8585920,224275,NOTEEVENTS,TEXT,2120-02-18,[**2120-2-18**] 2:49 PM\n CHEST (PA & LAT) ...,,string,0,,,,15255,,,Radiology,
29,8585921,224275,NOTEEVENTS,TEXT,2183-03-13,[**2183-3-13**] 7:29 AM\n CHEST (PORTABLE AP) ...,,string,0,,,,99562,101705.0,,Radiology,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5975,8586912,224275,NOTEEVENTS,TEXT,2135-04-23,[**2135-4-23**] 4:11 PM\n CHEST (PORTABLE AP) ...,,string,0,,,,22043,183211.0,,Radiology,
5981,8586913,224275,NOTEEVENTS,TEXT,2161-11-01,[**2161-11-1**] 2:11 PM\n LUMBO-SACRAL SPINE (...,,string,0,,,,188,132401.0,,Radiology,
5987,8586914,224275,NOTEEVENTS,TEXT,2124-06-13,[**2124-6-13**] 9:04 AM\n CHEST PORT. LINE PLA...,,string,0,,,,49565,171674.0,,Radiology,
5993,8586915,224275,NOTEEVENTS,TEXT,2140-02-10,[**2140-2-10**] 11:39 AM\n CHEST (PORTABLE AP)...,,string,0,,,,25049,180379.0,,Radiology,


In [224]:
noteevents_test['CATEGORY'].unique()

array(['Radiology', 'Discharge summary', 'General'], dtype=object)

In [225]:
noteevents_quiq['Variable_name'].unique()

array(['CHARTTIME', 'STORETIME', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT'],
      dtype=object)

In [226]:
print(noteevents_quiq[["Variable_name", "Mapping_info_1"]].drop_duplicates())

      Variable_name     Mapping_info_1
0         CHARTTIME                NaN
1         STORETIME                NaN
2       DESCRIPTION                NaN
3              CGID                NaN
4           ISERROR                NaN
5              TEXT          Radiology
6005           TEXT  Discharge summary
12005          TEXT            General


## 매핑 여기서부터

In [72]:
# Mapping
mapping_rules = {
    'CHARTTIME':("date", np.nan), 
    'STORETIME':("date", np.nan),
}

def map_mapping_info(row):
    var_name = row["Mapping_info_1"]
    value = row["Value"]

    if pd.isna(value):
        return pd.Series([np.nan, np.nan])  # ✅ value가 NaN이면 매핑 안함
    elif var_name == "Radiology":
        return pd.Series(["note_rad", np.nan])
    elif var_name == "General":
        return pd.Series(["note_clinical", np.nan])
    elif var_name == "Discharge summary":
        return pd.Series(["note_clinical", "DIS"])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 3. 적용
noteevents_quiq[["Mapping_info_1", "Mapping_info_2"]] = noteevents_quiq.apply(map_mapping_info, axis=1)
noteevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,0,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,0,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,1,,,,27077,162888.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14197,8493516,225154,NOTEEVENTS,STORETIME,,2107-11-18 07:14:21,,timestamp,0,,,,17190,106415.0,,,
14198,8493516,225154,NOTEEVENTS,DESCRIPTION,,Generic Note,,string,0,,,,17190,106415.0,,,
14199,8493516,225154,NOTEEVENTS,CGID,,14775.0,,numeric,0,,,,17190,106415.0,,,
14200,8493516,225154,NOTEEVENTS,ISERROR,,,,,1,,,,17190,106415.0,,,


In [73]:
import numpy as np
import pandas as pd

# 분류 함수 정의 (Radiology TEXT 내용 기반)
def classify_radiology_note(text):
    if pd.isna(text):
        return np.nan
    text = text.lower()
    if any(kw in text for kw in ["abdomen ct", "abdominal ct", "ct abdomen", "ct a/p", "ct abd"]):
        return "ACT"   # Abdominal CT
    elif any(kw in text for kw in ["chest ct", "ct chest", "thorax ct"]):
        return "CCT"   # Chest CT
    elif any(kw in text for kw in ["head ct", "brain ct", "ct head", "ct brain", "cth"]):
        return "BCT"   # Brain CT
    elif any(kw in text for kw in ["spine ct", "ct spine", "ct lumbar", "ct cervical", "ct thoracic"]):
        return "SCT"   # Spine CT
    elif any(kw in text for kw in ["abdomen", "abdominal", "kub", "axr"]):
        return "AXR"   # Abdominal X-ray
    elif any(kw in text for kw in ["chest", "cxr", "thorax"]):
        return "CXR"   # Chest X-ray
    elif any(kw in text for kw in ["spine", "lumbar", "cervical", "thoracic"]):
        return "SXR"   # Spine X-ray
    else:
        return np.nan

# 조건: Variable_name == 'TEXT' and Mapping_info_1 == 'note_rad'
mask = (noteevents_quiq["Variable_name"] == "TEXT") & (noteevents_quiq["Mapping_info_1"] == "note_rad")

# 해당 조건의 행에만 classify 적용하여 Mapping_info_2에 저장
noteevents_quiq.loc[mask, "Mapping_info_2"] = noteevents_quiq.loc[mask, "Value"].apply(classify_radiology_note)
noteevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,0,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,0,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,1,,,,27077,162888.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14197,8493516,225154,NOTEEVENTS,STORETIME,,2107-11-18 07:14:21,,timestamp,0,,,,17190,106415.0,,,
14198,8493516,225154,NOTEEVENTS,DESCRIPTION,,Generic Note,,string,0,,,,17190,106415.0,,,
14199,8493516,225154,NOTEEVENTS,CGID,,14775.0,,numeric,0,,,,17190,106415.0,,,
14200,8493516,225154,NOTEEVENTS,ISERROR,,,,,1,,,,17190,106415.0,,,


In [74]:
print(noteevents_quiq["Mapping_info_2"].value_counts(dropna=False))

Mapping_info_2
NaN    12366
DIS     1000
CXR      476
AXR      132
BCT       82
ACT       70
SXR       40
CCT       32
SCT        4
Name: count, dtype: int64


In [199]:
noteevents_quiq = pd.read_csv("G:2000/MIMIC_noteevents_QUIQ.csv")
noteevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,0,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,0,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,1,,,,27077,162888.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14197,8493516,225154,NOTEEVENTS,STORETIME,,2107-11-18 07:14:21,,timestamp,0,,,,17190,106415.0,,,
14198,8493516,225154,NOTEEVENTS,DESCRIPTION,,Generic Note,,string,0,,,,17190,106415.0,,,
14199,8493516,225154,NOTEEVENTS,CGID,,14775.0,,numeric,0,,,,17190,106415.0,,,
14200,8493516,225154,NOTEEVENTS,ISERROR,,,,,1,,,,17190,106415.0,,,


In [201]:
noteevents_quiq["Is_categorical"] = noteevents_quiq["Value"].apply(
    lambda v: np.nan if pd.isna(v) else 0  # 기본값은 0으로 설정
)

noteevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0.0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,0.0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,,,,,27077,162888.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14197,8493516,225154,NOTEEVENTS,STORETIME,,2107-11-18 07:14:21,,timestamp,0.0,,,,17190,106415.0,,,
14198,8493516,225154,NOTEEVENTS,DESCRIPTION,,Generic Note,,string,0.0,,,,17190,106415.0,,,
14199,8493516,225154,NOTEEVENTS,CGID,,14775.0,,numeric,0.0,,,,17190,106415.0,,,
14200,8493516,225154,NOTEEVENTS,ISERROR,,,,,,,,,17190,106415.0,,,


In [202]:
noteevents_quiq.head(20)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0.0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,0.0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,,,,,27077,162888.0,,,
5,8491150,225154,NOTEEVENTS,TEXT,2128-06-17,[**2128-6-17**] 3:26 PM\n PICC LINE PLACMENT S...,,string,0.0,,,,27077,162888.0,,note_rad,CXR
6,8491151,225154,NOTEEVENTS,CHARTTIME,,2116-09-14 23:08:00,,timestamp,0.0,,,,59761,192397.0,,,
7,8491151,225154,NOTEEVENTS,STORETIME,,,,,,,,,59761,192397.0,,,
8,8491151,225154,NOTEEVENTS,DESCRIPTION,,CHEST (PA & LAT),,string,0.0,,,,59761,192397.0,,,
9,8491151,225154,NOTEEVENTS,CGID,,,,,,,,,59761,192397.0,,,


In [203]:
target_vars = ["DESCRIPTION", "CGID", "TEXT"]
condition_targets = noteevents_quiq["Variable_name"].isin(target_vars)

noteevents_quiq.loc[condition_targets, "Is_categorical"] = noteevents_quiq.loc[condition_targets, "Value"].apply(
    lambda v: 1 if pd.notna(v) else np.nan
)

noteevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8491150,225154,NOTEEVENTS,CHARTTIME,,2128-06-17 15:26:00,,timestamp,0.0,,,,27077,162888.0,,,
1,8491150,225154,NOTEEVENTS,STORETIME,,,,,,,,,27077,162888.0,,,
2,8491150,225154,NOTEEVENTS,DESCRIPTION,,PICC W/O PORT,,string,1.0,,,,27077,162888.0,,,
3,8491150,225154,NOTEEVENTS,CGID,,,,,,,,,27077,162888.0,,,
4,8491150,225154,NOTEEVENTS,ISERROR,,,,,,,,,27077,162888.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14197,8493516,225154,NOTEEVENTS,STORETIME,,2107-11-18 07:14:21,,timestamp,0.0,,,,17190,106415.0,,,
14198,8493516,225154,NOTEEVENTS,DESCRIPTION,,Generic Note,,string,1.0,,,,17190,106415.0,,,
14199,8493516,225154,NOTEEVENTS,CGID,,14775.0,,numeric,1.0,,,,17190,106415.0,,,
14200,8493516,225154,NOTEEVENTS,ISERROR,,,,,,,,,17190,106415.0,,,


In [204]:
noteevents_quiq.to_csv("G:2000/MIMIC_noteevents_QUIQ.csv", index=False)

In [76]:
# VIA 테이블 생성
via_variable_names = ['CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION',
       'CGID', 'ISERROR', 'TEXT']

via_descriptions = ['records the date and time at which the note was charted',
                    'records the date and time at which a note was saved into the system',
                    'define the type of note recorded',
                    'define the type of note recorded',
                    'identifier for the caregiver who input the note',
                    'ISERROR=1 column indicates that a physician has identified this note as an error',
                    'contatins the note text'
                   ]


via_noteevents = pd.DataFrame({
    'Original_table_name': 'NOTEEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_noteevents

Unnamed: 0,Original_table_name,Variable_name,Description
0,NOTEEVENTS,CHARTTIME,records the date and time at which the note wa...
1,NOTEEVENTS,STORETIME,records the date and time at which a note was ...
2,NOTEEVENTS,CATEGORY,define the type of note recorded
3,NOTEEVENTS,DESCRIPTION,define the type of note recorded
4,NOTEEVENTS,CGID,identifier for the caregiver who input the note
5,NOTEEVENTS,ISERROR,ISERROR=1 column indicates that a physician ha...
6,NOTEEVENTS,TEXT,contatins the note text


In [77]:
via_noteevents.to_csv("G:/2000/MIMIC_noteevents_VIA.csv", index=False)

# Outputevents

In [205]:
outputevents = pd.read_csv('OUTPUTEVENTS.csv.gz', compression='gzip')
outputevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,VALUE,VALUEUOM,STORETIME,CGID,STOPPED,NEWBOTTLE,ISERROR
0,344,21219,177991.0,225765.0,2142-09-08 10:00:00,40055,200.0,ml,2142-09-08 12:08:00,17269,,,
1,345,21219,177991.0,225765.0,2142-09-08 12:00:00,40055,200.0,ml,2142-09-08 12:08:00,17269,,,
2,346,21219,177991.0,225765.0,2142-09-08 13:00:00,40055,120.0,ml,2142-09-08 13:39:00,17269,,,
3,347,21219,177991.0,225765.0,2142-09-08 14:00:00,40055,100.0,ml,2142-09-08 16:17:00,17269,,,
4,348,21219,177991.0,225765.0,2142-09-08 16:00:00,40055,200.0,ml,2142-09-08 16:17:00,17269,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349213,4340476,68375,123645.0,276251.0,2175-09-05 23:00:00,226588,30.0,mL,2175-09-05 22:48:00,16915,,,
4349214,4340477,68375,123645.0,276251.0,2175-09-06 00:00:00,226588,0.0,mL,2175-09-06 00:03:00,20971,,,
4349215,4340478,68375,123645.0,276251.0,2175-09-06 01:00:00,226588,40.0,mL,2175-09-06 01:13:00,20971,,,
4349216,4340479,68375,123645.0,276251.0,2175-09-06 02:00:00,226588,20.0,mL,2175-09-06 02:14:00,20971,,,


In [206]:
outputevents = outputevents[outputevents["SUBJECT_ID"].isin(la)].reset_index(drop=True)
outputevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,VALUE,VALUEUOM,STORETIME,CGID,STOPPED,NEWBOTTLE,ISERROR
0,27,14469,155925.0,204432.0,2111-06-14 06:00:00,40055,0.0,,2111-06-14 06:00:00,21570,,,
1,28,14469,155925.0,204432.0,2111-06-17 01:00:00,40055,40.0,ml,2111-06-17 01:09:00,18592,,,
2,29,14469,155925.0,204432.0,2111-06-17 02:00:00,40055,30.0,ml,2111-06-17 02:41:00,18592,,,
3,30,14469,155925.0,204432.0,2111-06-17 03:00:00,40055,30.0,ml,2111-06-17 04:45:00,18592,,,
4,31,14469,155925.0,204432.0,2111-06-17 05:00:00,40055,60.0,ml,2111-06-17 06:12:00,18592,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
180832,4343746,80539,102089.0,248451.0,2138-08-24 08:02:00,226559,280.0,mL,2138-08-24 08:02:00,18539,,,
180833,4343747,80539,102089.0,248451.0,2138-08-24 10:00:00,226559,160.0,mL,2138-08-24 10:08:00,18539,,,
180834,4343748,80539,102089.0,248451.0,2138-08-24 11:00:00,226559,450.0,mL,2138-08-24 11:22:00,18539,,,
180835,4343749,80539,102089.0,248451.0,2138-08-24 12:00:00,226559,400.0,mL,2138-08-24 12:09:00,18539,,,


In [207]:
merged_df2 = pd.merge(
    outputevents,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched2 = merged_df2[merged_df2['LABEL'].isna()]

# 결과 확인
unmatched2

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,VALUE,VALUEUOM,STORETIME,CGID,STOPPED,NEWBOTTLE,ISERROR,LABEL,CATEGORY


In [208]:
merged_df2

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,VALUE,VALUEUOM,STORETIME,CGID,STOPPED,NEWBOTTLE,ISERROR,LABEL,CATEGORY
0,27,14469,155925.0,204432.0,2111-06-14 06:00:00,40055,0.0,,2111-06-14 06:00:00,21570,,,,Urine Out Foley,
1,28,14469,155925.0,204432.0,2111-06-17 01:00:00,40055,40.0,ml,2111-06-17 01:09:00,18592,,,,Urine Out Foley,
2,29,14469,155925.0,204432.0,2111-06-17 02:00:00,40055,30.0,ml,2111-06-17 02:41:00,18592,,,,Urine Out Foley,
3,30,14469,155925.0,204432.0,2111-06-17 03:00:00,40055,30.0,ml,2111-06-17 04:45:00,18592,,,,Urine Out Foley,
4,31,14469,155925.0,204432.0,2111-06-17 05:00:00,40055,60.0,ml,2111-06-17 06:12:00,18592,,,,Urine Out Foley,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180832,4343746,80539,102089.0,248451.0,2138-08-24 08:02:00,226559,280.0,mL,2138-08-24 08:02:00,18539,,,,Foley,Output
180833,4343747,80539,102089.0,248451.0,2138-08-24 10:00:00,226559,160.0,mL,2138-08-24 10:08:00,18539,,,,Foley,Output
180834,4343748,80539,102089.0,248451.0,2138-08-24 11:00:00,226559,450.0,mL,2138-08-24 11:22:00,18539,,,,Foley,Output
180835,4343749,80539,102089.0,248451.0,2138-08-24 12:00:00,226559,400.0,mL,2138-08-24 12:09:00,18539,,,,Foley,Output


In [209]:
outputevents_df = merged_df2

In [210]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'VALUEUOM', 'CHARTTIME', 'ITEMID', 'CATEGORY', 'VALUE', 'LABEL'}

for index, row in outputevents_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]

    # ✅ 1. LABEL 기반 행 생성
    label_val = row["LABEL"]
    value_val = row["VALUE"]
    unit_val = row["VALUEUOM"] if "VALUEUOM" in outputevents_df.columns else np.nan
    event_date_val = row["CHARTTIME"] if "CHARTTIME" in outputevents_df.columns else np.nan
    variable_id_val = row["ITEMID"] if "ITEMID" in outputevents_df.columns else np.nan

    rows.append({
        "Primary_key": index + 8493517,
        "Variable_ID": variable_id_val,
        "Original_table_name": "OUTPUTEVENTS",
        "Variable_name": label_val,
        "Event_date": event_date_val,
        "Value": value_val,
        "Unit": unit_val,
        "Variable_type": np.nan,
        "Is_categorical": np.nan,
        "Recorder": np.nan,
        "Recorder_position": np.nan,
        "Recorder_affiliation": np.nan,
        "Patient_id": patient_id,
        "Admission_id": admission_id,
        "Ground_truth": np.nan,
        "Mapping_info_1": "event",
        "Mapping_info_2": "chart_event"
    })

    # ✅ 2. 나머지 열들에 대해 반복 (컬럼명을 Variable_name으로)
    for col in outputevents_df.columns:
        if col not in exclude_cols:
            value = row[col]
            rows.append({
                "Primary_key": index + 8493517,  # 고유성 확보용 소수 해시
                "Variable_ID": np.nan,
                "Original_table_name": "OUTPUTEVENTS",
                "Variable_name": col,
                "Event_date": np.nan,
                "Value": value,
                "Unit": np.nan,
                "Variable_type": np.nan,
                "Is_categorical": np.nan,
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })


# QUIQ 테이블로 변환
outputevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
outputevents_quiq["Variable_type"] = outputevents_quiq["Value"].apply(infer_variable_type)

# -------------------------------
# Is_categorical 판단
CATEGORICAL_THRESHOLD = 10

# 1. 각 Variable_name 별 고유값 수 계산
value_counts = outputevents_quiq.groupby("Variable_name")["Value"].nunique()

# 2. 고유값 수가 기준 이하인 변수 목록 추출
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# 3. Is_categorical 컬럼 채우기 (Value가 NaN이면 NaN)
outputevents_quiq["Is_categorical"] = outputevents_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
outputevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8493517,40055.0,OUTPUTEVENTS,Urine Out Foley,2111-06-14 06:00:00,0.0,,numeric,0.0,,,,14469,155925.0,,event,chart_event
1,8493517,,OUTPUTEVENTS,STORETIME,,2111-06-14 06:00:00,,timestamp,0.0,,,,14469,155925.0,,,
2,8493517,,OUTPUTEVENTS,CGID,,21570,,numeric,0.0,,,,14469,155925.0,,,
3,8493517,,OUTPUTEVENTS,STOPPED,,,,,,,,,14469,155925.0,,,
4,8493517,,OUTPUTEVENTS,NEWBOTTLE,,,,,,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085017,8674353,,OUTPUTEVENTS,STORETIME,,2136-10-22 10:29:00,,timestamp,0.0,,,,44625,163762.0,,,
1085018,8674353,,OUTPUTEVENTS,CGID,,20357,,numeric,0.0,,,,44625,163762.0,,,
1085019,8674353,,OUTPUTEVENTS,STOPPED,,,,,,,,,44625,163762.0,,,
1085020,8674353,,OUTPUTEVENTS,NEWBOTTLE,,,,,,,,,44625,163762.0,,,


In [211]:
# Mapping
mapping_rules = {
    'STORETIME':("date", np.nan), 
}

# 1. 매핑 함수 (값이 없고 기존 매핑도 없을 때만 매핑 시도)
def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    # 기존 매핑 유지
    if pd.notna(row["Mapping_info_1"]):
        return pd.Series([row["Mapping_info_1"], row["Mapping_info_2"]])
    
    # 새 매핑 적용 (단, value가 NaN이면 매핑 안함)
    if pd.isna(value):
        return pd.Series([np.nan, np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 2. 기존 Mapping_info 컬럼이 없으면 생성 (예외 방지)
if "Mapping_info_1" not in outputevents_quiq.columns:
    outputevents_quiq["Mapping_info_1"] = np.nan
    outputevents_quiq["Mapping_info_2"] = np.nan

# 3. 적용
outputevents_quiq[["Mapping_info_1", "Mapping_info_2"]] = outputevents_quiq.apply(map_mapping_info, axis=1)
outputevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8493517,40055.0,OUTPUTEVENTS,Urine Out Foley,2111-06-14 06:00:00,0.0,,numeric,0.0,,,,14469,155925.0,,event,chart_event
1,8493517,,OUTPUTEVENTS,STORETIME,,2111-06-14 06:00:00,,timestamp,0.0,,,,14469,155925.0,,date,
2,8493517,,OUTPUTEVENTS,CGID,,21570,,numeric,0.0,,,,14469,155925.0,,,
3,8493517,,OUTPUTEVENTS,STOPPED,,,,,,,,,14469,155925.0,,,
4,8493517,,OUTPUTEVENTS,NEWBOTTLE,,,,,,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085017,8674353,,OUTPUTEVENTS,STORETIME,,2136-10-22 10:29:00,,timestamp,0.0,,,,44625,163762.0,,date,
1085018,8674353,,OUTPUTEVENTS,CGID,,20357,,numeric,0.0,,,,44625,163762.0,,,
1085019,8674353,,OUTPUTEVENTS,STOPPED,,,,,,,,,44625,163762.0,,,
1085020,8674353,,OUTPUTEVENTS,NEWBOTTLE,,,,,,,,,44625,163762.0,,,


In [213]:
target_vars = ["CGID", "TEXT"]
condition_targets = outputevents_quiq["Variable_name"].isin(target_vars)

outputevents_quiq.loc[condition_targets, "Is_categorical"] = outputevents_quiq.loc[condition_targets, "Value"].apply(
    lambda v: 1 if pd.notna(v) else np.nan
)

cond_cgid = (
    (outputevents_quiq["Variable_name"] == "CGID") &
    (outputevents_quiq["Value"].notna())
)
outputevents_quiq.loc[cond_cgid, "Variable_type"] = "string"

outputevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8493517,40055.0,OUTPUTEVENTS,Urine Out Foley,2111-06-14 06:00:00,0.0,,numeric,0.0,,,,14469,155925.0,,event,chart_event
1,8493517,,OUTPUTEVENTS,STORETIME,,2111-06-14 06:00:00,,timestamp,0.0,,,,14469,155925.0,,date,
2,8493517,,OUTPUTEVENTS,CGID,,21570,,string,1.0,,,,14469,155925.0,,,
3,8493517,,OUTPUTEVENTS,STOPPED,,,,,,,,,14469,155925.0,,,
4,8493517,,OUTPUTEVENTS,NEWBOTTLE,,,,,,,,,14469,155925.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085017,8674353,,OUTPUTEVENTS,STORETIME,,2136-10-22 10:29:00,,timestamp,0.0,,,,44625,163762.0,,date,
1085018,8674353,,OUTPUTEVENTS,CGID,,20357,,string,1.0,,,,44625,163762.0,,,
1085019,8674353,,OUTPUTEVENTS,STOPPED,,,,,,,,,44625,163762.0,,,
1085020,8674353,,OUTPUTEVENTS,NEWBOTTLE,,,,,,,,,44625,163762.0,,,


In [214]:
outputevents_quiq.to_csv("G:/2000/MIMIC_outputevents_QUIQ.csv", index=False)

In [48]:
# VIA 테이블 생성
via_variable_names = ['STORETIME', 'CGID', 'STOPPED', 'NEWBOTTLE','ISERROR',
                     'Urine Out Foley', 'Foley', 'Chest Tube #1', 'Stool Out Stool',
       'Drain Out #1 JP Medial', 'Void', 'Chest Tubes CTICU CT 1',
       'Pre-Admission Output Pre-Admission Output', 'TF Residual',
       'Urine Out IleoConduit', 'R Pleural #1', 'pin site drain',
       'Urine Out Void', 'Cerebral Drain L Ventricular Drain',
       'JP Medial', 'Jackson Pratt #1', 'Drainage Bag', 'Chest Tube #2',
       'Urine .', 'Gastric Gastric Tube', 'Gastric Emesis',
       'Pre-Admission', 'Ultrafiltrate Ultrafiltrate', 'Suprapubic',
       'Wound Vac #1', 'Drain Out #4 Jackson Pratt',
       'Urine Out Condom Cath', 'Stool', 'Drain Out #4 Other',
       'Chest Tubes Left Pleural 1', 'Cerebral Ventricular #1',
       'JP Lateral', 'Urine Out Incontinent', 'Drain Out #2 JP Lateral',
       'Ostomy (output)', 'Gastric Nasogastric',
       'Drain Out #1 Jackson Pratt', 'Chest Tubes Mediastinal',
       'Chest Tubes Right Pleural 1',
       'Cerebral Drain R Ventricular Drain', 'Lumbar', 'Blood Out Lab',
       'OR Out PACU Urine', 'Ileoconduit', 'Chest Tubes CTICU CT 2',
       'Mediastinal', 'Stool Out Ostomy', 'Oral Gastric',
       'Stool Out Fecal Bag', 'Drain Out #3 Other', 'OR Out EBL',
       'Stool Out Rectal Tube', 'Jackson Pratt #2', 'Condom Cath',
       'Drain Out #1 Other', 'Gastric Tube', 'Repogle Tube NG', 'OR EBL',
       'Stool Out Ileostomy', 'Jackson Pratt #3',
       'Drain Out #2 Jackson Pratt', 'Hemovac #1', 'Rectal Tube',
       'Fecal Bag', 'Nasogastric', 'Pigtail #1', 'Urine Out Suprapubic',
       'R Nephrostomy', 'Drain Out #1 Hemovac', 'TF Residual Output',
       'Drain Out #5 Wound Vac', 'Gastric Oral Gastric',
       'GU Irrigant/Urine Volume Out', 'Urine Out Lt Nephrostomy',
       'Drain Out #1 JP Lateral', 'OR Out OR Urine', 'Pigtail #2',
       'Drain Out #1 T Tube', 'Drain Out #2 Other', 'BM20NG+thick&easy',
       'Cerebral Ventricular #2', 'Chest Tubes Right Pleural 2',
       'Cath Lab Output', 'Drain Out #1 Tap', 'GU Irrigant Volume In',
       'R Pleural #2', 'Ostomies Ileostomy', 'L Pleural #1',
       'L Nephrostomy', 'Stool Out Colostomy', 'Drain Out #3 T Tube',
       'OR Urine', 'Wound Vac #2', 'T Tube', 'Drain Out #2 T Tube',
       'JP 5 and JP 6 sxn', 'chest tube irrigant', 'LLQ Paracentesis sit',
       'Drain Out #3 Jackson Pratt', 'left chest drain', 'stool cc/kg/hr',
       'Drain Out #1 Pericardial', 'Drain Out #1 Lumbar',
       'Chest Tube R #3', 'Urine Out Other', 'Drain Out #1 Wound Vac',
       'Cath Lab', 'Drain Out #6 Other', 'Chest Tubes Other',
       'Chest Tubes Left Pleural 2', 'Tap', 'Drain Out #7 Jackson Pratt',
       'Drain Out #5 Other', 'PACU Out PACU Drains', 'left arm drainage',
       'Straight Cath', 'PACU Out PACU Urine',
       'Drain Out #5 Jackson Pratt', 'penrose drain outpt',
       'Urine Out Rt Nephrostomy', 'Stool Out (non-specific)',
       'Drain Out #1 Pigtail', 'Urine Out Ureteral Stent #1',
       'Drain Out #8 Jackson Pratt', 'Gastric Other', 'R Ureteral Stent',
       'Jackson Pratt #4', 'Emesis', 'Cerebral Drain Subdural',
       'PD drainage', 'thoracentesis', 'Drain Out #2 Wound Vac',
       'Stool .', 'L Pleural #2', 'JP #1 & #2 to CLWS',
       'PACU Out PACU NG', 'Urine Out Straight Cath', 'spit',
       'ANGIO URINE OUT', 'Gastric Jejunostomy Tube',
       'Cerebral Drain Other', 'D5W VIA J-TUBE', 'Drain Out #2 Hemovac',
       'drain flush', 'JP FLUSH', 'JP #1&#2 CLWS', 'Sump #1',
       'DRAIN FLUSH', 'Drain Out #2 Pigtail', 'Drain Out #1 Davol',
       'Cardiac output', 'PACU EBL', 'BM20PO+thick&easy', 'Jejunostomy',
       'Urine cc/k/hr', 'Cerebral Subdural #1', 'TRUE URINE',
       'Drain Out #3 Wound Vac', 'Stool Out Other', 'Gastric lavage',
       'True Urine', 'Drain Out #1 Jejunostomy Tube', 'mucous fistula',
       'G-TUBE TO GRAVITY', 'Hemovac #2', 'Urine Out Ureteral Stent #2',
       'Drain Out #1 Sump', 'Drain Out #6 Jackson Pratt',
       'Drain Out #4 Pigtail', 'Repogle Tube OGT',
       'Drain Out #2 JP Medial', 'Drain Out #1 Lt Nephrostomy',
       'Chest Tubes Chest Tube', 'Drain Out #3 Pigtail', 'Pericardial',
       'Drain Out #2 Sump', 'D5W via J-tube',
       'Drain Out #1 Rt Nephrostomy', 'PEG (BAG)', 'R chest drain bag',
       'R PLEURAL NS IRRIG', 'Dialysis', 'PACU Drains',
       'ascitic drainage', 'Drain Out #4 T Tube', 'Cerebral Subdural #2',
       'ER URINE', 'right pleural 3', 't tube flush', 'dialysis output',
       'Hemodialysis', 'Hemodialysis removal', 'L LOWER LEG DRAIN',
       'ngt/bile aspirate', 'paracentesis', 'Drain Out #3 Lumbar',
       'PACU Urine', 'left drain', 'Sump #2', 'pericard drain asp',
       'Chole tube', 'endo drainage', 'PACU Gastric',
       'right arm drainage', 'Left Arm drains.', 'Thoracentesis',
       'urine cc/kg/hr', 'emesis', 'T-TUBE', 'j-tube flush',
       'right drain', 'MUCOUS FISTULA', 'Gastric Blakemore',
       'JP LEFT LEG #2', 'J-tube flush', 'EP lab output',
       'Drain Out #3 JP Lateral', 'J Tube Flush', 'ostomy',
       'URINE CC/KG/HR', 'GASTRIC ASPIRATION', 'Dialysis indwelling',
       'Drain bag old CTsite', 'Drain Out #1 Penrose',
       'Drain Out #7 Pigtail', 'penrose drain, abd.', 'JP RIGHT LEG #1',
       'cardiac output', 'Drain Out #8 Pigtail', 'vac drain',
       'pleural tap', 'PACU Out EBL', 'hemodialysis', 'dialysis out',
       'HIP WOUND OUT', 'hd removed', 'Dialysis out', 'ED OUTPUT',
       'paracenteis drainag.', 'PD NET FLUID REMOVED', 'ER output',
       'JP Site.', 'stool/fistula', 'rt/lt nepro tube flh', 'VAC output',
       'JP 1&2', 'Drain Out #4 Penrose', 'DIALYSIS OUT',
       'Pheresis Output', 'NG out', 'LEFT ARM POUCH', 'ngt flush,h20',
       'urine cc/k/hr', '1/2strength impact', 'ed output',
       'Drain Out #3 Sump', 'Penrose #1', 'ORAL DRAINAGE',
       'Drain Out #2 Pericardial', 'JTUBE SITE DRAINAGE', 'spits',
       'Drain Out #2 Tap', 'gastric lavage', 'DIALYSIS OUTPUT', 'PEG',
       'abd drain', 'urine o/p cc/kg/hr', 'GU output total',
       'LLE Drainage bag', 'DIALYSIS', 'Drain Out #6 T Tube',
       'pleural fluid', 'r femoral drng', 'PD Volume Out.',
       'Drain Out #5 Jejunostomy Tube', 'peg output',
       'Drain Out #3 Hemovac', 'pleural tap output', 'VICU OUTPUT',
       'hd out', 'HD output', 'Cardiac Output', 'BM28 4Enf Powd 4 MCT',
       'ileostomy', 'dialysis', 'Blakemore', 'Cerebral Drain #1']

via_descriptions = ['records the time at which an observation was manually input or manually valaidated by a member of the clinical staff',
                    'the identifier for the aregiver who validated the given measurement',
                    'indicates if the order was disconnected at the given CHARTTIME',
                    'indicates that a new bag of solution was hung at the given CHARTTIME',
                    'A Metavision checkbox where a care giver can specify that an observation is an error. No other details are provided',
                    'Urine Out Foley', 'Foley', 'Chest Tube #1', 'Stool Out Stool',
       'Drain Out #1 JP Medial', 'Void', 'Chest Tubes CTICU CT 1',
       'Pre-Admission Output Pre-Admission Output', 'TF Residual',
       'Urine Out IleoConduit', 'R Pleural #1', 'pin site drain',
       'Urine Out Void', 'Cerebral Drain L Ventricular Drain',
       'JP Medial', 'Jackson Pratt #1', 'Drainage Bag', 'Chest Tube #2',
       'Urine .', 'Gastric Gastric Tube', 'Gastric Emesis',
       'Pre-Admission', 'Ultrafiltrate Ultrafiltrate', 'Suprapubic',
       'Wound Vac #1', 'Drain Out #4 Jackson Pratt',
       'Urine Out Condom Cath', 'Stool', 'Drain Out #4 Other',
       'Chest Tubes Left Pleural 1', 'Cerebral Ventricular #1',
       'JP Lateral', 'Urine Out Incontinent', 'Drain Out #2 JP Lateral',
       'Ostomy (output)', 'Gastric Nasogastric',
       'Drain Out #1 Jackson Pratt', 'Chest Tubes Mediastinal',
       'Chest Tubes Right Pleural 1',
       'Cerebral Drain R Ventricular Drain', 'Lumbar', 'Blood Out Lab',
       'OR Out PACU Urine', 'Ileoconduit', 'Chest Tubes CTICU CT 2',
       'Mediastinal', 'Stool Out Ostomy', 'Oral Gastric',
       'Stool Out Fecal Bag', 'Drain Out #3 Other', 'OR Out EBL',
       'Stool Out Rectal Tube', 'Jackson Pratt #2', 'Condom Cath',
       'Drain Out #1 Other', 'Gastric Tube', 'Repogle Tube NG', 'OR EBL',
       'Stool Out Ileostomy', 'Jackson Pratt #3',
       'Drain Out #2 Jackson Pratt', 'Hemovac #1', 'Rectal Tube',
       'Fecal Bag', 'Nasogastric', 'Pigtail #1', 'Urine Out Suprapubic',
       'R Nephrostomy', 'Drain Out #1 Hemovac', 'TF Residual Output',
       'Drain Out #5 Wound Vac', 'Gastric Oral Gastric',
       'GU Irrigant/Urine Volume Out', 'Urine Out Lt Nephrostomy',
       'Drain Out #1 JP Lateral', 'OR Out OR Urine', 'Pigtail #2',
       'Drain Out #1 T Tube', 'Drain Out #2 Other', 'BM20NG+thick&easy',
       'Cerebral Ventricular #2', 'Chest Tubes Right Pleural 2',
       'Cath Lab Output', 'Drain Out #1 Tap', 'GU Irrigant Volume In',
       'R Pleural #2', 'Ostomies Ileostomy', 'L Pleural #1',
       'L Nephrostomy', 'Stool Out Colostomy', 'Drain Out #3 T Tube',
       'OR Urine', 'Wound Vac #2', 'T Tube', 'Drain Out #2 T Tube',
       'JP 5 and JP 6 sxn', 'chest tube irrigant', 'LLQ Paracentesis sit',
       'Drain Out #3 Jackson Pratt', 'left chest drain', 'stool cc/kg/hr',
       'Drain Out #1 Pericardial', 'Drain Out #1 Lumbar',
       'Chest Tube R #3', 'Urine Out Other', 'Drain Out #1 Wound Vac',
       'Cath Lab', 'Drain Out #6 Other', 'Chest Tubes Other',
       'Chest Tubes Left Pleural 2', 'Tap', 'Drain Out #7 Jackson Pratt',
       'Drain Out #5 Other', 'PACU Out PACU Drains', 'left arm drainage',
       'Straight Cath', 'PACU Out PACU Urine',
       'Drain Out #5 Jackson Pratt', 'penrose drain outpt',
       'Urine Out Rt Nephrostomy', 'Stool Out (non-specific)',
       'Drain Out #1 Pigtail', 'Urine Out Ureteral Stent #1',
       'Drain Out #8 Jackson Pratt', 'Gastric Other', 'R Ureteral Stent',
       'Jackson Pratt #4', 'Emesis', 'Cerebral Drain Subdural',
       'PD drainage', 'thoracentesis', 'Drain Out #2 Wound Vac',
       'Stool .', 'L Pleural #2', 'JP #1 & #2 to CLWS',
       'PACU Out PACU NG', 'Urine Out Straight Cath', 'spit',
       'ANGIO URINE OUT', 'Gastric Jejunostomy Tube',
       'Cerebral Drain Other', 'D5W VIA J-TUBE', 'Drain Out #2 Hemovac',
       'drain flush', 'JP FLUSH', 'JP #1&#2 CLWS', 'Sump #1',
       'DRAIN FLUSH', 'Drain Out #2 Pigtail', 'Drain Out #1 Davol',
       'Cardiac output', 'PACU EBL', 'BM20PO+thick&easy', 'Jejunostomy',
       'Urine cc/k/hr', 'Cerebral Subdural #1', 'TRUE URINE',
       'Drain Out #3 Wound Vac', 'Stool Out Other', 'Gastric lavage',
       'True Urine', 'Drain Out #1 Jejunostomy Tube', 'mucous fistula',
       'G-TUBE TO GRAVITY', 'Hemovac #2', 'Urine Out Ureteral Stent #2',
       'Drain Out #1 Sump', 'Drain Out #6 Jackson Pratt',
       'Drain Out #4 Pigtail', 'Repogle Tube OGT',
       'Drain Out #2 JP Medial', 'Drain Out #1 Lt Nephrostomy',
       'Chest Tubes Chest Tube', 'Drain Out #3 Pigtail', 'Pericardial',
       'Drain Out #2 Sump', 'D5W via J-tube',
       'Drain Out #1 Rt Nephrostomy', 'PEG (BAG)', 'R chest drain bag',
       'R PLEURAL NS IRRIG', 'Dialysis', 'PACU Drains',
       'ascitic drainage', 'Drain Out #4 T Tube', 'Cerebral Subdural #2',
       'ER URINE', 'right pleural 3', 't tube flush', 'dialysis output',
       'Hemodialysis', 'Hemodialysis removal', 'L LOWER LEG DRAIN',
       'ngt/bile aspirate', 'paracentesis', 'Drain Out #3 Lumbar',
       'PACU Urine', 'left drain', 'Sump #2', 'pericard drain asp',
       'Chole tube', 'endo drainage', 'PACU Gastric',
       'right arm drainage', 'Left Arm drains.', 'Thoracentesis',
       'urine cc/kg/hr', 'emesis', 'T-TUBE', 'j-tube flush',
       'right drain', 'MUCOUS FISTULA', 'Gastric Blakemore',
       'JP LEFT LEG #2', 'J-tube flush', 'EP lab output',
       'Drain Out #3 JP Lateral', 'J Tube Flush', 'ostomy',
       'URINE CC/KG/HR', 'GASTRIC ASPIRATION', 'Dialysis indwelling',
       'Drain bag old CTsite', 'Drain Out #1 Penrose',
       'Drain Out #7 Pigtail', 'penrose drain, abd.', 'JP RIGHT LEG #1',
       'cardiac output', 'Drain Out #8 Pigtail', 'vac drain',
       'pleural tap', 'PACU Out EBL', 'hemodialysis', 'dialysis out',
       'HIP WOUND OUT', 'hd removed', 'Dialysis out', 'ED OUTPUT',
       'paracenteis drainag.', 'PD NET FLUID REMOVED', 'ER output',
       'JP Site.', 'stool/fistula', 'rt/lt nepro tube flh', 'VAC output',
       'JP 1&2', 'Drain Out #4 Penrose', 'DIALYSIS OUT',
       'Pheresis Output', 'NG out', 'LEFT ARM POUCH', 'ngt flush,h20',
       'urine cc/k/hr', '1/2strength impact', 'ed output',
       'Drain Out #3 Sump', 'Penrose #1', 'ORAL DRAINAGE',
       'Drain Out #2 Pericardial', 'JTUBE SITE DRAINAGE', 'spits',
       'Drain Out #2 Tap', 'gastric lavage', 'DIALYSIS OUTPUT', 'PEG',
       'abd drain', 'urine o/p cc/kg/hr', 'GU output total',
       'LLE Drainage bag', 'DIALYSIS', 'Drain Out #6 T Tube',
       'pleural fluid', 'r femoral drng', 'PD Volume Out.',
       'Drain Out #5 Jejunostomy Tube', 'peg output',
       'Drain Out #3 Hemovac', 'pleural tap output', 'VICU OUTPUT',
       'hd out', 'HD output', 'Cardiac Output', 'BM28 4Enf Powd 4 MCT',
       'ileostomy', 'dialysis', 'Blakemore', 'Cerebral Drain #1'

]

via_outputevents = pd.DataFrame({
    'Original_table_name': 'OUTPUTEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_outputevents

Unnamed: 0,Original_table_name,Variable_name,Description
0,OUTPUTEVENTS,STORETIME,records the time at which an observation was m...
1,OUTPUTEVENTS,CGID,the identifier for the aregiver who validated ...
2,OUTPUTEVENTS,STOPPED,indicates if the order was disconnected at the...
3,OUTPUTEVENTS,NEWBOTTLE,indicates that a new bag of solution was hung ...
4,OUTPUTEVENTS,ISERROR,A Metavision checkbox where a care giver can s...
...,...,...,...
287,OUTPUTEVENTS,BM28 4Enf Powd 4 MCT,BM28 4Enf Powd 4 MCT
288,OUTPUTEVENTS,ileostomy,ileostomy
289,OUTPUTEVENTS,dialysis,dialysis
290,OUTPUTEVENTS,Blakemore,Blakemore


In [49]:
via_outputevents.to_csv("G:/2000/MIMIC_ouputevents_VIA.csv", index=False)

# Procedureevents_mv

In [215]:
procedureevents_mv = pd.read_csv('PROCEDUREEVENTS_MV.csv.gz', compression='gzip')
procedureevents_mv

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYNAME,SECONDARYORDERCATEGORYNAME,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE
0,379,29070,115071,232563.0,2145-03-12 23:04:00,2145-03-12 23:05:00,225401,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
1,380,29070,115071,232563.0,2145-03-12 23:04:00,2145-03-12 23:05:00,225454,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
2,381,29070,115071,232563.0,2145-03-12 23:05:00,2145-03-18 20:01:00,225792,8456.0,hour,,...,Ventilation,,Task,1,0,0,FinishedRunning,,,
3,382,29070,115071,232563.0,2145-03-12 23:36:00,2145-03-12 23:37:00,225402,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
4,383,29070,115071,232563.0,2145-03-13 01:27:00,2145-03-16 16:00:00,224560,5193.0,min,Right IJ,...,Invasive Lines,,Task,1,0,0,FinishedRunning,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258061,257337,41958,182711,246007.0,2155-08-09 01:51:00,2155-08-10 11:20:00,225204,2009.0,min,Right Antecube,...,Invasive Lines,,Task,1,0,0,FinishedRunning,,,
258062,257338,41958,182711,246007.0,2155-08-09 01:57:00,2155-08-09 01:58:00,225402,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
258063,257339,41958,182711,246007.0,2155-08-09 01:57:00,2155-08-09 01:58:00,225459,1.0,,,...,Imaging,,Electrolytes,0,0,0,FinishedRunning,,,
258064,257340,41958,182711,246007.0,2155-08-09 01:57:00,2155-08-09 01:58:00,225966,1.0,,,...,Procedures,,Electrolytes,0,0,1,Rewritten,,RN,2155-08-09 02:20:00


In [216]:
merged_procedure = pd.merge(
    procedureevents_mv,
    d_item[['ITEMID', 'LABEL', 'CATEGORY']],
    how='left',
    on='ITEMID'
)

# 조인 안 된 행만 필터링 (LABEL 또는 CATEGORY가 NaN인 경우)
unmatched_p = merged_procedure[merged_procedure['LABEL'].isna()]

# 결과 확인
unmatched_p

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,LABEL,CATEGORY


In [217]:
procedureevents_mv_df = merged_procedure

In [218]:
procedureevents_mv_df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTTIME', 'ENDTIME',
       'ITEMID', 'VALUE', 'VALUEUOM', 'LOCATION', 'LOCATIONCATEGORY',
       'STORETIME', 'CGID', 'ORDERID', 'LINKORDERID', 'ORDERCATEGORYNAME',
       'SECONDARYORDERCATEGORYNAME', 'ORDERCATEGORYDESCRIPTION', 'ISOPENBAG',
       'CONTINUEINNEXTDEPT', 'CANCELREASON', 'STATUSDESCRIPTION',
       'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY', 'COMMENTS_DATE', 'LABEL',
       'CATEGORY'],
      dtype='object')

In [219]:
procedureevents_mv_df = procedureevents_mv_df[procedureevents_mv_df["SUBJECT_ID"].isin(la)].reset_index(drop=True)
procedureevents_mv_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,LABEL,CATEGORY
0,987,27366,174736,248077.0,2143-03-01 04:23:00,2143-03-02 07:25:00,224277,1622.0,min,R Antecube,...,Task,1,0,0,FinishedRunning,,,,18 Gauge,Access Lines - Peripheral
1,988,27366,174736,248077.0,2143-03-01 04:24:00,2143-03-01 07:00:00,224275,156.0,min,R Hand,...,Task,1,0,0,FinishedRunning,,,,20 Gauge,Access Lines - Peripheral
2,989,27366,174736,248077.0,2143-03-01 07:45:00,2143-03-01 07:46:00,224385,1.0,,,...,Electrolytes,0,0,0,FinishedRunning,,,,Intubation,1-Intubation/Extubation
3,990,27366,174736,248077.0,2143-03-01 08:00:00,2143-03-01 09:24:00,224277,84.0,min,R Hand,...,Task,1,0,0,FinishedRunning,,,,18 Gauge,Access Lines - Peripheral
4,991,27366,174736,248077.0,2143-03-01 08:31:00,2143-03-01 08:32:00,224385,1.0,,,...,Electrolytes,0,0,2,Rewritten,RN,,2143-03-01 08:32:00,Intubation,1-Intubation/Extubation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10296,257762,45910,153970,249864.0,2197-10-25 20:47:00,2197-10-27 14:47:00,224275,2520.0,min,,...,Task,1,0,0,FinishedRunning,,,,20 Gauge,Access Lines - Peripheral
10297,257763,45910,153970,249864.0,2197-10-25 21:18:00,2197-10-26 07:52:00,224268,634.0,min,Right IJ,...,Task,1,0,0,FinishedRunning,,,,Trauma line,Access Lines - Invasive
10298,257764,45910,153970,249864.0,2197-10-26 08:11:00,2197-10-27 12:38:00,224268,1707.0,min,Right IJ,...,Task,1,0,0,FinishedRunning,,,,Trauma line,Access Lines - Invasive
10299,257765,45910,153970,249864.0,2197-10-26 17:05:00,2197-10-26 17:06:00,227194,1.0,,,...,Electrolytes,0,0,0,FinishedRunning,,,,Extubation,1-Intubation/Extubation


In [220]:
# 결과 저장 리스트
rows = []

# 제외할 컬럼
exclude_cols = {'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'VALUEUOM', 'CHARTTIME', 'ITEMID', 'CATEGORY'}

for index, row in procedureevents_mv_df.iterrows():
    patient_id = row["SUBJECT_ID"]
    admission_id = row["HADM_ID"]
    
    for col in procedureevents_mv_df.columns:
        if col not in exclude_cols:
            value = row[col]

            # Unit 처리 조건
            if col == "VALUE":
                unit_val = row["VALUEUOM"] if "VALUEUOM" in procedureevents_mv_df.columns else np.nan
            else:
                unit_val = np.nan
                
            # Event_date 설정
            if col == "LABEL":
                event_date_val = row["STARTTIME"] if "STARTTIME" in procedureevents_mv_df.columns else np.nan
            else:
                event_date_val = np.nan
                
            # Variable_ID 설정: LABEL일 때만 ITEMID 사용
            variable_id_val = row["ITEMID"] if col == "LABEL" and "ITEMID" in procedureevents_mv_df.columns else np.nan

            rows.append({
                "Primary_key": index + 8785918, 
                "Variable_ID": variable_id_val,
                "Original_table_name": "OUTPUTEVENTS",
                "Variable_name": col,
                "Event_date": event_date_val,
                "Value": value,
                "Unit": unit_val,
                "Variable_type": np.nan,      # 나중에 설정
                "Is_categorical": np.nan,     # 나중에 설정
                "Recorder": np.nan,
                "Recorder_position": np.nan,
                "Recorder_affiliation": np.nan,
                "Patient_id": patient_id,
                "Admission_id": admission_id,
                "Ground_truth": np.nan,
                "Mapping_info_1": np.nan,
                "Mapping_info_2": np.nan
            })

# QUIQ 테이블로 변환
procedureevents_quiq = pd.DataFrame(rows, columns=QUIQ_cols)

# -------------------------------
# Variable_type 추론 함수
def infer_variable_type(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, (int, float)):
        return "numeric"
    elif isinstance(val, pd.Timestamp):
        return "timestamp"
    elif isinstance(val, str):
        # 먼저 숫자형 가능한지 체크
        try:
            float(val)  # 문자열이 숫자면 float 변환 성공
            return "numeric"
        except ValueError:
            pass

        # 다음으로 timestamp 가능한지 체크
        try:
            pd.to_datetime(val, errors="raise")
            return "timestamp"
        except Exception:
            return "string"
    else:
        return "unknown"

# Variable_type 컬럼 채우기
procedureevents_quiq["Variable_type"] = procedureevents_quiq["Value"].apply(infer_variable_type)

# -------------------------------
CATEGORICAL_THRESHOLD = 10

# 고유값 수 계산
value_counts = procedureevents_quiq.groupby("Variable_name")["Value"].nunique()

# 기준 이하인 변수 목록
categorical_vars = value_counts[value_counts <= CATEGORICAL_THRESHOLD].index

# Is_categorical 설정 (Value가 없으면 NaN)
procedureevents_quiq["Is_categorical"] = procedureevents_quiq.apply(
    lambda row: np.nan if pd.isna(row["Value"])
    else 1 if row["Variable_name"] in categorical_vars
    else 0,
    axis=1
)
procedureevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8785918,,OUTPUTEVENTS,STARTTIME,,2143-03-01 04:23:00,,timestamp,0.0,,,,27366,174736,,,
1,8785918,,OUTPUTEVENTS,ENDTIME,,2143-03-02 07:25:00,,timestamp,0.0,,,,27366,174736,,,
2,8785918,,OUTPUTEVENTS,VALUE,,1622.0,min,numeric,0.0,,,,27366,174736,,,
3,8785918,,OUTPUTEVENTS,LOCATION,,R Antecube,,string,0.0,,,,27366,174736,,,
4,8785918,,OUTPUTEVENTS,LOCATIONCATEGORY,,Peripheral - old,,string,1.0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206015,8796218,,OUTPUTEVENTS,STATUSDESCRIPTION,,FinishedRunning,,string,1.0,,,,45910,153970,,,
206016,8796218,,OUTPUTEVENTS,COMMENTS_EDITEDBY,,,,,,,,,45910,153970,,,
206017,8796218,,OUTPUTEVENTS,COMMENTS_CANCELEDBY,,,,,,,,,45910,153970,,,
206018,8796218,,OUTPUTEVENTS,COMMENTS_DATE,,,,,,,,,45910,153970,,,


In [223]:
procedureevents_quiq.head(30)

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8785918,,OUTPUTEVENTS,STARTTIME,,2143-03-01 04:23:00,,timestamp,0.0,,,,27366,174736,,,
1,8785918,,OUTPUTEVENTS,ENDTIME,,2143-03-02 07:25:00,,timestamp,0.0,,,,27366,174736,,,
2,8785918,,OUTPUTEVENTS,VALUE,,1622.0,min,numeric,0.0,,,,27366,174736,,,
3,8785918,,OUTPUTEVENTS,LOCATION,,R Antecube,,string,1.0,,,,27366,174736,,,
4,8785918,,OUTPUTEVENTS,LOCATIONCATEGORY,,Peripheral - old,,string,1.0,,,,27366,174736,,,
5,8785918,,OUTPUTEVENTS,STORETIME,,2143-03-02 07:32:00,,timestamp,0.0,,,,27366,174736,,,
6,8785918,,OUTPUTEVENTS,CGID,,21297,,string,1.0,,,,27366,174736,,,
7,8785918,,OUTPUTEVENTS,ORDERID,,7079526,,string,1.0,,,,27366,174736,,,
8,8785918,,OUTPUTEVENTS,LINKORDERID,,7079526,,string,1.0,,,,27366,174736,,,
9,8785918,,OUTPUTEVENTS,ORDERCATEGORYNAME,,Peripheral Lines,,string,1.0,,,,27366,174736,,,


In [222]:
# 2. LINKORDERID, ORDERID, LABEL, ORIGINAL → Is_categorical = 1 or NaN (if Value is NaN)
target_vars = ["LINKORDERID", "ORDERID", "CGID","LABEL", "ORDERCATEGORYNAME", "SECONDARYORDERCATEGORYNAME", "LOCATION"]
condition_targets = procedureevents_quiq["Variable_name"].isin(target_vars)

procedureevents_quiq.loc[condition_targets, "Is_categorical"] = procedureevents_quiq.loc[condition_targets, "Value"].apply(
    lambda v: 1 if pd.notna(v) else np.nan
)

# 3. LINKORDERID, ORDERID → Variable_type = "string" (only if Value is not null)
cond_linkorderid = (
    (procedureevents_quiq["Variable_name"] == "LINKORDERID") &
    (procedureevents_quiq["Value"].notna())
)
procedureevents_quiq.loc[cond_linkorderid, "Variable_type"] = "string"

cond_orderid = (
    (procedureevents_quiq["Variable_name"] == "ORDERID") &
    (procedureevents_quiq["Value"].notna())
)
procedureevents_quiq.loc[cond_orderid, "Variable_type"] = "string"

cond_cgid = (
    (procedureevents_quiq["Variable_name"] == "CGID") &
    (procedureevents_quiq["Value"].notna())
)
procedureevents_quiq.loc[cond_cgid, "Variable_type"] = "string"

procedureevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8785918,,OUTPUTEVENTS,STARTTIME,,2143-03-01 04:23:00,,timestamp,0.0,,,,27366,174736,,,
1,8785918,,OUTPUTEVENTS,ENDTIME,,2143-03-02 07:25:00,,timestamp,0.0,,,,27366,174736,,,
2,8785918,,OUTPUTEVENTS,VALUE,,1622.0,min,numeric,0.0,,,,27366,174736,,,
3,8785918,,OUTPUTEVENTS,LOCATION,,R Antecube,,string,1.0,,,,27366,174736,,,
4,8785918,,OUTPUTEVENTS,LOCATIONCATEGORY,,Peripheral - old,,string,1.0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206015,8796218,,OUTPUTEVENTS,STATUSDESCRIPTION,,FinishedRunning,,string,1.0,,,,45910,153970,,,
206016,8796218,,OUTPUTEVENTS,COMMENTS_EDITEDBY,,,,,,,,,45910,153970,,,
206017,8796218,,OUTPUTEVENTS,COMMENTS_CANCELEDBY,,,,,,,,,45910,153970,,,
206018,8796218,,OUTPUTEVENTS,COMMENTS_DATE,,,,,,,,,45910,153970,,,


In [224]:
# 1. 매핑 함수 (값이 없고 기존 매핑도 없을 때만 매핑 시도)
mapping_rules = {
    'STARTTIME':("date", np.nan),
    'ENDTIME':("date", np.nan), 
    'COMMENTS_DATE': ("date", np.nan),
    'STORETIME': ("date", np.nan),
    'LABEL': ("procedure", np.nan)
}

def map_mapping_info(row):
    var_name = row["Variable_name"]
    value = row["Value"]

    # 기존 매핑 유지
    if pd.notna(row["Mapping_info_1"]):
        return pd.Series([row["Mapping_info_1"], row["Mapping_info_2"]])
    
    # 새 매핑 적용 (단, value가 NaN이면 매핑 안함)
    if pd.isna(value):
        return pd.Series([np.nan, np.nan])
    else:
        return pd.Series(mapping_rules.get(var_name, (np.nan, np.nan)))

# 2. 기존 Mapping_info 컬럼이 없으면 생성 (예외 방지)
if "Mapping_info_1" not in procedureevents_quiq.columns:
    procedureevents_quiq["Mapping_info_1"] = np.nan
    procedureevents_quiq["Mapping_info_2"] = np.nan

# 3. 적용
procedureevents_quiq[["Mapping_info_1", "Mapping_info_2"]] = procedureevents_quiq.apply(map_mapping_info, axis=1)
procedureevents_quiq

Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,8785918,,OUTPUTEVENTS,STARTTIME,,2143-03-01 04:23:00,,timestamp,0.0,,,,27366,174736,,date,
1,8785918,,OUTPUTEVENTS,ENDTIME,,2143-03-02 07:25:00,,timestamp,0.0,,,,27366,174736,,date,
2,8785918,,OUTPUTEVENTS,VALUE,,1622.0,min,numeric,0.0,,,,27366,174736,,,
3,8785918,,OUTPUTEVENTS,LOCATION,,R Antecube,,string,1.0,,,,27366,174736,,,
4,8785918,,OUTPUTEVENTS,LOCATIONCATEGORY,,Peripheral - old,,string,1.0,,,,27366,174736,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206015,8796218,,OUTPUTEVENTS,STATUSDESCRIPTION,,FinishedRunning,,string,1.0,,,,45910,153970,,,
206016,8796218,,OUTPUTEVENTS,COMMENTS_EDITEDBY,,,,,,,,,45910,153970,,,
206017,8796218,,OUTPUTEVENTS,COMMENTS_CANCELEDBY,,,,,,,,,45910,153970,,,
206018,8796218,,OUTPUTEVENTS,COMMENTS_DATE,,,,,,,,,45910,153970,,,


In [225]:
procedureevents_quiq.to_csv("G:/2000/MIMIC_procedureevents_QUIQ.csv", index=False)

In [103]:
procedureevents_quiq['Variable_name'].unique()

array(['STARTTIME', 'ENDTIME', 'VALUE', 'LOCATION', 'LOCATIONCATEGORY',
       'STORETIME', 'CGID', 'ORDERID', 'LINKORDERID', 'ORDERCATEGORYNAME',
       'SECONDARYORDERCATEGORYNAME', 'ORDERCATEGORYDESCRIPTION',
       'ISOPENBAG', 'CONTINUEINNEXTDEPT', 'CANCELREASON',
       'STATUSDESCRIPTION', 'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY',
       'COMMENTS_DATE', 'LABEL'], dtype=object)

In [104]:
# VIA 테이블 생성
via_variable_names = ['STARTTIME', 
                      'ENDTIME', 'VALUE', 'LOCATION', 'LOCATIONCATEGORY',
                      'STORETIME', 'CGID', 'ORDERID', 'LINKORDERID', 
                      'ORDERCATEGORYNAME', 'SECONDARYORDERCATEGORYNAME', 'ORDERCATEGORYDESCRIPTION',
       'ISOPENBAG', 'CONTINUEINNEXTDEPT', 'CANCELREASON',
       'STATUSDESCRIPTION', 'COMMENTS_EDITEDBY', 'COMMENTS_CANCELEDBY',
       'COMMENTS_DATE', 'LABEL']

via_descriptions = [
                    'record the start time of an input/output event',
                    'record the end time of an input/ouput event',
                    'Value of procedure',
                    'Location where the procedure was performed (e.g., right hand, left digital)',
                    'Category of the location',
    
                    'records the date and time at which a note was saved into the system',
                    'identifier for the caregiver who validated the given measurement',
                    "links multiple items contatined in the same solution together",
                    "links the same order across multiple instantiations",
                    
                    'provide higher level information about the order the medication/solution is a part of. Categories represent the type of administration.',
                    'provide higher level information about the order the medication/solution is a part of. Categories represent the type of administration.',
                    'provide higher level information about the order the medication/solution is a part of. Describes the role of the substance in the solution',
                    
                    'whether the order was from an open bag',
                    'if the order ended on patient transfer, this field indicates if it continued into the next department(e.g. a floor)',
                    'if the order was canceled, this provides some explanation',
                    
                    'description of status',
                    'specifies if the order was edited or canceled, and if so, the date and job title of the care giver who canceled or edited it',
                    'specifies if the order was edited or canceled, and if so, the date and job title of the care giver who canceled or edited it',
                    'specifies if the order was edited or canceled, and if so, the date and job title of the care giver who canceled or edited it',
                    'procedure name'
                    
                   ]


via_procedureevents = pd.DataFrame({
    'Original_table_name': 'PROCEDUREEVENTS',
    'Variable_name': via_variable_names,
    'Description': via_descriptions
})
via_procedureevents

Unnamed: 0,Original_table_name,Variable_name,Description
0,PROCEDUREEVENTS,STARTTIME,record the start time of an input/output event
1,PROCEDUREEVENTS,ENDTIME,record the end time of an input/ouput event
2,PROCEDUREEVENTS,VALUE,Value of procedure
3,PROCEDUREEVENTS,LOCATION,Location where the procedure was performed (e....
4,PROCEDUREEVENTS,LOCATIONCATEGORY,Category of the location
5,PROCEDUREEVENTS,STORETIME,records the date and time at which a note was ...
6,PROCEDUREEVENTS,CGID,identifier for the caregiver who validated the...
7,PROCEDUREEVENTS,ORDERID,links multiple items contatined in the same so...
8,PROCEDUREEVENTS,LINKORDERID,links the same order across multiple instantia...
9,PROCEDUREEVENTS,ORDERCATEGORYNAME,provide higher level information about the ord...


In [105]:
via_procedureevents.to_csv("G:/2000/MIMIC_procedureevents_VIA.csv", index=False)

----

In [3]:
import os
import glob
import pandas as pd

  from pandas.core import (


In [5]:
directory = "G:/2000/QUIQ"

file_pattern = os.path.join(directory, 'MIMIC_*_QUIQ.csv')
csv_files = glob.glob(file_pattern)

df_list = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)
combined_df = combined_df.sort_values(by='Primary_key').reset_index(drop=True)

combined_df

  df_list = [pd.read_csv(file) for file in csv_files]
  df_list = [pd.read_csv(file) for file in csv_files]


Unnamed: 0,Primary_key,Variable_ID,Original_table_name,Variable_name,Event_date,Value,Unit,Variable_type,Is_categorical,Recorder,Recorder_position,Recorder_affiliation,Patient_id,Admission_id,Ground_truth,Mapping_info_1,Mapping_info_2
0,1,,PATIENTS,EXPIRE_FLAG,,0,,numeric,1.0,,,,253,,,,
1,1,,PATIENTS,DOD_SSN,,,,,0.0,,,,253,,,,
2,1,,PATIENTS,DOD_HOSP,,,,,0.0,,,,253,,,,
3,1,,PATIENTS,GENDER,,F,,string,1.0,,,,253,,,,
4,1,,PATIENTS,DOB,,2089-11-26 00:00:00,,timestamp,0.0,,,,253,,,date,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28451754,9796218,,CHARTEVENTS,WARNING,,0.0,,numeric,1.0,,,,87522,167396.0,,,
28451755,9796218,,CHARTEVENTS,ERROR,,0.0,,numeric,1.0,,,,87522,167396.0,,,
28451756,9796218,,CHARTEVENTS,RESULTSTATUS,,,,,,,,,87522,167396.0,,,
28451757,9796218,,CHARTEVENTS,STOPPED,,,,,,,,,87522,167396.0,,,


In [6]:
combined_df.to_csv("G:/2000/MIMIC_QUIQ.csv", index=False)

In [228]:
directory = "G:/2000/"

file_pattern = os.path.join(directory, 'MIMIC_*_VIA.csv')
csv_files = glob.glob(file_pattern)

df_list = [pd.read_csv(file) for file in csv_files]
combined_via = pd.concat(df_list, ignore_index=True)

combined_via

Unnamed: 0,Original_table_name,Variable_name,Description
0,ADMISSIONS,ADMITTIME,Timestamp when the patient was admitted to the...
1,ADMISSIONS,DISCHTIME,Timestamp when the patient was discharged from...
2,ADMISSIONS,DEATHTIME,Timestamp of death (if the patient died during...
3,ADMISSIONS,ADMISSION_TYPE,"Type of admission, such as emergency, urgent, ..."
4,ADMISSIONS,ADMISSION_LOCATION,Location from which the patient was admitted (...
...,...,...,...
3489,TRANSFERS,PREV_WARDID,the previous ward in which the patient stayed
3490,TRANSFERS,CURR_WARDID,the current ward in which the patient stayed
3491,TRANSFERS,INTIME,the date and time the patient was transferred ...
3492,TRANSFERS,OUTTIME,the date and time the patient was transferred ...


In [229]:
combined_via.to_csv("G:/2000/MIMIC_VIA.csv", index=False)