# Init

In [1]:
cd ..

/Users/wliao0504/code/clif/CLIF-MIMIC


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [45]:
# src/tables/patient.py
import numpy as np
import pandas as pd
import duckdb
import logging
from importlib import reload
import src.utils
reload(src.utils)
from src.utils import construct_mapper_dict, load_mapping_csv, \
    rename_and_reorder_cols, save_to_rclif, setup_logging, mimic_table_pathfinder, \
    clif_table_pathfinder, read_from_rclif

loaded configuration from /Users/wliao0504/code/clif/CLIF-MIMIC/src/../config/config.json


In [3]:
# Initialise the project's logging.
setup_logging()

# Column names of the patient table, one per line.
PATIENT_COL_NAMES = [
    "patient_id",
    "race_name",
    "race_category",
    "ethnicity_name",
    "ethnicity_category",
    "sex_name",
    "sex_category",
    "birth_date",
    "death_dttm",
    "language_name",
    "language_category",
]

2025-05-10 15:51:25,380 - INFO - initialized logging at logs/etl.log


In [4]:
# Both mapper dicts come from the same mapping table and are keyed on "mimic_race";
# they differ only in the target column.
race_ethnicity_mapping = load_mapping_csv("race_ethnicity")
race_mapper_dict, ethnicity_mapper_dict = (
    construct_mapper_dict(race_ethnicity_mapping, "mimic_race", target_col)
    for target_col in ("race", "ethnicity")
)

# load mimic data
mimic_patients, mimic_admissions = (
    pd.read_parquet(mimic_table_pathfinder(table_name))
    for table_name in ("patients", "admissions")
)

# Dev

## language

In [35]:
# Load the language category spreadsheet and list the distinct category values.
# NOTE(review): presumably these are the permissible language_category values
# that LANGUAGE_MAPPER must map into — confirm.
language_df = pd.read_excel(io="data/data_models/language_category.1.xlsx")
language_df.loc[:, "language_category"].unique()

In [9]:
# Frequency of each raw language value across admissions;
# dropna=False keeps the missing-value bucket in the tally.
mimic_admissions.value_counts(subset="language", dropna=False)

language
English                   492316
Spanish                    18303
Russian                     8143
Chinese                     7598
Kabuverdianu                4801
Portuguese                  2938
Haitian                     2518
Other                       1392
Vietnamese                  1170
Italian                     1000
Modern Greek (1453-)         880
NaN                          775
Arabic                       676
American Sign Language       522
Persian                      454
Polish                       417
Korean                       360
Thai                         288
Khmer                        264
Amharic                      254
Hindi                        191
French                       191
Somali                       160
Japanese                     158
Bengali                      152
Armenian                     107
Name: count, dtype: int64

In [28]:
# Raw MIMIC `language` value -> language_category.
LANGUAGE_MAPPER = {
    # Languages whose category is spelled exactly like the raw value.
    **{
        name: name
        for name in (
            "English",
            "Spanish",
            "Russian",
            "Chinese",
            "Portuguese",
            "Vietnamese",
            "Italian",
            "Arabic",
            "Persian",
            "Polish",
            "Korean",
            "Khmer",
            "Hindi",
            "French",
            "Japanese",
            "Bengali",
            "Armenian",
        )
    },
    # Languages that are renamed or folded into a broader category.
    "Kabuverdianu": "Portuguese",
    "Haitian": "Haitian Creole",
    "Other": "Other and unspecified languages",
    "Modern Greek (1453-)": "Greek",
    "American Sign Language": "Sign Language",
    "Thai": "Thai, Lao, or other Tai-Kadai languages",
    "Amharic": "Amharic, Somali, or other Afro-Asiatic languages",
    "Somali": "Amharic, Somali, or other Afro-Asiatic languages",
    # Missing language value.
    None: "Unknown or NA",  # NOTE: test this
}

Test whether any patient has conflicting languages recorded across multiple encounters.

In [23]:
# One row per admission: patient, hospitalization, raw language and admission
# time, read from the MIMIC admissions table.
query = f"""
SELECT 
    subject_id as patient_id, 
    hadm_id as hospitalization_id,
    language as language_name,
    admittime as admittime
FROM '{mimic_table_pathfinder("admissions")}'
"""
df = duckdb.query(query).df()
# Map with the LANGUAGE_MAPPER constant defined earlier in the notebook. The
# lowercase `language_mapper` name is not defined anywhere in this notebook and
# raises NameError when the notebook is run top to bottom on a fresh kernel.
df["language_category"] = df["language_name"].map(LANGUAGE_MAPPER)

The following shows that inconsistent languages across a patient's encounters would not be a concern:

In [15]:
# Per patient: one language_category (picked with FIRST) and the number of
# distinct raw language values recorded across that patient's rows in `df`.
# NOTE(review): the HAVING filter is commented out, so every patient is
# returned, not only those with more than one distinct language — confirm
# that this is the intended check.
query = """
SELECT 
    patient_id,
    FIRST(language_category) as language_category,
    COUNT(DISTINCT language_name) AS unique_lang_count,
FROM df
/* WHERE race_category NOT IN ('Other', 'Unknown') OR ethnicity_category NOT IN ('Other', 'Unknown') */
GROUP BY patient_id
-- HAVING unique_lang_count > 1
"""
# DuckDB resolves `df` in the FROM clause to the pandas DataFrame of that name.
df2 = duckdb.query(query).df()
df2.head()

Unnamed: 0,patient_id,language_category,unique_lang_count
0,19374186,English,1
1,19374315,English,1
2,19374373,English,1
3,19374379,English,1
4,19374436,English,1


In [25]:
# Admissions whose raw language is missing, to inspect how they were categorised.
df.loc[df["language_name"].isna()]

Unnamed: 0,patient_id,hospitalization_id,language_name,admittime,language_category
597,10010424,28388172,,2164-05-27 17:47:00,Unknown or NA
2677,10049642,26714941,,2177-02-20 03:15:00,Unknown or NA
2767,10051939,29623973,,2112-05-29 15:48:00,Unknown or NA
3665,10071605,21911997,,2156-08-17 20:17:00,Unknown or NA
5325,10107262,27069114,,2158-04-18 01:22:00,Unknown or NA
...,...,...,...,...,...
543448,19952742,29264490,,2149-10-12 12:00:00,Unknown or NA
545066,19984259,20109446,,2153-11-29 23:43:00,Unknown or NA
545067,19984259,26619670,,2154-01-28 21:29:00,Unknown or NA
545068,19984259,26869546,,2153-12-09 21:06:00,Unknown or NA


In [29]:
# Collapse admissions to one row per patient, keeping a single language per
# subject_id via FIRST, then map it to its category with LANGUAGE_MAPPER.
# NOTE(review): FIRST is used without an explicit ordering, so which
# admission's language is kept presumably depends on scan order — confirm
# against the DuckDB documentation before relying on it.
query = f"""
SELECT 
    subject_id as patient_id,
    FIRST(language) as language_name
FROM '{mimic_table_pathfinder("admissions")}'
GROUP BY subject_id
"""
df = duckdb.query(query).df()
df["language_category"] = df["language_name"].map(LANGUAGE_MAPPER)
df

Unnamed: 0,patient_id,language_name,language_category
0,19084358,English,English
1,19084360,English,English
2,19085455,English,English
3,19085662,English,English
4,19086156,Russian,Russian
...,...,...,...
223447,19082444,English,English
223448,19082488,English,English
223449,19082696,English,English
223450,19083252,English,English


## race and ethnicity

In [5]:
# Step 1: one row per admission. The MIMIC `race` column is selected twice so
# that the same raw value can feed both the race and the ethnicity mapping.
query = """
SELECT 
    subject_id as patient_id, 
    hadm_id as hospitalization_id,
    race as race_name, 
    race as ethnicity_name,
    admittime as admittime
FROM mimic_admissions
"""
race_ethn = duckdb.query(query).df()
# Translate the raw race string into its race and ethnicity categories.
race_ethn["race_category"] = race_ethn["race_name"].map(race_mapper_dict)
race_ethn["ethnicity_category"] = race_ethn["ethnicity_name"].map(ethnicity_mapper_dict)
# Step 2: add true_noninfo = 1 for admissions where both the race category and
# the ethnicity category are 'Other' or 'Unknown', 0 otherwise.
query = """
SELECT 
    patient_id,
    hospitalization_id,
    race_name,
    race_category,
    ethnicity_name,
    ethnicity_category,
    admittime,
    CASE
        WHEN (race_category IN ('Other', 'Unknown')) AND (ethnicity_category IN ('Other', 'Unknown')) THEN 1
        ELSE 0
    END AS true_noninfo
FROM race_ethn
"""
# The query reads the `race_ethn` DataFrame and the result replaces it.
race_ethn = duckdb.query(query).df()
race_ethn

Unnamed: 0,patient_id,hospitalization_id,race_name,race_category,ethnicity_name,ethnicity_category,admittime,true_noninfo
0,10000032,22595853,WHITE,White,WHITE,Non-Hispanic,2180-05-06 22:23:00,0
1,10000032,22841357,WHITE,White,WHITE,Non-Hispanic,2180-06-26 18:27:00,0
2,10000032,25742920,WHITE,White,WHITE,Non-Hispanic,2180-08-05 23:44:00,0
3,10000032,29079034,WHITE,White,WHITE,Non-Hispanic,2180-07-23 12:35:00,0
4,10000068,25022803,WHITE,White,WHITE,Non-Hispanic,2160-03-03 23:16:00,0
...,...,...,...,...,...,...,...,...
546023,19999828,25744818,WHITE,White,WHITE,Non-Hispanic,2149-01-08 16:44:00,0
546024,19999828,29734428,WHITE,White,WHITE,Non-Hispanic,2147-07-18 16:23:00,0
546025,19999840,21033226,WHITE,White,WHITE,Non-Hispanic,2164-09-10 13:47:00,0
546026,19999840,26071774,WHITE,White,WHITE,Non-Hispanic,2164-07-25 00:27:00,0


In [6]:
# Patients used to try out the race/ethnicity ranking logic on a small subset.
test_patient_ids = [
    10005236,
    10016673,
    10017492,
    10006513,
    10304619,
    10617011,
    10360391,
    10375831,
    11066451,
]
race_ethn_test = race_ethn.loc[race_ethn["patient_id"].isin(test_patient_ids)]
race_ethn_test

Unnamed: 0,patient_id,hospitalization_id,race_name,race_category,ethnicity_name,ethnicity_category,admittime,true_noninfo
324,10005236,23384508,WHITE - OTHER EUROPEAN,White,WHITE - OTHER EUROPEAN,Non-Hispanic,2180-06-26 20:22:00,0
325,10005236,25656545,UNKNOWN,Unknown,UNKNOWN,Unknown,2177-11-29 11:30:00,1
412,10006513,28504108,HISPANIC/LATINO - DOMINICAN,Other,HISPANIC/LATINO - DOMINICAN,Hispanic,2125-05-06 11:22:00,0
413,10006513,29846618,OTHER,Other,OTHER,Unknown,2127-03-27 14:52:00,1
955,10016673,29103261,HISPANIC/LATINO - MEXICAN,Other,HISPANIC/LATINO - MEXICAN,Hispanic,2188-03-18 18:00:00,0
985,10017492,27417763,PATIENT DECLINED TO ANSWER,Unknown,PATIENT DECLINED TO ANSWER,Unknown,2116-06-26 18:25:00,1
986,10017492,27672872,PATIENT DECLINED TO ANSWER,Unknown,PATIENT DECLINED TO ANSWER,Unknown,2114-03-19 20:05:00,1
15902,10304619,21512114,HISPANIC OR LATINO,Other,HISPANIC OR LATINO,Hispanic,2161-06-19 00:47:00,0
15903,10304619,23891877,HISPANIC/LATINO - PUERTO RICAN,Other,HISPANIC/LATINO - PUERTO RICAN,Hispanic,2171-01-19 01:04:00,0
15904,10304619,24179237,HISPANIC/LATINO - PUERTO RICAN,Other,HISPANIC/LATINO - PUERTO RICAN,Hispanic,2174-06-17 06:37:00,0


In [7]:
# Group the test admissions by patient and race/ethnicity combination, then
# rank the combinations within each patient: most frequent first, then
# informative (true_noninfo = 0) before non-informative, then the most
# recently recorded.
query = """
SELECT 
    patient_id, 
    race_name,
    race_category,
    ethnicity_name,
    ethnicity_category,
    COUNT(*) AS count,
    MAX(admittime) AS most_recent,
    true_noninfo,
    ROW_NUMBER() OVER (
        PARTITION BY patient_id 
        ORDER BY 
            count DESC, 
            true_noninfo,
            most_recent DESC
            ) 
        AS rn
FROM race_ethn_test
GROUP BY patient_id, race_name, race_category, ethnicity_name, ethnicity_category, true_noninfo
"""
race_ethn_ranked_test = duckdb.query(query).df()
race_ethn_ranked_test

Unnamed: 0,patient_id,race_name,race_category,ethnicity_name,ethnicity_category,count,most_recent,true_noninfo,rn
0,10360391,BLACK/AFRICAN AMERICAN,Black or African American,BLACK/AFRICAN AMERICAN,Non-Hispanic,3,2204-04-08 06:08:00,0,1
1,10360391,WHITE,White,WHITE,Non-Hispanic,1,2194-04-20 07:15:00,0,2
2,10375831,ASIAN - ASIAN INDIAN,Asian,ASIAN - ASIAN INDIAN,Non-Hispanic,3,2134-07-02 17:48:00,0,1
3,10375831,OTHER,Other,OTHER,Unknown,1,2130-01-01 16:58:00,1,2
4,11066451,WHITE,White,WHITE,Non-Hispanic,1,2191-11-19 14:32:00,0,1
5,11066451,UNKNOWN,Unknown,UNKNOWN,Unknown,1,2189-10-10 21:38:00,1,2
6,10304619,HISPANIC OR LATINO,Other,HISPANIC OR LATINO,Hispanic,1,2161-06-19 00:47:00,0,2
7,10617011,HISPANIC/LATINO - PUERTO RICAN,Other,HISPANIC/LATINO - PUERTO RICAN,Hispanic,1,2157-06-14 21:33:00,0,1
8,10617011,HISPANIC OR LATINO,Other,HISPANIC OR LATINO,Hispanic,1,2154-05-01 03:59:00,0,2
9,10017492,PATIENT DECLINED TO ANSWER,Unknown,PATIENT DECLINED TO ANSWER,Unknown,2,2116-06-26 18:25:00,1,1


In [8]:
# Keep only the top-ranked (rn = 1) race/ethnicity combination for each patient.
query_new = """
SELECT 
    patient_id,
    race_name,
    race_category,
    ethnicity_name,
    ethnicity_category,
    rn
FROM race_ethn_ranked_test
WHERE rn = 1
"""
# NOTE(review): `df2` is also assigned by the language section earlier in the
# notebook; this assignment overwrites that value.
df2 = duckdb.query(query_new).df()
df2

Unnamed: 0,patient_id,race_name,race_category,ethnicity_name,ethnicity_category,rn
0,10360391,BLACK/AFRICAN AMERICAN,Black or African American,BLACK/AFRICAN AMERICAN,Non-Hispanic,1
1,10375831,ASIAN - ASIAN INDIAN,Asian,ASIAN - ASIAN INDIAN,Non-Hispanic,1
2,11066451,WHITE,White,WHITE,Non-Hispanic,1
3,10617011,HISPANIC/LATINO - PUERTO RICAN,Other,HISPANIC/LATINO - PUERTO RICAN,Hispanic,1
4,10017492,PATIENT DECLINED TO ANSWER,Unknown,PATIENT DECLINED TO ANSWER,Unknown,1
5,10005236,WHITE - OTHER EUROPEAN,White,WHITE - OTHER EUROPEAN,Non-Hispanic,1
6,10016673,HISPANIC/LATINO - MEXICAN,Other,HISPANIC/LATINO - MEXICAN,Hispanic,1
7,10006513,HISPANIC/LATINO - DOMINICAN,Other,HISPANIC/LATINO - DOMINICAN,Hispanic,1
8,10304619,HISPANIC/LATINO - PUERTO RICAN,Other,HISPANIC/LATINO - PUERTO RICAN,Hispanic,1


### one patient who died twice

In [1]:
# NOTE(review): `death` is not defined anywhere in the visible notebook, so this
# cell raises NameError (as recorded in its output) — load or build the death
# table before running it.
death_test = death[death["patient_id"] == 19931581]

NameError: name 'death' is not defined

# Validate

In [9]:
# Display the raw MIMIC patients table loaded earlier, for reference.
mimic_patients

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000058,F,33,2168,2020 - 2022,
3,10000068,F,19,2160,2008 - 2010,
4,10000084,M,72,2160,2017 - 2019,2161-02-13
...,...,...,...,...,...,...
364622,19999828,F,46,2147,2017 - 2019,
364623,19999829,F,28,2186,2008 - 2010,
364624,19999840,M,58,2164,2008 - 2010,2164-09-17
364625,19999914,F,49,2158,2017 - 2019,


In [7]:
# Load the previously written patient table from its parquet file.
clif_patient = pd.read_parquet(path=clif_table_pathfinder("patient"))

In [8]:
# Tally every distinct combination of race/ethnicity names and categories;
# dropna=False keeps combinations that contain missing values.
clif_patient.value_counts(
    subset=[
        "race_name",
        "race_category",
        "ethnicity_name",
        "ethnicity_category",
    ],
    dropna=False,
)

race_name                                  race_category                              ethnicity_name                             ethnicity_category
NaN                                        NaN                                        NaN                                        NaN                   141175
WHITE                                      White                                      WHITE                                      Non-Hispanic          139118
BLACK/AFRICAN AMERICAN                     Black or African American                  BLACK/AFRICAN AMERICAN                     Non-Hispanic           23552
UNKNOWN                                    Unknown                                    UNKNOWN                                    Unknown                 9630
OTHER                                      Other                                      OTHER                                      Unknown                 8754
WHITE - OTHER EUROPEAN                     White              

In [15]:
# Frequency of each raw race value across admissions, missing values included.
mimic_admissions.value_counts(subset="race", dropna=False)

race
WHITE                                        336538
BLACK/AFRICAN AMERICAN                        75482
OTHER                                         19788
WHITE - OTHER EUROPEAN                        13972
UNKNOWN                                       13870
HISPANIC/LATINO - PUERTO RICAN                10903
HISPANIC OR LATINO                             8287
ASIAN                                          7809
ASIAN - CHINESE                                7644
WHITE - RUSSIAN                                6597
BLACK/CAPE VERDEAN                             6205
HISPANIC/LATINO - DOMINICAN                    6070
BLACK/CARIBBEAN ISLAND                         3875
BLACK/AFRICAN                                  3495
UNABLE TO OBTAIN                               3478
PATIENT DECLINED TO ANSWER                     2162
PORTUGUESE                                     2082
ASIAN - SOUTH EAST ASIAN                       1973
WHITE - EASTERN EUROPEAN                       1886
HISPANI

In [43]:
# Import the patient ETL module and reload it so that edits made to
# src/tables/patient.py are picked up without restarting the kernel.
import src.tables.patient as patient
reload(patient)

2025-05-12 12:41:07,766 - INFO - initialized logging at logs/etl.log


<module 'src.tables.patient' from '/Users/wliao0504/code/clif/CLIF-MIMIC/src/tables/patient.py'>

In [None]:
# NOTE(review): this cell has no execution count, so it was not run in the saved
# session; what `_test()` returns is not visible from this notebook.
out = patient._test()

In [44]:
# Run the patient ETL. Per the log recorded with this cell, it processes four
# components (race/ethnicity, sex/gender, death, language), merges them and
# saves the patient table as a parquet file.
patient._main()

2025-05-12 12:41:10,806 - INFO - starting to build clif patient table -- 
2025-05-12 12:41:10,808 - INFO - initialized logging at logs/etl.log
2025-05-12 12:41:10,818 - INFO - fetching and processing the second component of the patient table: race and ethnicity data...
2025-05-12 12:41:11,068 - INFO - fetching and processing the first component of the patient table: sex/gender data...
2025-05-12 12:41:11,087 - INFO - fetching and processing the third component: death data...
2025-05-12 12:41:11,097 - INFO - fetching and processing the fourth component: language data...
2025-05-12 12:41:11,116 - INFO - merging the four components...
2025-05-12 12:41:11,335 - INFO - saving patient rclif table as a parquet file at /Users/wliao0504/code/clif/CLIF-MIMIC/src/../output/rclif-dev-test/clif_patient.parquet.
2025-05-12 12:41:11,439 - INFO - output saved to a parquet file, everything completed for the patient table!


In [47]:
# Read the saved patient table back and preview its first five rows.
clif_patient = read_from_rclif("patient")
clif_patient.head(n=5)

Unnamed: 0,patient_id,race_name,race_category,ethnicity_name,ethnicity_category,sex_name,sex_category,birth_date,death_dttm,language_name,language_category
0,18433744,HISPANIC/LATINO - SALVADORAN,Other,HISPANIC/LATINO - SALVADORAN,Hispanic,M,Male,NaT,NaT,Spanish,Spanish
1,18433796,WHITE - RUSSIAN,White,WHITE - RUSSIAN,Non-Hispanic,F,Female,NaT,NaT,English,English
2,18433806,BLACK/AFRICAN AMERICAN,Black or African American,BLACK/AFRICAN AMERICAN,Non-Hispanic,M,Male,NaT,NaT,English,English
3,18433819,WHITE,White,WHITE,Non-Hispanic,M,Male,NaT,NaT,English,English
4,18433862,WHITE,White,WHITE,Non-Hispanic,F,Female,NaT,NaT,English,English
