In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from dateutil.relativedelta import relativedelta
from faker import Faker
import random
from transformers import set_seed
from sklearn.model_selection import train_test_split

# Set the random seed
random.seed(42)
set_seed(42)

# Set the style for the plots
sns.set_style("whitegrid")

# Initialize the Faker library
fake = Faker()
Faker.seed(42)

TRAIN_RATIO = 0.9
PAD_TOKEN = "<PAD>"
UNKNOWN_TOKEN = "<UNK>"

ID_COLUMN = "_id"
TEXT_COLUMN = "text"
TARGET_COLUMN = "target"
SUBJECT_ID_COLUMN = "subject_id"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
mimic_notes = pd.read_csv("/home/michele/physionet.org/files/mimic-iv-note/2.2/note/discharge.csv.gz",compression='gzip')
mimic_proc = pd.read_csv("/home/michele/physionet.org/files/mimiciv/2.2/hosp/procedures_icd.csv.gz",compression='gzip')
mimic_diag = pd.read_csv("/home/michele/physionet.org/files/mimiciv/2.2/hosp/diagnoses_icd.csv.gz",compression='gzip')
procedures = pd.read_csv("/home/michele/physionet.org/files/mimiciv/2.2/hosp/d_icd_procedures.csv.gz",compression='gzip')
diagnoses = pd.read_csv("/home/michele/physionet.org/files/mimiciv/2.2/hosp/d_icd_diagnoses.csv.gz",compression='gzip')



In [3]:
print(mimic_notes.columns)
print(mimic_proc.columns)
print(mimic_diag.columns)
print(procedures.columns)
print(diagnoses.columns)

Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
       'charttime', 'storetime', 'text'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'seq_num', 'chartdate', 'icd_code',
       'icd_version'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'], dtype='object')
Index(['icd_code', 'icd_version', 'long_title'], dtype='object')
Index(['icd_code', 'icd_version', 'long_title'], dtype='object')


In [4]:
#merge mimic_proc and diagnoses
mimic_proc = mimic_proc.merge(procedures, how='inner', on=['icd_code','icd_version'])
print(mimic_proc.columns)
#merge mimic_diag and diagnoses
mimic_diag = mimic_diag.merge(diagnoses, how='inner', on=['icd_code','icd_version'])
print(mimic_diag.columns)

Index(['subject_id', 'hadm_id', 'seq_num', 'chartdate', 'icd_code',
       'icd_version', 'long_title'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version',
       'long_title'],
      dtype='object')


In [5]:

def reformat_icd10(code: str, is_diag: bool) -> str:
    """
    Put a period in the right place because the MIMIC-3 data files exclude them.
    Generally, procedure codes have dots after the first two digits,
    while diagnosis codes have dots after the first three digits.
    """
    code = "".join(code.split("."))
    if not is_diag:
        return code
    return code[:3] + "." + code[3:]


def reformat_icd9(code: str, is_diag: bool) -> str:
    """
    Put a period in the right place because the MIMIC-3 data files exclude them.
    Generally, procedure codes have dots after the first two digits,
    while diagnosis codes have dots after the first three digits.
    """
    code = "".join(code.split("."))
    if is_diag:
        if code.startswith("E"):
            if len(code) > 4:
                return code[:4] + "." + code[4:]
        else:
            if len(code) > 3:
                return code[:3] + "." + code[3:]
    else:
        if len(code) > 2:
            return code[:2] + "." + code[2:]
    return code

def reformat_icd(code: str, version: int, is_diag: bool) -> str:
    """format icd code depending on version"""
    if version == 9:
        return reformat_icd9(code, is_diag)
    elif version == 10:
        return reformat_icd10(code, is_diag)
    else:
        raise ValueError("version must be 9 or 10")

# Format the codes by adding decimal points
mimic_proc["icd_code"] = mimic_proc.apply(
    lambda row: reformat_icd(
        code=row["icd_code"], version=row["icd_version"], is_diag=False
    ),
    axis=1,
)
mimic_diag["icd_code"] = mimic_diag.apply(
    lambda row: reformat_icd(
        code=row["icd_code"], version=row["icd_version"], is_diag=True
    ),
    axis=1,
)


In [6]:
from functools import partial
def sort_by_indexes(lst, indexes, reverse=False):
  return [val for (_, val) in sorted(zip(indexes, lst), key=lambda x: \
          x[0], reverse=reverse)]

def reformat_code_dataframe(row: pd.DataFrame, cols: list) -> pd.Series:
    """Takes a dataframe and a column name and returns a series with the column name and a list of codes.

    Example:
        Input:

                subject_id  _id     icd9_diag
        608           2   163353     V3001
        609           2   163353      V053
        610           2   163353      V290

        Output:

        icd9_diag    [V053, V290, V3001]

    Args:
        row (pd.DataFrame): Dataframe with a column of codes.
        col (str): column name of the codes.

    Returns:
        pd.Series: Series with the column name and a list of codes.
    """
    out = dict()
    
    # Sort the first column and rearrange the second column accordingly
    sorted_indices = row[cols[0]].argsort()
    out[cols[0]] = sort_by_indexes(row[cols[0]], sorted_indices)
    out[cols[1]] = sort_by_indexes(row[cols[1]], sorted_indices)

    return pd.Series(out)
    

    return out

def parse_codes_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Parse the codes dataframe"""
    df = df.rename(columns={"hadm_id": ID_COLUMN, "subject_id": SUBJECT_ID_COLUMN})
    df = df.dropna(subset=["icd_code"])
    df = df.drop_duplicates(subset=[ID_COLUMN, "icd_code"])
    df = (
        df.groupby([SUBJECT_ID_COLUMN, ID_COLUMN, "icd_version"])
        .apply(partial(reformat_code_dataframe, cols=["icd_code","long_title"]))
        .reset_index()
    )
    return df

def parse_notes_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Parse the notes dataframe"""
    df = df.rename(
        columns={
            "hadm_id": ID_COLUMN,
            "subject_id": SUBJECT_ID_COLUMN,
            "text": TEXT_COLUMN,
        }
    )
    df = df.dropna(subset=[TEXT_COLUMN])
    df = df.drop_duplicates(subset=[ID_COLUMN, TEXT_COLUMN])
    return df

In [7]:
# Process codes and notes
mimic_proc = parse_codes_dataframe(mimic_proc)
mimic_diag = parse_codes_dataframe(mimic_diag)
mimic_notes = parse_notes_dataframe(mimic_notes)

  .apply(partial(reformat_code_dataframe, cols=["icd_code","long_title"]))
  .apply(partial(reformat_code_dataframe, cols=["icd_code","long_title"]))


In [8]:
mimic_proc.head(20)

Unnamed: 0,subject_id,_id,icd_version,icd_code,long_title
0,10000032,22595853,9,[54.91],[Percutaneous abdominal drainage]
1,10000032,22841357,9,[54.91],[Percutaneous abdominal drainage]
2,10000032,25742920,9,[54.91],[Percutaneous abdominal drainage]
3,10000068,25022803,9,[89.38],[Other nonoperative respiratory measurements]
4,10000117,27988844,10,[0QS734Z],[Reposition Left Upper Femur with Internal Fix...
5,10000280,25852320,9,[89.38],[Other nonoperative respiratory measurements]
6,10000560,28979390,9,[55.51],[Nephroureterectomy]
7,10000635,26134563,9,"[37.27, 37.28, 37.34]","[Cardiac mapping, Intracardiac echocardiograph..."
8,10000719,24558333,9,[75.69],[Repair of other current obstetric laceration]
9,10000826,20032235,9,[54.91],[Percutaneous abdominal drainage]


In [9]:
# Merge the codes and notes into an icd10 dataframe
mimic_proc_10 = mimic_proc[mimic_proc["icd_version"] == 10]
mimic_proc_10 = mimic_proc_10.rename(columns={"icd_code": "icd10_proc"})
mimic_diag_10 = mimic_diag[mimic_diag["icd_version"] == 10]
mimic_diag_10 = mimic_diag_10.rename(columns={"icd_code": "icd10_diag"})
mimiciv_10 = mimic_notes.merge(
    mimic_proc_10[[ID_COLUMN, "icd10_proc","long_title"]], on=ID_COLUMN, how="inner"
)
mimiciv_10 = mimiciv_10.merge(
    mimic_diag_10[[ID_COLUMN, "icd10_diag","long_title"]], on=ID_COLUMN, how="inner"
)

In [10]:
mimiciv_10.head(20)

Unnamed: 0,note_id,subject_id,_id,note_type,note_seq,charttime,storetime,text,icd10_proc,icd10_diag
0,10000117-DS-22,10000117,27988844,DS,22,2183-09-21 00:00:00,2183-09-29 16:23:00,\nName: ___ Unit No: ___\n...,[0QS734Z],"[I34.1, G43.909, Z87.442, Z87.891, K21.9, S72...."
1,10001401-DS-17,10001401,21544441,DS,17,2131-06-15 00:00:00,2131-06-15 13:24:00,\nName: ___ Unit No: ___\n \n...,"[0UT9FZZ, 0TTB4ZZ, 0UBG4ZZ, 0T1807C, 0UT2FZZ, ...","[C67.5, E78.5, I10., E89.0, D25.9, Z87.891]"
2,10001401-DS-18,10001401,26840593,DS,18,2131-07-02 00:00:00,2131-07-08 09:02:00,\nName: ___ Unit No: ___\n \n...,"[02HV33Z, 0W9J30Z, 3E0436Z]","[Y92.89, B96.6, R78.81, I10., K65.1, C67.9, T8..."
3,10001401-DS-20,10001401,27060146,DS,20,2131-10-05 00:00:00,2131-10-05 16:11:00,\nName: ___ Unit No: ___\n \n...,"[0D9W30Z, 02HV33Z, 0DWW30Z, 0D9W3ZX]","[Y83.8, T81.4XXA, E78.5, K43.5, E03.9, K65.1, ..."
4,10001401-DS-21,10001401,28058085,DS,21,2131-11-15 00:00:00,2131-11-20 19:51:00,\nName: ___ Unit No: ___\n \n...,"[0T783DZ, 0TB63ZX, 0TB73ZX, BT14YZZ]","[E03.9, R31.0, N99.89, R82.71, N99.820, I10., ..."
5,10001401-DS-22,10001401,27012892,DS,22,2133-07-13 00:00:00,2133-07-13 17:38:00,\nName: ___ Unit No: ___\n \n...,"[0T783DZ, 02HV33Z, 0TB68ZX, 0T948ZX]","[I10., T81.40XA, Z93.6, T81.44XA, N12., N13.30..."
6,10001884-DS-31,10001884,26202981,DS,31,2130-08-23 00:00:00,2130-08-23 16:51:00,\nName: ___ Unit No: ___\n \nA...,[0DJ08ZZ],"[H40.9, K92.1, E78.5, M47.892, I25.10, J44.9, ..."
7,10001884-DS-38,10001884,26184834,DS,38,2131-01-20 00:00:00,2131-01-20 09:41:00,\nName: ___ Unit No: ___\n \nA...,"[02HV33Z, 3E0G76Z, 0BH17EZ, 02HK3JZ, 5A12012, ...","[I73.89, J01.90, R50.9, Z96.641, Z87.891, I48...."
8,10001919-DS-20,10001919,29897682,DS,20,2124-04-21 00:00:00,2124-04-21 17:30:00,\nName: ___ Unit No: ___\...,"[0DBW4ZX, 0DJ68ZZ, 0T9B80Z, 8E0W4CZ]","[C16.9, C78.6, K91.71, Y92.234, E03.9, K21.9, ..."
9,10002013-DS-12,10002013,21763296,DS,12,2165-11-26 00:00:00,2165-11-26 18:46:00,\nName: ___ Unit No: ___\n...,"[0JBR0ZZ, B211YZZ, B212YZZ, B218YZZ]","[N18.3, E11.22, E11.319, M86.9, K21.9, J44.9, ..."


In [11]:
for d in diagnoses['long_title']:
    print(d)

Cholera due to vibrio cholerae
Cholera due to vibrio cholerae el tor
Cholera, unspecified
Typhoid fever
Paratyphoid fever A
Paratyphoid fever B
Paratyphoid fever C
Paratyphoid fever, unspecified
Salmonella gastroenteritis
Salmonella septicemia
Localized salmonella infection, unspecified
Salmonella meningitis
Salmonella pneumonia
Salmonella arthritis
Salmonella osteomyelitis
Other localized salmonella infections
Other specified salmonella infections
Salmonella infection, unspecified
Shigella dysenteriae
Shigella flexneri
Shigella boydii
Shigella sonnei
Other specified shigella infections
Shigellosis, unspecified
Staphylococcal food poisoning
Botulism food poisoning
Food poisoning due to Clostridium perfringens (C. welchii)
Food poisoning due to other Clostridia
Food poisoning due to Vibrio parahaemolyticus
Food poisoning due to Vibrio vulnificus
Other bacterial food poisoning
Food poisoning, unspecified
Acute amebic dysentery without mention of abscess
Chronic intestinal amebiasis witho

In [12]:
# remove notes with no codes
mimiciv_10 = mimiciv_10.dropna(subset=["icd10_proc", "icd10_diag"], how="all")

In [13]:
import numpy as np
# convert NaNs to empty lists
mimiciv_10["icd10_proc"] = mimiciv_10["icd10_proc"].apply(
    lambda x: [] if x is np.nan else x
)
mimiciv_10["icd10_diag"] = mimiciv_10["icd10_diag"].apply(
    lambda x: [] if x is np.nan else x
)

In [14]:
from collections import Counter


def filter_codes(df: pd.DataFrame, columns: list[str], min_count: int) -> pd.DataFrame:
    """Filter the codes dataframe to only include codes that appear at least min_count times

    Args:
        df (pd.DataFrame): The codes dataframe
        col (str): The column name of the codes
        min_count (int): The minimum number of times a code must appear

    Returns:
        pd.DataFrame: The filtered codes dataframe
    """
    for col in columns:
        code_counts = Counter([code for codes in df[col] for code in codes])
        codes_to_keep = set(
            code for code, count in code_counts.items() if count >= min_count
        )
        df[col] = df[col].apply(lambda x: [code for code in x if code in codes_to_keep])
        print(f"Number of unique codes in {col} before filtering: {len(code_counts)}")
        print(f"Number of unique codes in {col} after filtering: {len(codes_to_keep)}")

    return df

MIN_TARGET_COUNT = 10

In [15]:
mimiciv_10 = filter_codes(mimiciv_10, ["icd10_proc", "icd10_diag"], MIN_TARGET_COUNT)

Number of unique codes in icd10_proc before filtering: 9941
Number of unique codes in icd10_proc after filtering: 2139
Number of unique codes in icd10_diag before filtering: 13956
Number of unique codes in icd10_diag after filtering: 4543


In [16]:
mimiciv_10.head()

Unnamed: 0,note_id,subject_id,_id,note_type,note_seq,charttime,storetime,text,icd10_proc,icd10_diag
0,10000117-DS-22,10000117,27988844,DS,22,2183-09-21 00:00:00,2183-09-29 16:23:00,\nName: ___ Unit No: ___\n...,[0QS734Z],"[I34.1, G43.909, Z87.442, Z87.891, K21.9, S72...."
1,10001401-DS-17,10001401,21544441,DS,17,2131-06-15 00:00:00,2131-06-15 13:24:00,\nName: ___ Unit No: ___\n \n...,"[0UT9FZZ, 0TTB4ZZ, 0UBG4ZZ, 0T1807C, 0UT2FZZ, ...","[C67.5, E78.5, I10., E89.0, D25.9, Z87.891]"
2,10001401-DS-18,10001401,26840593,DS,18,2131-07-02 00:00:00,2131-07-08 09:02:00,\nName: ___ Unit No: ___\n \n...,"[02HV33Z, 0W9J30Z, 3E0436Z]","[Y92.89, B96.6, R78.81, I10., K65.1, C67.9, T8..."
3,10001401-DS-20,10001401,27060146,DS,20,2131-10-05 00:00:00,2131-10-05 16:11:00,\nName: ___ Unit No: ___\n \n...,"[0D9W30Z, 02HV33Z]","[Y83.8, T81.4XXA, E78.5, K43.5, E03.9, K65.1, ..."
4,10001401-DS-21,10001401,28058085,DS,21,2131-11-15 00:00:00,2131-11-20 19:51:00,\nName: ___ Unit No: ___\n \n...,[BT14YZZ],"[E03.9, R31.0, N99.89, R82.71, N99.820, I10., ..."


In [17]:
# define target
mimiciv_10[TARGET_COLUMN] = mimiciv_10["icd10_proc"] + mimiciv_10["icd10_diag"]
mimiciv_10["long_title"] = mimiciv_10["long_title"] + mimiciv_10["long_title_x"]
# remove empty target
mimiciv_10 = mimiciv_10[mimiciv_10[TARGET_COLUMN].apply(lambda x: len(x) > 0)]
# reset index
mimiciv_10 = mimiciv_10.reset_index(drop=True)

KeyError: 'long_title'

In [None]:

for i in range(100):
    print(len(mimiciv_10['target'][i]),len(mimiciv_10['long_title'][i]))
    #print(mimiciv_10['target'][i])
    #print(mimiciv_10['long_title'][i])
    print(len(mimiciv_10["icd10_proc"][i]),len(mimiciv_10["icd10_diag"][i]))

In [None]:
# Text preprocess the notes
#mimiciv_9 = preprocess_documents(df=mimiciv_9, preprocessor=preprocessor)
#mimiciv_10 = preprocess_documents(df=mimiciv_10, preprocessor=preprocessor)

In [None]:
# save files to disk
#mimiciv_9.to_feather(output_dir_icd9 / "mimiciv_icd9.feather")
mimiciv_10.to_feather("/home/michele/dtfh_private/mimic/data/mimiciv_icd10.feather")
