In [2]:
import pandas as pd
from copy import deepcopy

# Load the two pneumonia CSVs; in both files the first column is used as the
# DataFrame index.
patients_pn = pd.read_csv(
    "pneumonia/patients_pn.csv",
    index_col=0,
)
patients_admission = pd.read_csv(
    "pneumonia/outcome_admission.csv",
    index_col=0,
)

### 基本信息


In [9]:
# Demographic features: one row per admission with numeric gender and the age
# at admission.
patients = pd.read_csv("mimic-iv-2/hosp/patients.csv")
patients_demo = patients_admission[["subject_id", "hadm_id", "admittime"]].merge(
    patients)

# Age at admission = admission year - (anchor_year - anchor_age).
patients_demo['adm_year'] = pd.to_datetime(patients_demo['admittime']).dt.year
patients_demo["adm_age"] = (
    patients_demo["adm_year"]
    - (patients_demo["anchor_year"] - patients_demo["anchor_age"])
)

# Recode gender in place: "M" -> 1, then "F" -> 0; any other value is left as is.
for gender_label, gender_code in (("M", 1), ("F", 0)):
    patients_demo.loc[patients_demo["gender"] == gender_label, "gender"] = gender_code

demo_columns = ["subject_id", "hadm_id", 'gender', 'adm_age']
patients_demo[demo_columns].drop_duplicates().to_csv("pneumonia/feature_demo.csv")

### 既往史


In [15]:
# Comorbidity features: one 0/1 flag per comorbidity group and admission,
# derived from the ICD codes recorded for that admission.
diagnoses = pd.read_csv("mimic-iv-2/hosp/diagnoses_icd.csv", header=0)
full_diag = diagnoses.loc[diagnoses["hadm_id"].isin(
    patients_admission["hadm_id"])].reset_index(drop=True)

# Comorbidity name -> regular expression applied to `icd_code`.
# Raw strings are used so that `\d` reaches the regex engine as written instead
# of being treated as an (invalid) Python string escape. `Series.str.match`
# anchors each alternative at the start of the code only.
# NOTE(review): the patterns are applied regardless of `icd_version`.
combids = {
    "Congestive heart failure": r"428\d|I50\d",
    # NOTE(review): `I5[4-9]` appears to match no assigned ICD-10 category
    # (cardiac arrhythmias are I47-I49) - confirm whether `I4[4-9]\d` was meant.
    "Cardiacarrhythmias": r"427\d|I5[4-9]\d",
    "Coronary artery atherosclerosis": r"414\d|I25\d",
    "Pulmonarycirculation": r"41[5-7]\d|I2[6-8]\d",
    "Hypertention": r"40\d|I1\d",
    "Chronicpulmonary": r"49\d|J4[0-7]\d",
    "Heptic disease": r"57[1-3]\d|K7[0-6]\d",
    "Renal diseases": r"58[2-6]\d|593\d|N0\d|N1[7-9]\d",
    "Blood abnormal": r"286\d|D6[5-8]\d",
    "Diabetes": r"250\d|E1[0-4]\d",
    "Neuro": r"780\d|R40\d",
    "Immunity suppression": r"279\d|D80\d|279\d|D84\d|042\d|B20\d",
    # "Sepsis":"995\d|038\d|A4[0-1]\d|R65",
}

# One indicator column per comorbidity. `na=False` treats a missing icd_code as
# "no match" instead of producing a NaN mask.
for coms, icd in combids.items():
    full_diag[coms] = full_diag["icd_code"].str.match(icd, na=False).astype(int)

# Collapse to one row per admission: a flag is 1 if any of the admission's
# diagnosis rows matched.
full_diag = full_diag.drop(["seq_num", "icd_code", "icd_version"], axis=1).groupby(
    ['hadm_id']).max().reset_index()
full_diag.drop_duplicates().to_csv("pneumonia/feature_history.csv")
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10, 6))
# plt.bar(combids.keys(), full_diag[combids.keys()].mean())
# plt.xlabel('Categories')
# plt.ylabel('Probabilities')
# plt.title('Probabilities of Medical Categories')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

### 体格检查


In [27]:
# Physical-examination features: first-day weight, urine output and vital signs
# joined onto the admission keys.
weight = pd.read_csv("concepts/firstday_lab/first_day_weight.csv")
urine = pd.read_csv("concepts/firstday_lab/first_day_urine_output.csv")
vitalsign = pd.read_csv("concepts/firstday_lab/first_day_vitalsign.csv")

# Each merge is an inner join on whatever columns the two frames share.
phyexam = patients_admission[["subject_id", "hadm_id"]]
for first_day_table in (weight, urine, vitalsign):
    phyexam = phyexam.merge(first_day_table)

phyexam.drop_duplicates().to_csv("outcome_vitalsign_icu.csv")

### 实验室指标


In [2]:
import os
from copy import deepcopy
# Laboratory features: for every admission, the earliest available value of each
# lab measurement charted within the window around admission.
lab_dfs = []

# Read every measurement table in the input directory. Entries are visited in
# sorted order so the concatenated row order is reproducible, and
# sub-directories / hidden entries (e.g. `.ipynb_checkpoints`) are skipped
# instead of being handed to `read_csv`.
input_dir = "concepts/measurement/"
for fname in sorted(os.listdir(input_dir)):
    fpath = os.path.join(input_dir, fname)
    if fname.startswith(".") or not os.path.isfile(fpath):
        continue
    lab_dfs.append(pd.read_csv(fpath))
lab_df = pd.concat(lab_dfs, axis=0, ignore_index=True)
lab_df = pd.merge(patients_admission, lab_df)
lab_df['admittime'] = pd.to_datetime(lab_df['admittime'])
lab_df['charttime'] = pd.to_datetime(
    lab_df['charttime'], format="%d/%m/%Y %H:%M:%S")
# `lab_hour` holds the full timedelta since admission (used as a sort key);
# `lab_day` is its whole-day component (floored, so it is negative before
# admission).
lab_df["lab_hour"] = (lab_df["charttime"] - lab_df["admittime"])
lab_df["lab_day"] = (lab_df["charttime"] - lab_df["admittime"]).dt.days

# Keep lab results with -3 <= lab_day <= 3. `.copy()` makes `labs` an
# independent frame so the assignments below do not write into a slice of
# `lab_df`.
labs = lab_df.loc[(lab_df["lab_day"] <= 3) & (lab_df["lab_day"] >= -3)].copy()
lab_columns = ['troponin_t', 'ck_mb', 'ntprobnp', 'albumin',
               'globulin', 'total_protein', 'aniongap', 'bicarbonate', 'bun',
               'calcium', 'chloride', 'creatinine', 'glucose', 'sodium', 'potassium',
               'so2', 'po2', 'pco2', 'fio2_chartevents', 'fio2', 'aado2', 'aado2_calc',
               'pao2fio2ratio', 'ph', 'baseexcess', 'totalco2', 'hematocrit',
               'hemoglobin', 'carboxyhemoglobin', 'methemoglobin', 'temperature',
               'lactate', 'crp', 'wbc', 'basophils_abs', 'eosinophils_abs',
               'lymphocytes_abs', 'monocytes_abs', 'neutrophils_abs', 'basophils',
               'eosinophils', 'lymphocytes', 'monocytes', 'neutrophils',
               'atypical_lymphocytes', 'bands', 'immature_granulocytes',
               'metamyelocytes', 'nrbc', 'mch', 'mchc', 'mcv', 'platelet', 'rbc',
               'rdw', 'rdwsd', 'd_dimer', 'fibrinogen', 'thrombin', 'inr', 'pt', 'ptt',
               'alt', 'alp', 'ast', 'amylase', 'bilirubin_total', 'bilirubin_direct',
               'bilirubin_indirect', 'ck_cpk', 'ggt', 'ld_ldh']

# Order rows chronologically within each patient / admission.
labs = labs.sort_values(by=['subject_id', 'hadm_id', 'lab_day', 'lab_hour'])

# Forward-fill every lab column within each admission. A single grouped ffill
# replaces the per-admission Python loop, which rescanned the whole frame once
# per hadm_id.
labs[lab_columns] = labs.groupby("hadm_id")[lab_columns].ffill()

# One row per admission: `first()` takes the first non-null value of each
# column in the sorted order.
labs0 = labs.drop(['admittime', 'dischtime', 'charttime', 'specimen_id', 'specimen'],
                  axis=1).groupby(['subject_id', 'hadm_id']).first().reset_index()

labs0.drop_duplicates().to_csv("pneumonia/feature_labs.csv")

### 影像学


In [71]:
# Imaging features: CheXpert labels of the earliest chest X-ray study taken
# within 3 calendar days before/after admission, one row per admission.
cxr_time = pd.read_csv(
    "../MIMIC-IV-CXR/mimic-iv-cxr-2/mimic-cxr-2.0.0-metadata.csv")
cxr_time = cxr_time[["subject_id", "study_id", "StudyDate"]].drop_duplicates()
cxr_che = pd.read_csv(
    "../MIMIC-IV-CXR/mimic-iv-cxr-2/mimic-cxr-2.0.0-chexpert.csv")
cxr_data = pd.merge(cxr_time, cxr_che)

cxr_df = pd.merge(
    patients_admission[["subject_id", "hadm_id", "admittime"]], cxr_data)
cxr_df['admittime'] = pd.to_datetime(cxr_df['admittime'])
cxr_df['StudyDate'] = pd.to_datetime(cxr_df['StudyDate'], format="%Y%m%d")
# StudyDate is parsed from a date-only value (midnight), whereas admittime may
# carry a time of day. Subtracting the raw timestamps would floor a study taken
# on the admission date to day -1, so admittime is truncated to midnight and
# cxr_day becomes a calendar-day difference.
cxr_df["cxr_day"] = (
    cxr_df["StudyDate"] - cxr_df["admittime"].dt.normalize()).dt.days

cxr_df = cxr_df.drop(
    ['subject_id', 'admittime', 'study_id', 'StudyDate'], axis=1)
# Average the label columns over all studies of the same admission and day.
cxr_df = cxr_df.groupby(['hadm_id', 'cxr_day']).mean().reset_index()
cxr_df = cxr_df.loc[(cxr_df["cxr_day"] <= 3) & (cxr_df["cxr_day"] >= -3)]
cxr_df = cxr_df.fillna(0)
cxr_df = cxr_df.sort_values(['hadm_id', 'cxr_day'])
# Group by hadm_id and keep the row with the smallest cxr_day (NaNs were filled
# above, so `first()` returns exactly the first row of each group).
cxr_df = cxr_df.groupby('hadm_id').first().reset_index()
cxr_df.drop_duplicates().to_csv("pneumonia/feature_cxr.csv")

### 感染类型


In [25]:
# Pathogen category -> list of diagnosis long titles belonging to that category.
# The dictionary is bound to `bact_type`, the name the relabelling code reads;
# it was previously bound only to the misspelled `bact_tyep`, which made that
# lookup fail with a NameError.
bact_type = {"MRSA": ['Methicillin resistant pneumonia due to Staphylococcus aureus', 'Pneumonia due to Methicillin resistant Staphylococcus aureus'],
             "MSSA": ['Methicillin susceptible pneumonia due to Staphylococcus aureus', 'Pneumonia due to Methicillin susceptible Staphylococcus aureus'],
             "Staphylococcus": ['Pneumonia due to Staphylococcus, unspecified', 'Pneumonia due to other staphylococcus', 'Pneumonia due to staphylococcus, unspecified', 'Other Staphylococcus pneumonia'],
             "Streptococcus": ['Pneumonia due to Streptococcus, group A', 'Pneumonia due to Streptococcus, group B', 'Pneumonia due to streptococcus, group B', 'Pneumonia due to Streptococcus, unspecified', 'Pneumonia due to other Streptococcus', 'Pneumonia due to other streptococci'],
             "Escherichia coli": ['Pneumonia due to Escherichia coli', 'Pneumonia due to escherichia coli [E. coli]'],
             "Hemophilus influenzae": ['Pneumonia due to Hemophilus influenzae [H. influenzae]'],
             "Klebsiella pneumoniae": ['Pneumonia due to Klebsiella pneumoniae'],
             "Legionnaires": ["Pneumonia due to Legionnaires' disease"],
             "Pseudomonas": ['Pneumonia due to Pseudomonas'],
             "anaerobes": ['Pneumonia due to anaerobes'],
             "Mycoplasma": ['Pneumonia due to Mycoplasma pneumoniae', 'Pneumonia due to mycoplasma pneumoniae'],
             "other G-": ['Pneumonia due to other Gram-negative bacteria', 'Pneumonia due to other gram-negative bacteria'],
             "unspecified BACT": ['Bacterial pneumonia, unspecified', 'Pneumonia due to other specified bacteria', 'Pneumonia due to other specified organism', 'Unspecified bacterial pneumonia']}
# Backward-compatible alias for code that still uses the old misspelled name.
bact_tyep = bact_type

# Viral category -> list of diagnosis long titles belonging to that category.
virus_type = {"Adenoviral": ['Adenoviral pneumonia', 'Pneumonia due to adenovirus'], "metapneumovirus": ['Human metapneumovirus pneumonia'],
              "Parainfluenza": ['Parainfluenza virus pneumonia', 'Pneumonia due to parainfluenza virus'],
              "syncytial": ['Pneumonia due to respiratory syncytial virus', 'Respiratory syncytial virus pneumonia'],
              "H1N1": ['Influenza due to identified 2009 H1N1 influenza virus with pneumonia', 'Influenza due to identified novel H1N1 influenza virus'],
              "avian": ['Influenza due to identified avian influenza virus', 'Influenza due to identified avian influenza virus with other manifestations', 'Influenza due to identified avian influenza virus with other respiratory manifestations', 'Influenza due to identified avian influenza virus with pneumonia'],
              "influenza A": ['Influenza due to identified novel influenza A virus with other manifestations', 'Influenza due to identified novel influenza A virus with other respiratory manifestations', 'Influenza due to identified novel influenza A virus with pneumonia'],
              "other influenz": ['Influenza due to other identified influenza virus with encephalopathy', 'Influenza due to other identified influenza virus with gastrointestinal manifestations', 'Influenza due to other identified influenza virus with other manifestations', 'Influenza due to other identified influenza virus with other respiratory manifestations', 'Influenza due to other identified influenza virus with other specified pneumonia', 'Influenza due to other identified influenza virus with the same other identified influenza virus pneumonia', 'Influenza due to other identified influenza virus with unspecified type of pneumonia', 'Influenza due to unidentified influenza virus with encephalopathy', 'Influenza due to unidentified influenza virus with gastrointestinal manifestations', 'Influenza due to unidentified influenza virus with other manifestations', 'Influenza due to unidentified influenza virus with other respiratory manifestations', 'Influenza due to unidentified influenza virus with specified pneumonia', 'Influenza due to unidentified influenza virus with unspecified type of pneumonia', 'Influenza with other manifestations', 'Influenza with other respiratory manifestations', 'Influenza with pneumonia'],
              "other virus": ['Other viral pneumonia', 'Viral pneumonia, unspecified', 'Pneumonia due to other virus not elsewhere classified']}

In [None]:
# Infection-type features: one 0/1 column per pathogen category and admission.
diagnoses_pn = pd.read_csv("pneumonia/diagnoses_pn.csv", index_col=0)

# Flatten both category dictionaries into one long-title -> category lookup.
# Bacterial categories are visited first and the first category listing a title
# wins, which matches the order of the row-by-row relabelling this replaces.
title_to_category = {}
for category_map in (bact_type, virus_type):
    for category, titles in category_map.items():
        for title in titles:
            title_to_category.setdefault(title, category)

# Titles that belong to no category keep their original text.
diagnoses_pn["long_title"] = diagnoses_pn["long_title"].map(
    lambda title: title_to_category.get(title, title))

# One indicator column per distinct label. Iterating in sorted order keeps the
# column order of the output file identical between runs (iterating a `set` of
# strings does not guarantee that); missing titles get no column of their own.
for coms in sorted(diagnoses_pn["long_title"].dropna().unique()):
    diagnoses_pn[coms] = (diagnoses_pn["long_title"] == coms).astype(int)

# Collapse to one row per admission: a flag is 1 if any diagnosis row of the
# admission carried that label.
diagnoses_pn = diagnoses_pn.drop(["seq_num", "icd_code", "icd_version", "mixed_infection",
                                 "long_title"], axis=1).groupby(['hadm_id']).max().reset_index()
diagnoses_pn.drop_duplicates().to_csv("pneumonia/feature_infectiontype.csv")

### SOFA


In [8]:
# Attach the SOFA table to the ICU admission keys; the merge is an inner join
# on the columns the two frames have in common.
sofa = pd.read_csv("concepts/sepsis/sofa.csv")
admission_icu = pd.read_csv(
    "pneumonia/outcome_admission_icu.csv",
    index_col=0,
)
sofa = pd.merge(
    left=admission_icu[["subject_id", "hadm_id", "stay_id"]],
    right=sofa,
)