### 基本信息

In [31]:
import pandas as pd
from copy import deepcopy

adm_pn = pd.read_csv("pneumonia/adm_pn.csv", index_col=0)
patients = pd.read_csv("../MIMIC-IV/mimic-iv-2/hosp/patients.csv")
patients_demo = pd.merge(adm_pn[["subject_id", "hadm_id","admittime"]].drop_duplicates(), patients)
patients_demo['adm_year'] = pd.to_datetime(patients_demo['admittime']).dt.year
patients_demo["adm_age"] = patients_demo["adm_year"]-(patients_demo["anchor_year"] - patients_demo["anchor_age"])
patients_demo.loc[patients_demo["gender"] == "M","gender"] = 1
patients_demo.loc[patients_demo["gender"] == "F","gender"] = 0

patients_demo = patients_demo[["hadm_id",'gender', 'adm_age']].drop_duplicates()
patients_demo.to_csv("pneumonia/patients_demo.csv")

### 既往史

In [8]:
import pandas as pd
import numpy as np
adm_pn = pd.read_csv("pneumonia/adm_pn.csv", index_col=0)
diagnoses = pd.read_csv("mimic-iv-2/hosp/diagnoses_icd.csv", header=0)
full_diag = diagnoses.loc[diagnoses["hadm_id"].isin(adm_pn["hadm_id"])].reset_index(drop=True)
full_diag = pd.merge(full_diag,adm_pn[['hadm_id']].drop_duplicates())
combids = {
    "Congestive heart failure":"428\d|I50\d",
    "Cardiacarrhythmias":"427\d|I5[4-9]\d",
    "Coronary artery atherosclerosis":"414\d|I25\d",
    "Pulmonarycirculation":"41[5-7]\d|I2[6-8]\d",
    "Hypertention":"40\d|I1\d",
    "Chronicpulmonary":"49\d|J4[0-7]\d",
    "Heptic disease":"57[1-3]\d|K7[0-6]\d",
    "Renal diseases":"58[2-6]\d|593\d|N0\d|N1[7-9]\d",
    "Blood abnormal":"286\d|D6[5-8]\d",
    "Diabetes":"250\d|E1[0-4]\d",
    "Neuro":"780\d|R40\d",
    "Immunity suppression":"279\d|D80\d|279\d|D84\d|042\d|B20\d",
    # "Sepsis":"995\d|038\d|A4[0-1]\d|R65",
    }

for coms,icd in combids.items():
    full_diag[coms] = 0
    full_diag.loc[full_diag["icd_code"].str.match(icd), coms] = 1
    
full_diag = full_diag.drop(["subject_id","seq_num","icd_code","icd_version","infection_type"], axis=1).groupby(['hadm_id']).max().reset_index().drop_duplicates()
full_diag.to_csv("pneumonia/patients_past.csv")
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10, 6))
# plt.bar(combids.keys(), full_diag[combids.keys()].mean())
# plt.xlabel('Categories')
# plt.ylabel('Probabilities')
# plt.title('Probabilities of Medical Categories')

# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

### 实验室指标

In [3]:
import pandas as pd
import numpy as np
import os
from copy import deepcopy
adm_pn = pd.read_csv("pneumonia/adm_pn.csv", index_col=0)
df = deepcopy(adm_pn[["hadm_id", "admittime"]].drop_duplicates())

### 将event按小时和时间分类
input_dir = "concepts/measurement/"    
for i in os.listdir(input_dir):
    mear = pd.read_csv("concepts/measurement/%s"%i)
    df = pd.merge(df, mear, how="left")
    
df['admittime'] = pd.to_datetime(df['admittime'])
df['charttime'] = pd.to_datetime(df['charttime'], format="%d/%m/%Y %H:%M:%S")

df["lab_day"] = (df["charttime"] - df["admittime"]).dt.days

In [4]:
df = df.drop(['subject_id', 'admittime', 'charttime','specimen_id','specimen'],axis=1).groupby(['hadm_id','lab_day']).mean().reset_index()
df = df.loc[(df["lab_day"]<=3) & (df["lab_day"]>=-3),df.isna().mean()<=0.9]
df = df.sort_values(['hadm_id', 'lab_day'])
# 根据 hadm_id 分组并保留 lab_day 最小的行
lab_df = df.groupby('hadm_id').first().reset_index().drop_duplicates()

### 影像学

In [5]:
import pandas as pd

cxr_time = pd.read_csv("../MIMIC-IV-CXR/mimic-iv-cxr-2/mimic-cxr-2.0.0-metadata.csv")
cxr_time = cxr_time[["subject_id", "study_id", "StudyDate"]].drop_duplicates()
cxr_che = pd.read_csv("../MIMIC-IV-CXR/mimic-iv-cxr-2/mimic-cxr-2.0.0-chexpert.csv")
cxr_data = pd.merge(cxr_time,cxr_che)

adm_pn = pd.read_csv("pneumonia/adm_pn.csv", index_col=0)
df = pd.merge(adm_pn[["subject_id", "hadm_id", "admittime"]], cxr_data)
df['admittime'] = pd.to_datetime(df['admittime'])
df['StudyDate'] = pd.to_datetime(df['StudyDate'],format = "%Y%m%d")
df["cxr_day"] = (df["StudyDate"] - df["admittime"]).dt.days

df = df.drop(['subject_id', 'admittime', 'study_id','StudyDate'], axis=1)
df = df.groupby(['hadm_id','cxr_day']).mean().reset_index()
df = df.loc[(df["cxr_day"]<=2) & (df["cxr_day"]>=-2)]
df = df.fillna(0)
df = df.sort_values(['hadm_id', 'cxr_day'])
# 根据 hadm_id 分组并保留 lab_day 最小的行
cxr_df = df.groupby('hadm_id').first().reset_index().drop_duplicates()

## 预测模型

In [7]:
def logistic_regression_with_feature_processing(data,categorical_features,continuous_features, label, testsize):
    
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import LabelEncoder, StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    
    data = data[categorical_features+continuous_features+label]
    # 处理缺失值
    imputer = SimpleImputer(strategy='mean') 
    data_filled = imputer.fit_transform(data)
    data_filled = pd.DataFrame(data_filled, columns=data.columns)

    # 特征编码
    if len(categorical_features)>0:
        label_encoder = LabelEncoder()
        for feature in categorical_features:
            data_filled[feature] = label_encoder.fit_transform(data_filled[feature])
            
    if len(continuous_features)>0:
        scaler = StandardScaler()
        data_filled[continuous_features] = scaler.fit_transform(data_filled[continuous_features])

    # 划分数据集
    X = data_filled.drop(label, axis=1)  # 假设目标变量名为'target'
    y = data_filled[label]
    X_train, X_test, y_train, y_test = train_test_split(X.astype("int"), y.astype("int"), test_size=testsize, random_state=42)

    # 训练逻辑回归模型
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_test)
    
    accuracy = accuracy_score(y_test.astype('int'), y_pred.astype('int'))
    print("Accuracy: %.2f" % accuracy)  

    return y_pred, accuracy

In [46]:
data = pd.merge(patients_demo,full_diag)
data = pd.merge(data,lab_df)
data = pd.merge(data,cxr_df)
data = data.drop(["lab_day","cxr_day"],axis=1)

cate_features = [i for i in data.columns if len(set(data[i]))<=5 ]
continous_features = [i for i in data.columns if len(set(data[i]))>5]
print(cate_features,continous_features)

continous_features.remove("hadm_id")

label = ["sepsis3"]
adm_pn = pd.read_csv("pneumonia/adm_pn.csv", index_col=0)
data = pd.merge(data, adm_pn[["hadm_id"]+label])

# data.loc[data["mixed_infection"] == "BACT","mixed_infection"] = 1
# data.loc[data["mixed_infection"] == "VIRUS","mixed_infection"] = 0
# data = data.loc[data["mixed_infection"].isin([1,0])]
data = data.loc[data["hadm_id"].isin(adm_pn.loc[adm_pn["mixed_infection"]=="BACT","hadm_id"])]
data.loc[data["sepsis3"] == "t","sepsis3"] = 1
data.loc[data["sepsis3"].isna(),"sepsis3"] = 0

logistic_regression_with_feature_processing(data, cate_features, continous_features, label , testsize=0.2)

['gender', 'Congestive heart failure', 'Cardiacarrhythmias', 'Coronary artery atherosclerosis', 'Pulmonarycirculation', 'Hypertention', 'Chronicpulmonary', 'Heptic disease', 'Renal diseases', 'Blood abnormal', 'Diabetes', 'Neuro', 'Immunity suppression', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Support Devices'] ['hadm_id', 'adm_age', 'ck_mb', 'ntprobnp', 'albumin', 'aniongap', 'bicarbonate', 'bun', 'calcium', 'chloride', 'creatinine', 'glucose', 'sodium', 'potassium', 'alt', 'alp', 'ast', 'bilirubin_total', 'ck_cpk', 'ld_ldh', 'Edema', 'Pneumothorax']
Accuracy: 0.70


  y = column_or_1d(y, warn=True)


(array([1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
        0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
        1, 0, 1, 1, 1]),
 0.6989247311827957)

In [40]:
adm_pn

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,infection_type,admittime,dischtime,deathtime,death_event,...,mixed_infection,sofa_time,sofa_score,respiration,coagulation,liver,cardiovascular,cns,renal,sepsis3
0,10001176,23334588,1,4829,9,BACT,2186-11-29 03:56:00,2186-12-02 15:00:00,,0,...,BACT,,,,,,,,,
1,10004401,28128182,3,48241,9,BACT,2144-02-11 18:13:00,2144-02-19 14:30:00,,0,...,BACT,12/2/2144 19:00:00,3.0,2.0,0.0,0.0,1.0,0.0,0.0,t
2,10004733,27411876,4,48249,9,BACT,2174-12-04 11:28:00,2174-12-27 14:00:00,,0,...,BACT,4/12/2174 14:00:00,2.0,0.0,0.0,0.0,0.0,0.0,2.0,t
3,10009049,22995465,2,4829,9,BACT,2174-05-26 08:21:00,2174-05-31 14:15:00,,0,...,BACT,,,,,,,,,
4,10010663,22209635,2,J154,10,BACT,2146-09-28 23:55:00,2146-10-13 11:42:00,,0,...,BACT,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6370,19973319,21871885,1,J1000,10,VIRUS,2120-06-25 07:51:00,2120-06-29 17:01:00,,0,...,VIRUS,,,,,,,,,
6371,19975796,25848942,2,4870,9,VIRUS,2148-12-02 22:08:00,2148-12-14 16:00:00,,0,...,VIRUS,3/12/2148 01:00:00,3.0,0.0,0.0,0.0,3.0,0.0,0.0,t
6372,19983529,29127570,1,4871,9,VIRUS,2142-04-12 01:04:00,2142-04-12 15:42:00,,0,...,VIRUS,,,,,,,,,
6373,19986230,28928599,1,J101,10,VIRUS,2190-02-24 05:07:00,2190-02-26 18:06:00,,0,...,VIRUS,,,,,,,,,
