In [1]:
import pandas as pd
import numpy as np
import os
import math
import chardet
from collections import OrderedDict
import json
import re
import glob

In [2]:
# Load the outpatient EMR export into a DataFrame; the path is relative to the
# notebook's working directory.
data_path = r'1110101-1110823門診患者病歷資料.xlsx'
df = pd.read_excel(data_path)

In [3]:
# The dataset contains empty cells; fill them with 0 before any processing.
df = df.fillna(0)

# Collect the record number (病歷號) of every row whose text in column 10
# contains at least one character in the range U+4E00..U+9FFF.
cjk_pattern = re.compile('[\u4e00-\u9fff]')
index_chinese = [
    df.at[row, '病歷號']
    for row, emr in enumerate(df.iloc[:, 10])
    if cjk_pattern.search(str(emr))
]
# De-duplicate while keeping first-seen order.
index_chinese = list(OrderedDict.fromkeys(index_chinese))
print(len(index_chinese))

10778


In [4]:
# Record numbers (column 1) of patients that never appear in index_chinese,
# de-duplicated in first-seen order.
# Membership is tested against a set: the original scanned the whole
# index_chinese list once per row.
chinese_ids = set(index_chinese)
index_eng = list(OrderedDict.fromkeys(
    patient for patient in df.iloc[:, 1] if patient not in chinese_ids
))
print(len(index_eng))

1246


In [5]:
# Extract one patient's rows and expose them as a plain dict.
class patient_inf ():
    """Collect the records of a single patient (selected by 病歷號) from a DataFrame."""

    def __init__(self , df , med_rec_no) :
        self.df = df
        self.num = med_rec_no

    def get_inf (self) :
        """Return a dict with the patient's sex, diagnosis codes and note texts.

        Sex and diagnosis codes are read from the patient's first row only.
        Every row then contributes its column-10 text under the key given by
        column 9: '徵侯', '病史', or '處置' for any other value.  When several
        rows map to the same key, the last one wins.
        """
        rows = self.df[self.df['病歷號'] == self.num]
        first = rows.index[0]

        codes = [rows.at[first, '診斷碼']]
        for column in ('診斷碼.1', '診斷碼.2') :
            value = rows.at[first, column]
            # NOTE(review): only non-string values are kept, so the 0 written
            # by fillna for a missing code is appended while string codes are
            # dropped -- confirm this is the intended filter.
            if type(value) is not str :
                codes.append(value)

        inf = {'性別': rows.at[first, '性別'], '診斷碼': codes}
        for pos in range(len(rows.index)) :
            kind = rows.iloc[pos, 9]
            if kind == '徵侯' :
                target = '徵侯'
            elif kind == '病史' :
                target = '病史'
            else :
                target = '處置'
            inf[target] = rows.iloc[pos, 10]
        return inf

In [6]:
# Build {record number: patient info dict} for every patient in index_eng,
# reading from the raw frame.
dataset_inf = {
    patient: patient_inf(df, patient).get_inf()
    for patient in index_eng
}

In [7]:
# Load the abbreviation table: the first column holds the abbreviation and the
# 'Word' column holds its expansion.
df_abbr = pd.read_excel('abbreviation.xlsx')
abbr_dict = dict(zip(df_abbr.iloc[:, 0], df_abbr['Word']))
print(abbr_dict)

{'BW': 'Body weight', 'BH': 'Body height', 'BT': 'Body temperature', 'BMI': 'Body Mass Index', 'TOCC': 'Travel history, Occupation, Contact history, Cluster', 'Holter': '24hr Holter EKG ', 'EKG': 'Electrocardiography', 'CXR': 'Chest x-ray', 'BP': 'blood pressure', 'RHB': 'Regular heart beat', 'ER': 'emergency room', 'Tx': 'therapy', 'abd': 'abdominal', 'WBC': 'white blood cells', 'URI': 'upper respiratory tract infection', 'AGE': 'Acute Gastroenteritis', 'LV': 'left ventricle', 'EF': 'Ejection friction', 'f/u': 'follow up', 'T4': 'thyroxine', 'RBC': 'red blood cells', 'epi': 'epithelial cell of urine', 'OPD': 'Outpatient department\xa0', 'B.S.': 'breathing sounds', 'HEERT': 'head, ear, eyes, nose, throat', 'RN': 'rhinorrhea', 'NO': 'Nasal obstruction', 'bil': 'bilateral', 'EGD': 'EsophagoGastroDuodenoscopy', 'S/P': 'Post-surgical', 'Bx': 'biopsy', 'HP': 'Helicobacter pylori', 'DRE': 'DIGITAL RECTAL EXAMINATION', 'GU': 'Gastric ulcer', 'CFS': 'Colonofibroscopy', 'DOE': 'dyspnea on exert

In [8]:
# Expand abbreviations in the EMR text column (病歷內容).
df_emr = df['病歷內容'].copy()
print(df_emr.head())

# Only non-empty string keys can be matched against text.
abbr_lookup = {
    abbr: str(full)
    for abbr, full in abbr_dict.items()
    if isinstance(abbr, str) and abbr
}


def _bounded_abbr(abbr):
    """Return a regex matching `abbr` literally, but not inside a longer token.

    The abbreviation is escaped so characters such as '.' and '/' are not
    regex operators, and look-arounds stop e.g. 'ER' from matching in 'FEVER'.
    """
    head = r'(?<![A-Za-z0-9])' if abbr[0].isalnum() else ''
    if abbr[-1].isalpha():
        tail = r'(?![A-Za-z])'
    elif abbr[-1].isdigit():
        tail = r'(?![0-9])'
    else:
        tail = ''
    return head + re.escape(abbr) + tail


# One alternation, longest abbreviation first, so the text is rewritten in a
# single pass and an inserted expansion is never expanded again.
abbr_pattern = (
    re.compile('|'.join(_bounded_abbr(a) for a in sorted(abbr_lookup, key=len, reverse=True)))
    if abbr_lookup else None
)


def _expand_abbreviations(text):
    """Return `text` as a string with abbreviations expanded and '_x000D_' turned into '\\r'."""
    text = str(text)
    if abbr_pattern is not None:
        # A callable replacement keeps backslashes in expansions literal.
        text = abbr_pattern.sub(lambda m: abbr_lookup[m.group(0)], text)
    return text.replace('_x000D_', '\r')


df_emr = df_emr.map(_expand_abbreviations)


0    (111/01/01), nasal mucosa congested and hypert...
1        (111/01/01) cough, sorethroat, RN for 2 days,
2    (111/01/01)BH:87cm, BW:11kg, BMI:14.5, BT:36.7...
3    (111/01/01), cough with sputum and rhinorrhea ...
4                             education and medication
Name: 病歷內容, dtype: object


In [9]:
# Work on a copy so the original frame is not modified, then swap in the
# processed EMR text column.
df_inf_copy = df.copy()
df_inf_copy['病歷內容'] = df_emr

In [10]:
# Rebuild the per-patient info dicts, this time from the frame that holds the
# processed EMR text.
dataset_inf = {
    patient: patient_inf(df_inf_copy, patient).get_inf()
    for patient in index_eng
}

In [None]:
# Flatten the per-patient dicts into a (patient, field) indexed Series.
# The Excel export is left disabled.
df_patient = pd.DataFrame.from_dict(dataset_inf, orient='index').stack()
# df_patient.to_excel('EMR_processed_new.xlsx')
print(df_patient)

In [None]:
# Count how many times each diagnosis code occurs across all patients.
diagnosis_count = {}
for info in dataset_inf.values() :
    for diag in info['診斷碼'] :
        if diag in diagnosis_count :
            diagnosis_count[diag] += 1
        else :
            diagnosis_count[diag] = 1
print(diagnosis_count)
# Sort the (code, count) pairs by count, largest first.
diagnosis_count_sorted = sorted(
    diagnosis_count.items(),
    key=lambda pair : pair[1],
    reverse=True,
)


In [None]:
# Tabulate the sorted diagnosis counts (code, number of patients).
# The Excel export is left disabled.
df_diag = pd.DataFrame(diagnosis_count_sorted, columns=['診斷碼', '人數'])
# df_diag.to_excel('各診斷碼人數.xlsx')
print(df_diag)

In [None]:
# Count how often each combination of diagnosis codes occurs; a combination is
# keyed by its codes joined with commas.
diagnosis_count_2 = {}
for info in dataset_inf.values() :
    diag_set = ','.join(map(str, info['診斷碼']))
    if diag_set not in diagnosis_count_2 :
        diagnosis_count_2[diag_set] = 0
    diagnosis_count_2[diag_set] += 1
diagnosis_count_2_sorted = sorted(
    diagnosis_count_2.items(),
    key=lambda pair : pair[1],
    reverse=True,
)
print(diagnosis_count_2_sorted)

In [None]:
# Tabulate the sorted code-combination counts. The Excel export is left disabled.
df_diag_set = pd.DataFrame(diagnosis_count_2_sorted, columns=['診斷碼', '人數'])
#df_diag_set.to_excel('emr_diag_set_new.xlsx')

In [22]:
import io
import os
import torch
from torch import nn
from torch.optim import Adam
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Model,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from torchviz import make_dot

In [12]:
# Preprocess the dataset: keep only patients carrying at least one of the
# target diagnosis codes and drop everyone else.
# (Alternative two-code setup used during experiments: [460, 465.9].)
dataset_emr = {}
for key, info in dataset_inf.items() :
    has_target = any(
        code == diagnosis
        for diagnosis in [460, 461.9, 465.9, 466]
        for code in info['診斷碼']
    )
    if has_target :
        dataset_emr[key] = info

In [13]:
# Split the data by position in dataset_emr: the first 99 patients form the
# test set, positions 99..792 the training set, and the rest validation.
print(len(dataset_emr.keys()))
ordered_keys = list(dataset_emr.keys())
dataset_test = {key: dataset_emr[key] for key in ordered_keys[:99]}
dataset_train = {key: dataset_emr[key] for key in ordered_keys[99:793]}
dataset_valid = {key: dataset_emr[key] for key in ordered_keys[793:]}
print(len(dataset_train.keys()))
print(len(dataset_valid.keys()))
print(len(dataset_test.keys()))

991
694
198
99


In [25]:
# Build the dataset for multi-disease prediction.
# The shared tokenizer reuses the end-of-sequence token for padding and pads
# on the left.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
class Dataset_V2(torch.utils.data.Dataset):
    """Tokenised symptom texts paired with one binary label per target diagnosis.

    Args:
        dict: mapping of patient key -> info dict that has a '徵侯' text and a
            '診斷碼' iterable of diagnosis codes.
        target_codes: diagnosis codes to encode, in label order. Defaults to
            ``TARGET_CODES``.
        max_length: padded/truncated token length handed to the tokenizer.

    Each item is ``(encoding, labels)``: ``encoding`` is whatever the
    module-level ``tokenizer`` returns for the text, and ``labels`` is a numpy
    array with a 1 for each target code the patient has and 0 otherwise.
    """

    # Diagnosis codes in the order of the label vector.
    TARGET_CODES = ('460', '461.9', '465.9', '466')

    def __init__(self, dict, target_codes=None, max_length=256):
        codes = self.TARGET_CODES if target_codes is None else tuple(target_codes)
        self.keys = dict.keys()
        self.texts = []
        self.labels = []
        for keys in self.keys :
            record = dict[keys]
            self.texts.append(record['徵侯'])
            self.labels.append(self._encode_labels(record['診斷碼'], codes))
        # Number of examples.
        self.n_examples = len(self.labels)
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=max_length,
                                truncation=True,
                                return_tensors="pt") for text in self.texts]

    @staticmethod
    def _same_code(left, right):
        """Compare two diagnosis codes, numerically when both parse as numbers.

        This makes 460, 460.0 and '460' equal; codes that are not numeric fall
        back to string comparison.
        """
        try:
            return float(left) == float(right)
        except (TypeError, ValueError):
            return str(left) == str(right)

    @classmethod
    def _encode_labels(cls, diagnoses, target_codes):
        """Return a 0/1 list with one entry per target code.

        Nothing is read from a loop variable after the loop, so an empty
        `diagnoses` simply yields all zeros.
        """
        return [
            1 if any(cls._same_code(diag, label) for diag in diagnoses) else 0
            for label in target_codes
        ]

    def classes(self):
        """Return the list of label vectors."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label vector at `idx` as a numpy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output for the text at `idx`."""
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [18]:
# Model: GPT-2 backbone with four independent heads, one per disease.
class SimpleGPT2SequenceClassifier_V2(nn.Module):
    """GPT-2 encoder followed by four parallel linear classification heads.

    Every head receives the backbone output flattened to
    ``hidden_size * max_seq_len`` features and emits ``num_classes`` logits.
    """

    def __init__(self, hidden_size: int, num_classes:int ,max_seq_len:int, gpt_model_name:str):
        super().__init__()
        self.gpt2model = GPT2Model.from_pretrained(gpt_model_name)
        flat_features = hidden_size * max_seq_len
        # The attribute names fc1..fc4 become the state_dict keys.
        self.fc1 = nn.Linear(flat_features, num_classes)
        self.fc2 = nn.Linear(flat_features, num_classes)
        self.fc3 = nn.Linear(flat_features, num_classes)
        self.fc4 = nn.Linear(flat_features, num_classes)

    def forward(self, input_id, mask):
        """Return a 4-tuple of logits, one tensor per head.

        Args:
            input_id: encoded input ids of the sentences.
            mask: attention mask forwarded to the GPT-2 backbone.
        """
        hidden_states, _ = self.gpt2model(input_ids=input_id, attention_mask=mask, return_dict=False)
        # Flatten everything after the batch dimension, once, for all heads.
        flat = hidden_states.view(hidden_states.shape[0], -1)
        return self.fc1(flat), self.fc2(flat), self.fc3(flat), self.fc4(flat)

In [19]:
# Train the four per-disease binary heads jointly.
def train_V2(model, train_data, val_data, learning_rate, epochs):
    """Train `model` on `train_data` and validate on `val_data` after each epoch.

    Args:
        model: module whose forward(input_id, mask) returns one logits tensor
            per label column.
        train_data, val_data: patient dicts accepted by ``Dataset_V2``.
        learning_rate: Adam learning rate.
        epochs: number of passes over the training data.

    Returns:
        (history_train_acc, history_valid_acc): one entry per epoch, each a
        list holding the number of correct predictions per head.

    Side effects: moves `model` to CUDA when available, prints one summary line
    per epoch, and leaves `model` in eval mode after the last validation pass.
    """
    train, val = Dataset_V2(train_data), Dataset_V2(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=4, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=4)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    # Build the optimizer after the model is on its final device.
    optimizer = Adam(model.parameters(), lr=learning_rate)

    history_train_acc = []
    history_valid_acc = []

    # The criterion yields a per-batch mean, so summed losses are averaged over
    # batches; accuracies are averaged over samples. max() guards empty data.
    n_train_batches = max(len(train_dataloader), 1)
    n_val_batches = max(len(val_dataloader), 1)
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    def run_batch(batch_input, batch_label):
        """Forward one batch; return (mean loss over heads, correct count per head)."""
        target = batch_label.to(device=device, dtype=torch.long)
        # Each sample is tokenised with a leading dimension of 1; drop it so
        # the mask has the same (batch, seq) shape as the input ids.
        mask = batch_input['attention_mask'].squeeze(1).to(device)
        input_id = batch_input['input_ids'].squeeze(1).to(device)

        outputs = model(input_id, mask)
        head_losses = [criterion(out, target[:, k]) for k, out in enumerate(outputs)]
        correct = [
            (out.argmax(dim=1) == target[:, k]).sum().item()
            for k, out in enumerate(outputs)
        ]
        return sum(head_losses) / len(head_losses), correct

    for epoch_num in range(epochs):
        # Training pass: dropout must be active.
        model.train()
        train_correct = [0, 0, 0, 0]
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            model.zero_grad()
            total_loss, correct = run_batch(train_input, train_label)
            total_loss_train += total_loss.item()
            train_correct = [a + b for a, b in zip(train_correct, correct)]

            total_loss.backward()
            optimizer.step()
        history_train_acc.append(train_correct)

        # Validation pass: eval mode disables dropout so the metrics are
        # deterministic.
        model.eval()
        val_correct = [0, 0, 0, 0]
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                total_loss, correct = run_batch(val_input, val_label)
                total_loss_val += total_loss.item()
                val_correct = [a + b for a, b in zip(val_correct, correct)]
        history_valid_acc.append(val_correct)

        # The continuation lines keep their original indentation because it is
        # part of the printed string.
        print(
            f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train_batches: .3f} \
            | Train Accuracy1: {train_correct[0] / n_train: .3f} \
            | Train Accuracy2: {train_correct[1] / n_train: .3f} \
            | Train Accuracy3: {train_correct[2] / n_train: .3f} \
            | Train Accuracy4: {train_correct[3] / n_train: .3f} \
            | Val Loss: {total_loss_val / n_val_batches: .3f} \
            | Val Accuracy1: {val_correct[0] / n_val: .3f} \
            | Val Accuracy2: {val_correct[1] / n_val: .3f} \
            | Val Accuracy3: {val_correct[2] / n_val: .3f} \
            | Val Accuracy4: {val_correct[3] / n_val: .3f} ")
    return history_train_acc, history_valid_acc
# Training hyper-parameters and the model instance used by the cells below.
EPOCHS = 10
LR = 1e-5
model = SimpleGPT2SequenceClassifier_V2(
    hidden_size=768,
    num_classes=2,
    max_seq_len=256,
    gpt_model_name="gpt2",
)

# NOTE(review): training is disabled here, so train_acc / valid_acc are not
# defined on a fresh kernel -- re-enable the call before running the plots.
# train_acc, valid_acc = train_V2(model, dataset_train, dataset_valid, LR, EPOCHS)

In [None]:
# Plot per-head training accuracy across epochs.
# Legend entries follow the head order 460, 461.9, 465.9, 466. Code 460 is
# nasopharyngitis and 461.9 is sinusitis, so those two names were swapped.
labels = ['nasopharyngitis', 'sinusitis', 'respiratory infection', 'bronchitis&Bronchiolitis']
colors = ['red','blue', 'purple', 'green']
# Size the x axis from the recorded history so it always matches y.
x = list(range(1, len(train_acc) + 1))
for i in range(4) :
    y = [epoch_correct[i] / len(dataset_train) for epoch_correct in train_acc]
    plt.plot(x, y, color=colors[i], linestyle="-", linewidth="2", markersize="16", marker=".", label=labels[i])
plt.legend()
plt.title('train_acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Plot per-head validation accuracy across epochs.
# Legend entries follow the head order 460, 461.9, 465.9, 466. Code 460 is
# nasopharyngitis and 461.9 is sinusitis, so those two names were swapped.
labels = ['nasopharyngitis', 'sinusitis', 'respiratory infection', 'bronchitis&Bronchiolitis']
colors = ['red','blue', 'purple', 'green']
# Size the x axis from the recorded history so it always matches y.
x = list(range(1, len(valid_acc) + 1))
for i in range(4) :
    y = [epoch_correct[i] / len(dataset_valid) for epoch_correct in valid_acc]
    plt.plot(x, y, color=colors[i], linestyle="-", linewidth="2", markersize="16", marker=".", label=labels[i])
plt.legend()
plt.title('validation_acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()

In [37]:
# Rebuild the architecture and load the saved weights.
model_pre = SimpleGPT2SequenceClassifier_V2(hidden_size=768, num_classes=2, max_seq_len=256, gpt_model_name="gpt2")
# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only
# machine; the model is moved to the active device in a later cell.
model_pre.load_state_dict(torch.load("Gpt2-model_0206.pt", map_location="cpu"))
model_pre.eval()

SimpleGPT2SequenceClassifier_V2(
  (gpt2model): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [36]:
# Run the loaded model on one test sample and render its autograd graph.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model_pre.to(device)
print(device)

test_dataset = Dataset_V2(dataset_test)
sample_encoding = test_dataset[1][0]
with torch.no_grad():
    mask = sample_encoding['attention_mask'].to(device)
    input_id = sample_encoding['input_ids'].squeeze(1).to(device)
    y1, y2, y3, y4 = model_pre(input_id, mask)

# The graph needs a forward pass made outside no_grad so the outputs carry
# autograd history.
graph_outputs = model_pre(sample_encoding['input_ids'].squeeze(1).to(device), mask)
make_dot(graph_outputs, params=dict(model_pre.named_parameters()))

cuda


  if LooseVersion(torch.__version__) < LooseVersion("1.9") and \
  if LooseVersion(torch.__version__) < LooseVersion("1.9") and \


ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.graphs.Digraph at 0x1ea0aa20610>

In [28]:
# Display the (tokenizer output, label array) pair at index 1 of the test dataset.
test_dataset[1]

({'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50

In [None]:
def evaluate_V2(model, test_data):
    """Evaluate `model` on `test_data` and collect per-head labels and predictions.

    Args:
        model: module whose forward(input_id, mask) returns four logits tensors.
        test_data: patient dict accepted by ``Dataset_V2``.

    Returns:
        (true_labels, predictions): two lists of four lists each, holding the
        ground-truth and predicted class per sample for every head.

    Side effects: moves `model` to CUDA when available and prints the accuracy
    averaged over the four heads. The model runs in eval mode during the pass
    and its previous train/eval mode is restored afterwards.
    """
    test = Dataset_V2(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=4)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    # Tracking variables, one slot per head.
    true_labels = [[] for _ in range(4)]
    predictions_labels = [[] for _ in range(4)]
    total_acc_test = [0, 0, 0, 0]

    # Eval mode disables dropout; without it predictions vary between runs.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.to(device)
                # Drop the per-sample leading dimension so the mask has the
                # same (batch, seq) shape as the input ids.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                outputs = model(input_id, mask)
                for head, logits in zip(range(4), outputs):
                    predicted = logits.argmax(dim=1)
                    expected = test_label[:, head]
                    total_acc_test[head] += (predicted == expected).sum().item()
                    # add original labels
                    true_labels[head] += expected.cpu().numpy().flatten().tolist()
                    # get predictions to list
                    predictions_labels[head] += predicted.cpu().numpy().flatten().tolist()
    finally:
        model.train(was_training)

    print(f'Test Accuracy: {(sum(total_acc_test) / 4)/len(test_data): .3f}')
    return true_labels, predictions_labels
# Evaluate on the held-out test split.
# NOTE(review): this passes `model`, not the checkpoint-loaded `model_pre`;
# with the training call commented out above, `model` has untrained heads -- confirm which is intended.
true_labels, pred_labels = evaluate_V2(model, dataset_test)

In [None]:
# Plot one confusion matrix per head on a 2x2 grid.
# The matrix rows/columns are ordered by class id (0, then 1), so the display
# names are listed in that same order. The original listed 'True' first, which
# labelled class 0 as 'True'.
labels = {'False':0 ,'True':1}
class_ids = sorted(labels.values())
class_names = sorted(labels, key=labels.get)
fig, ax = plt.subplots(2,2,figsize=(10, 10))
for i in range(4) :
    cm = confusion_matrix(y_true=true_labels[i], y_pred=pred_labels[i], labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=ax[i//2][i%2])