In [None]:
import pickle
import numpy as np

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the input
# files referenced under /content/drive/MyDrive in later cells can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the types dictionary (maps a diagnosis code string to its embedded token).
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load files that come from a trusted source.
with open("/content/drive/MyDrive/greenDevil/진짜 쓸 거/hospital73_pretraining_v2/output_prefix.types", "rb") as f:
    types_dict = pickle.load(f)

# Label: Sepsis
# 1 (case) : patient with at least one Sepsis diagnosis (ICD-9 038.9)

# ICD-9 codes for Sepsis
disease_codes = ['038.9']

# Translate each ICD-9 code into its embedded token. Codes that are absent
# from the dictionary are collected and reported instead of being dropped
# silently: with an empty token list every patient would be labeled 0.
disease_tokens = []
missing_codes = []
for code in disease_codes:
  if code in types_dict:
    disease_tokens.append(types_dict[code])
  else:
    missing_codes.append(code)

if missing_codes:
  print(f'WARNING: codes not found in types_dict (ignored): {missing_codes}')

print(f'disease tokens: {disease_tokens}')

disease tokens: [39]


In [None]:
# labeling function
def code_processing(codes, visits, target_tokens=None):
    """Label one patient record and strip the target diagnosis tokens from it.

    Args:
        codes: diagnosis tokens of the patient.
        visits: visit number mapping to each entry of ``codes`` (same order).
        target_tokens: iterable of tokens that mark a positive case. When
            omitted, the notebook-level ``disease_tokens`` list is used, which
            matches the previous behavior of this function.

    Returns:
        A tuple ``(found, new_codes, new_visits)`` where ``found`` is 1 if at
        least one target token occurs in ``codes`` (0 otherwise), and
        ``new_codes`` / ``new_visits`` are the inputs with every position that
        held a target token removed.

    Note:
        ``codes`` and ``visits`` are paired with ``zip``, so if their lengths
        differ the extra trailing entries of the longer one are ignored.
    """
    if target_tokens is None:
        # Resolved at call time so the function follows the current global.
        target_tokens = disease_tokens
    target_set = set(target_tokens)  # constant-time membership checks

    found = 0  # flag: was any target token seen?
    new_codes, new_visits = [], []

    for code, visit in zip(codes, visits):
        if code in target_set:
            found = 1
            continue  # drop the target token together with its visit entry
        new_codes.append(code)
        new_visits.append(visit)

    return found, new_codes, new_visits

### Load Data from pickled list
The pickled list is a list of tuples, one per patient record, of the form **(pt_id, label, seq_list, segment_list)**, where
- Label: 1: Sepsis (case), 0: control
- seq_list: list of all medical codes in all visits
- segment_list: the visit number mapping to each code in the sequence list

In [None]:
import pprint

# Input / output file locations
infile = '/content/drive/MyDrive/greenDevil/진짜 쓸 거/hospital73_pretraining_v2/output_prefix.bencs.train'
outfile = '/content/my_bertft.train_73.pkl'

'''
Input output_prefix.bencs:

[
 patient_id,         # string or int: the patient’s unique ID
 los_list,           # list of length-of-stay values for each visit
 time_to_next_list,  # list of time (in days) to the next visit
 token_seq,          # a flattened list of all diagnosis token IDs
 visit_segments      # a list with the same length as token_seq, marking which visit each token came from

]
'''

# Load the patient records and preview the first ten
sequences = np.load(infile, allow_pickle=True)

print('----sequences----')
pprint.pprint(sequences[:10])

# Build one (pt_id, label, seq_list, segment_list) tuple per patient:
# code_processing derives the label and removes the disease tokens.
bert_inputs = []
for item in sequences:
    pt_id = item[0]
    token_seq, visit_segments = item[3], item[4]
    label, seq_list, segment_list = code_processing(token_seq, visit_segments)
    bert_inputs.append((pt_id, label, seq_list, segment_list))

# The label is the second field of every stored tuple
labels = [entry[1] for entry in bert_inputs]

print('----bert inputs----')
pprint.pprint(bert_inputs[:10])

# Class balance: unique label values with their counts
label_counts = np.unique(labels, return_counts=True)
print('labels:', label_counts)

# Save the relabeled records
with open(outfile, 'wb') as f:
    pickle.dump(bert_inputs, f)

print(f"{outfile} 저장 완료!")



----sequences----
[['002-24088', [2], [0], [174], [1]],
 ['002-39047', [9, 9], [281, 0], [73, 16, 73], [1, 2, 2]],
 ['002-21002',
  [9, 9],
  [47, 0],
  [147, 124, 59, 147, 124, 59],
  [1, 1, 1, 2, 2, 2]],
 ['002-27886', [6, 6], [3, 0], [102, 67, 52, 102], [1, 1, 1, 2]],
 ['002-22764', [11], [0], [71, 57, 22], [1, 1, 1]],
 ['002-58565', [6], [0], [100, 48], [1, 1]],
 ['002-42799', [13], [0], [65, 364], [1, 1]],
 ['002-63236',
  [23],
  [0],
  [64, 2, 105, 4, 16, 50, 125, 13, 49, 22, 6, 20],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 ['002-31539', [15], [0], [241, 228, 307, 13], [1, 1, 1, 1]],
 ['002-54528', [0], [0], [197], [1]]]
----bert inputs----
[('002-24088', 0, [174], [1]),
 ('002-39047', 0, [73, 16, 73], [1, 2, 2]),
 ('002-21002', 0, [147, 124, 59, 147, 124, 59], [1, 1, 1, 2, 2, 2]),
 ('002-27886', 0, [102, 67, 52, 102], [1, 1, 1, 2]),
 ('002-22764', 0, [71, 57, 22], [1, 1, 1]),
 ('002-58565', 0, [100, 48], [1, 1]),
 ('002-42799', 0, [65, 364], [1, 1]),
 ('002-63236',
  0,
  [64,