In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import spacy
import numpy as np
from presidio_evaluator.data_objects import InputSample, Span
import json

In [2]:
# Load the "en_core_web_trf" spaCy pipeline with its "ner" component disabled.
nlp = spacy.load("en_core_web_trf", disable=["ner"])

In [3]:
# Directory that is scanned for the input XML files.
# NOTE(review): this name shadows the Python builtin `dir()` for the rest of
# the notebook; the cells that list and open the files read this variable.
dir = '/data/datasets/n2c2/2012/raw/train/'

In [None]:
# Parse every eligible XML file found in `dir` into a BeautifulSoup tree.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the order of `xmls` (and everything derived from it) stable across runs.
xmls = []
for file_name in tqdm(sorted(os.listdir(dir))):
    # Skip names that do not end in ".xml" or that contain an underscore;
    # skipped names are printed so they can be checked by eye.
    if not file_name.endswith(".xml") or "_" in file_name:
        print(file_name)
        continue
    with open(os.path.join(dir, file_name), encoding='utf-8') as f:
        xmls.append({"file_name": file_name,
                     "content": BeautifulSoup(f.read(), features="xml")})

In [12]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [5]:
# Run the spaCy pipeline over the narrative text of every parsed XML file.
# `docs[k]` is the Doc built from `xmls[processed[k]]`; `processed` holds the
# 0-based indices into `xmls` of the files that were processed.
docs = []
processed = []
with tqdm(xmls) as pbar:
    # Iterating over the bar already advances it once per item, so no manual
    # pbar.update() call is made here (that would count each file twice).
    for i, xml in enumerate(pbar):
        doc = nlp(xml["content"].ClinicalNarrativeTemporalAnnotation.TEXT.text)
        docs.append(doc)
        processed.append(i)

# Downstream cells iterate over `processed` as a NumPy integer array.
processed = np.array(processed, dtype=int)

100%|██████████| 190/190 [00:59<00:00,  3.22it/s]


Generate spans

In [8]:
# Build one InputSample per processed document. Every EVENT child of the
# file's TAGS element becomes a Span whose end offset is derived from the
# start attribute plus the length of the annotated text.
input_samples = []

for i, ind in enumerate(tqdm(processed)):
    doc, xml = docs[i], xmls[ind]

    spans = []
    for tag in xml["content"].TAGS:
        # Children of TAGS that are not EVENT elements are ignored.
        if tag.name != "EVENT":
            continue
        event_start = int(tag["start"])
        event_text = tag["text"]
        spans.append(Span(entity_type=tag["type"],
                          entity_value=event_text,
                          start_position=event_start,
                          end_position=event_start + len(event_text)))

    # Only the raw text of the spaCy Doc is passed on to the sample.
    input_sample = InputSample(doc.text,
                               masked=None,
                               spans=spans,
                               create_tags_from_span=True,
                               scheme="BILOU")
    input_samples.append(input_sample)

  0%|          | 0/190 [00:00<?, ?it/s]

loading model en_core_web_trf


100%|██████████| 190/190 [01:01<00:00,  3.08it/s]


In [9]:
# Convert every InputSample to a dict so the list can be written out as JSON.
# This cell rebinds `input_samples`, so entries that are already dicts are
# kept as-is; that lets the cell be re-run without raising AttributeError on
# the missing `dict.to_dict`.
input_samples = [
    sample if isinstance(sample, dict) else sample.to_dict()
    for sample in input_samples
]

In [18]:
# Serialise the list of sample dicts to a single JSON file.
# NOTE(review): the samples are read from a ".../raw/train/" directory but are
# written to "test.json" here; confirm this is the intended output name.
output_path = '/data/datasets/n2c2/2012/processed/test.json'
with open(output_path, 'w+', encoding="utf-8") as out_file:
    json.dump(input_samples, out_file)

In [10]:
# Display the first converted sample as a sanity check.
# NOTE(review): the rendered output contains the full clinical note text;
# review or clear it before sharing the notebook.
input_samples[0]

{'full_text': "\nAdmission Date :\n2019-01-19\nDischarge Date :\n2019-01-24\nService : MEDICINE\nAllergies :\nHistory of Present Illness :\nThis is a 37 y/o male patient with PMH Type I DM , HTN , gastroparesis , ESRD on HD ( last in 12/26 per patient ) who presents to CMED CCU with hypertensive urgency unable from the Larry .\nThe patient early this morning to the Ruthie with his usual nausea , vomiting , abdominal pain and was found to be hypertensive to 256/110 , HR 102 , T 99.2 , RR 22 , 92% RA .\nHistory is difficult to obtain from patient d/t somnolence and lack of desire to participate in interview , but per Frank note he was diaphoretic and ' writing in pain ', vomiting clear substance .\nHe was given ativan a total of 4 mg of ativan , 6 mg of dilaudid , labetolol 20 mg IV x 1 and hydralazine 10 mg IV x 1 without good response ( 200/99 ).\nHe recieved 2L NS and was started on labetolol gtt and BP decreased to 161/79 .\nTranferred to CMED CCU for further management while on labe