In [42]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import spacy
import numpy as np
from presidio_evaluator.data_objects import InputSample, Span
import json

In [43]:
# Lookup table from the TYPE attribute of an n2c2 annotation tag to the entity
# label given to the generated span. Several source types collapse onto one
# label (e.g. every address component becomes LOCATION); the remaining types
# are passed through under their own name.
N2C2_TO_PRESIDIO = {
    "DATE": "DATE_TIME",
    "HOSPITAL": "ORGANIZATION",
    "PATIENT": "PERSON",
    "MEDICALRECORD": "MEDICALRECORD",
    "AGE": "AGE",
    "COUNTRY": "LOCATION",
    "IDNUM": "IDNUM",
    "DOCTOR": "PERSON",
    "USERNAME": "USERNAME",
    "STREET": "LOCATION",
    "CITY": "LOCATION",
    "STATE": "LOCATION",
    "ZIP": "LOCATION",
    "PROFESSION": "PROFESSION",
    "PHONE": "PHONE_NUMBER",
    "ORGANIZATION": "ORGANIZATION",
    "BIOID": "BIOID",
    "DEVICE": "DEVICE",
    "LOCATION-OTHER": "LOCATION",
    "FAX": "PHONE_NUMBER",
    "EMAIL": "EMAIL_ADDRESS",
    "HEALTHPLAN": "HEALTHPLAN",
    "URL": "URL",
    "O": "O",
}

In [44]:
# Load the spaCy pipeline "en_core_web_trf"; it is applied to each record's text further down.
nlp = spacy.load("en_core_web_trf")

In [53]:
# Directory whose XML files are read by the loading cell below.
# NOTE(review): `dir` shadows the Python builtin of the same name; renaming it
# would also require updating every cell that reads it.
dir = '../data/n2c2/2014/training-PHI-Gold-Set1/'

In [54]:
# Parse every XML record found in `dir` and keep it together with its file name.
# Each entry of `xmls` is {"file_name": <str>, "content": <BeautifulSoup document>}.
xmls = []
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# records (and of everything derived from them) identical on every run.
for file_name in tqdm(sorted(os.listdir(dir))):
    # Skip anything that is not an .xml file, and any file with "_" in its name.
    if not file_name.endswith(".xml") or "_" in file_name:
        continue
    with open(os.path.join(dir, file_name), encoding='utf-8') as f:
        xmls.append({"file_name": file_name,
                     "content": BeautifulSoup(f.read(), features="xml")})

100%|██████████| 1043/1043 [00:00<00:00, 2231.38it/s]


In [55]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['transformer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [56]:
# Run the spaCy pipeline over the text of every parsed record.
#   docs      - the resulting Doc objects, in processing order
#   processed - 0-based positions (into `xmls`) of the records that succeeded,
#               so that docs[k] belongs to xmls[processed[k]]
#   failed    - (file_name, exception) for every record that was skipped
docs = []
processed = []
failed = []

# Iterating over the tqdm wrapper already advances the bar once per record,
# so no manual update() call is needed.
for i, xml in enumerate(tqdm(xmls)):
    try:
        doc = nlp(xml["content"].deIdi2b2.TEXT.text)
    except Exception as exc:
        # Best effort: a record that cannot be read or processed is skipped, but
        # it is recorded instead of being dropped silently. `Exception` (rather
        # than a bare except) lets KeyboardInterrupt stop the loop.
        failed.append((xml["file_name"], exc))
        continue
    docs.append(doc)
    processed.append(i)

# Explicit integer dtype keeps the array usable as indices even when it is empty.
processed = np.array(processed, dtype=int)

if failed:
    print(f"Skipped {len(failed)} of {len(xmls)} records; see `failed` for details")

100%|██████████| 521/521 [05:22<00:00,  1.62it/s]


Generate spans

In [57]:
def _span_from_tag(tag):
    """Build a Span from one annotation tag, mapping its TYPE through N2C2_TO_PRESIDIO.

    The end offset is derived from the start offset plus the length of the
    annotated text.
    """
    label = N2C2_TO_PRESIDIO[tag["TYPE"]]
    text = tag["text"]
    start = int(tag["start"])
    return Span(entity_type=label,
                entity_value=text,
                start_position=start,
                end_position=start + len(text))


input_samples = []

# `processed` holds the position in `xmls` of each successfully processed record,
# in the same order as `docs`, so the two can be walked together.
for ind, doc in zip(tqdm(processed), docs):
    xml = xmls[ind]

    spans = []
    for tag in xml["content"].deIdi2b2.TAGS:
        # Children without a tag name (text nodes between the tags) carry no annotation.
        if tag.name is None:
            continue
        spans.append(_span_from_tag(tag))

    input_sample = InputSample(doc.text, masked=None, spans=spans,
                               create_tags_from_span=True, scheme="BILOU")
    input_samples.append(input_sample)

100%|██████████| 445/445 [05:13<00:00,  1.42it/s]


In [58]:
# Convert every sample into a plain dict so the list can be written out as JSON.
# The name `input_samples` is rebound in place, so when this cell is run a second
# time the items are already dicts; those are passed through unchanged instead of
# failing on the missing to_dict() method.
input_samples = [sample if isinstance(sample, dict) else sample.to_dict()
                 for sample in input_samples]

In [59]:
# Write the serialized samples to disk as one JSON array.
# Plain write mode is sufficient: the file is only written here, never read back.
with open('../data/n2c2/set_1.json', 'w', encoding="utf-8") as f:
    json.dump(input_samples, f)

In [60]:
# Show the first serialized sample as a quick sanity check.
# NOTE(review): the rendered output includes the full note text; consider clearing
# this cell's output before sharing the notebook.
input_samples[0]

{'full_text': "\n\n\nRecord date: 2094-12-09\n\n                     RAH EMERGENCY DEPT VISIT\n \nDUVALL,BRADY C   425-03-15-1                 VISIT DATE: 12/09/94\nPRESENTING COMPLAINT and HISTORY OF PRESENTING COMPLAINT:  The \npatient is a 72-year-old gentleman who presents to the Emergency \nDepartment.  He reports nightly fevers.  He is two week status post \nAICD replacement.  He feels this is a malaria exacerbation.  The \npatient had malaria in Cambodia, and reports he did not have \ntreatment.  His past medical history is also positive for \nnon-insulin-dependent diabetes mellitus, aortic valve replacement \nwith a porcine valve, and ventricular fibrillation.  He is on \nGlucophage and aspirin.  He has no known drug allergies.  The \npatient feels well.  He says that each night he gets fever up to \n101 degrees F., although it is not that high each night.  The \npatient's past medical history is also significant for CAD, with an \nMI in 2092 and ventricular tachycardia in 2086