In [1]:
import sys

In [2]:
sys.path.insert(0, "../..")

# TextSectionizer
Sometimes, you may not want to process an entire document with spaCy. You may instead want to extract specific sections and then process them independently. To do this, you can use the `TextSectionizer` and process a text. Just like the `Sectionizer`, this class comes with default patterns which can be modified or added to.

In [3]:
with open("../discharge_summary.txt") as f:
    text = f.read()

In [4]:
from medspacy.section_detection import TextSectionizer

In [5]:
sectionizer = TextSectionizer()

In [6]:
sectionizer.section_titles

{'addendum',
 'allergies',
 'chief_complaint',
 'comments',
 'diagnoses',
 'family_history',
 'history_of_present_illness',
 'hospital_course',
 'imaging',
 'labs_and_studies',
 'medications',
 'neurological',
 'observation_and_plan',
 'other',
 'past_medical_history',
 'patient_education',
 'patient_instructions',
 'physical_exam',
 'problem_list',
 'reason_for_examination',
 'signature',
 'social_history'}

Unlike the `Sectionizer` patterns, the `pattern` value can only be a string, which will be interpreted as a case-insensitive regular expression. You can add to the `TextSectionizer` with the same `.add()` method:

In [7]:
sectionizer.patterns[:5]

[{'section_title': 'addendum', 'pattern': 'ADDENDUM:'},
 {'section_title': 'addendum', 'pattern': 'Addendum:'},
 {'section_title': 'allergies', 'pattern': 'ALLERGIC REACTIONS:'},
 {'section_title': 'allergies', 'pattern': 'ALLERGIES:'},
 {'section_title': 'chief_complaint', 'pattern': 'CC:'}]

In [8]:
new_patterns = [
    {"section_title": "visit_information", "pattern": "admi(t|ssion) date:"},
    {"section_title": "labs_and_studies", "pattern": "pertinent results:"}
]

In [9]:
sectionizer.add(new_patterns)

# Using Text Sectionizer
We can get the split up document by calling `sectionizer(text)`. This returns a list of 3-tuples which contain:
- `section_title`: The string of the section title
- `section_header`: The span of text matched by the pattern
- `section_text`: The span of text contained in the entire section

In [10]:
sections = sectionizer(text)

In [11]:
print(sections[1])

('other', 'Service:', 'Service: SURGERY\n\n')


In [12]:
for (section_title, section_header, section_text) in sections[:3]:
    print(section_title)
    print(section_header)
    print()
    print(section_text)
    print("---"*5)

visit_information
Admission Date:

Admission Date:  [**2573-5-30**]              Discharge Date:   [**2573-7-1**]

Date of Birth:  [**2498-8-19**]             Sex:   F


---------------
other
Service:

Service: SURGERY


---------------
allergies
Allergies:

Allergies:
Hydrochlorothiazide

Attending:[**First Name3 (LF) 1893**]

---------------


You can unpack these tuples by using the Python `zip(*tuples)` function:

In [13]:
section_titles, section_headers, section_texts = zip(*sections)

In [14]:
section_titles

('visit_information',
 'other',
 'allergies',
 'chief_complaint',
 'history_of_present_illness',
 'past_medical_history',
 'social_history',
 'family_history',
 'hospital_course',
 'medications',
 'observation_and_plan',
 'patient_instructions',
 'signature')

In [15]:
section_headers

('Admission Date:',
 'Service:',
 'Allergies:',
 'Chief Complaint:',
 'History of Present Illness:',
 'Past Medical History:',
 'Social History:',
 'Family History:',
 'Brief Hospital Course:',
 'Discharge Medications:',
 'Discharge Diagnosis:',
 'Discharge Instructions:',
 'Signed electronically by:')

In [16]:
section_texts

('Admission Date:  [**2573-5-30**]              Discharge Date:   [**2573-7-1**]\n\nDate of Birth:  [**2498-8-19**]             Sex:   F\n\n',
 'Service: SURGERY\n\n',
 'Allergies:\nHydrochlorothiazide\n\nAttending:[**First Name3 (LF) 1893**]\n',
 'Chief Complaint:\nAbdominal pain\n\nMajor Surgical or Invasive Procedure:\nPICC line [**6-25**]\nERCP w/ sphincterotomy [**5-31**]\n\n\n',
 'History of Present Illness:\n74y female with type 2 dm and a recent stroke affecting her\nspeech, who presents with 2 days of abdominal pain. Imaging shows no evidence of metastasis.\n\n',
 "Past Medical History:\n1. Colon cancer dx'd in [**2554**], tx'd with hemicolectomy, XRT,\nchemo. Last colonoscopy showed: Last CEA was in the 8 range\n(down from 9)\n2. Type II Diabetes Mellitus\n3. Hypertension\n\n",
 'Social History:\nMarried, former tobacco use. No alcohol or drug use.\n\n',
 'Family History:\nMother with stroke at age 82. no early deaths.\n2 daughters- healthy\n\n\n',
 'Brief Hospital Course:\nM

## Limiting sections
Once you identify the sections in a document, you can then exclude any other sections which aren't relevant. You can then process each document separately or combine into a smaller, more selective document.

In [17]:
relevant_section_titles = ["present_illness", "medication"]
relevant_sections = [section for (section_title, section_header, section) in sections 
                   if section_title in relevant_section_titles]

In [18]:
relevant_section_titles

['present_illness', 'medication']

In [19]:
relevant_text = "\n\n".join(relevant_sections)

In [20]:
import spacy
from medspacy.visualization import visualize_ent 

In [21]:
nlp = spacy.load("en_info_3700_i2b2_2012")



In [22]:
nlp

<spacy.lang.en.English at 0x113672fd0>

In [23]:
doc = nlp(relevant_text)

In [24]:
visualize_ent(doc)