In [1]:
!pip install tika pyyaml
!pip install regex
# !jupyter kernelspec list

You should consider upgrading via the '/home/delta/Desktop/meta/ontology-graphs/venv/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/delta/Desktop/meta/ontology-graphs/venv/bin/python3 -m pip install --upgrade pip' command.[0m


## Read PDF into python-text

In [2]:
import re
import yaml
import json
import regex

# parsing the pdf
from tika import parser 

# turning stuff to a tree
from anytree import Node, RenderTree, AsciiStyle, PostOrderIter
from anytree.importer import DictImporter
from anytree.exporter import DictExporter
from anytree.search import findall

# Input files, given relative to the notebook's working directory.
DSM_V_location = './raw/DSM_V.pdf'  # PDF handed to tika's parser
DSM_ToC_location = './raw/toc.yaml'  # YAML loaded as the children of the ToC root

In [3]:
# tika returns a dict; the extracted plain text is stored under 'content'.
raw = parser.from_file(DSM_V_location)
content = raw['content']

# tika reports "no text extracted" as None; fail here with a clear message
# instead of letting a later regex call choke on a non-string.
if content is None:
    raise ValueError("tika extracted no text from " + DSM_V_location)

## Process the Table of Contents to cross-check the results

In [4]:
# safe_load (not load) so the YAML cannot instantiate arbitrary objects
with open(DSM_ToC_location) as toc_file:
    toc_children = yaml.safe_load(toc_file)

# wrap the parsed entries under a single synthetic root node
ToC = {"name": "DSM_V", "id": "100000", 'children': toc_children}

In [5]:
importer = DictImporter()
exporter = DictExporter()

# Build the anytree structure from the nested ToC dictionary.
toc_node = importer.import_(ToC)

# Walk the tree in post-order, giving every node a sequential string id
# (starting at 100001) and collapsing runs of spaces in its name.
# enumerate() keeps the counter local to the loop instead of rebinding the
# builtin `id` for the rest of the notebook.
for node_id, node in enumerate(PostOrderIter(toc_node), start=100001):
    node.name = re.sub(r' {2,}', ' ', node.name)
    node.id = str(node_id)

In [6]:
# Persist the id-annotated table of contents as JSON.
with open('./raw/toc.json', 'w+') as toc_json:
    toc_as_dict = exporter.export(toc_node)
    json.dump(toc_as_dict, toc_json)

In [7]:
def get_name(content):
    """Join the lines of ``content`` that start with an uppercase character.

    Empty lines and lines whose first character is not uppercase are dropped;
    the remaining lines are concatenated with single spaces, in order.
    """
    kept = []
    for line in content.split('\n'):
        if line and line[0].isupper():
            kept.append(line)
    return ' '.join(kept)

## Find Disorders that have an organized structure

In [8]:
# possible stuff found inside a block describing a disorder
SECTIONS = [
    'Diagnostic Criteria',
    'Recording Procedures',
    'Specifiers',  
    'Diagnostic Features',
    'Associated Features Supporting Diagnosis',
    'Prevalence',
    'Development and Course',
    'Risk and Prognostic Factors',
    'Culture-Related Diagnostic Issues',
    'Gender-Related Diagnostic Issues',
    'Functional Consequences of',
    'Differential Diagnosis',
    'Comorbidity',
    'Relationship to Other Classifications',
]

# step by step information about diagnostic criteria
STEPS = [
    'Diagnostic Criteria', 'Coding note:', 'Specify if:', 'Specify whether:',
    'A.', 'B.', 'C.', 'D.', 'E.',
    'F.', 'G.', 'H.', 'I.', 'J.',
]
SUBSTEPS = ["1.", "2.", "3.", "4.", "5.", "6.", "7.", "8."]

In [9]:
def get_sections(content, steps=SECTIONS):
    """Split ``content`` into blocks keyed by the heading that opens them.

    Every row of ``content`` is compared against each heading in ``steps``
    that has not been matched yet. A row opens a new block when it starts
    with the heading, or when its leading ``len(heading) + 3`` characters
    contain a fuzzy match of the heading with at most three edits.

    Args:
        content: Text to split; rows are separated by ``"\\n"``.
        steps: Iterable of literal heading strings to look for.

    Returns:
        dict mapping each heading that was found to the text of its block
        (the heading row included). Each heading is matched at most once, and
        any text before the first matched heading is discarded.
    """
    steps = list(steps)

    # Headings are literal labels, not patterns: escape them so characters
    # such as '.', '(' or ':' cannot be interpreted as regex syntax. One
    # compiled fuzzy pattern per heading is reused for every row.
    fuzzy = {
        step: regex.compile("(" + regex.escape(step) + "){e<=3}")
        for step in steps
    }

    base = {}
    previous_step = ""  # "" collects the text that precedes the first heading
    data = []
    found_steps = []
    for row in content.split("\n"):
        for step in steps:
            if step in found_steps:
                continue
            # only the start of the row is searched, so a heading mentioned
            # further along in running text does not open a block
            head = row[: len(step) + 3]
            if row.startswith(step) or fuzzy[step].search(head):
                # close the block collected so far and start a new one
                base[previous_step] = "\n".join(data)
                previous_step = step
                data = []
                found_steps.append(step)
        data.append(row)
    base[previous_step] = "\n".join(data)

    # drop the preamble that precedes the first heading
    base.pop("", None)
    return base


def create_structure(content, steps):
    """Break ``content`` into blocks keyed by the marker that opens them.

    A newline is first inserted before every occurrence of every marker in
    ``steps`` (e.g. ``1. 2. 3.`` or ``A. B. C.``). The text is then read row
    by row: each marker contained in a row closes the current block and opens
    a new one under that marker. Text before the first marker is stored under
    the key ``"Beginning"``.

    Blocks closed by a following marker have ``"\\n\\n"`` replaced by ``"-"``;
    the last block is stored unchanged.
    """
    # put every marker at the start of its own row
    marked = content
    for marker in steps:
        marked = marked.replace(marker, "\n" + marker)

    blocks = {}
    current_key = "Beginning"
    pending = []
    for line in marked.split("\n"):
        hits = [marker for marker in steps if marker in line]
        for marker in hits:
            # flush what was gathered for the previous key
            blocks[current_key] = "\n".join(pending).replace("\n\n", "-")
            current_key = marker
            pending = []
        pending.append(line)
    blocks[current_key] = "\n".join(pending)

    blocks.pop("", None)
    return blocks


def structure_body(content, steps=STEPS, subsections=SUBSTEPS):
    """Re-flow ``content`` so that each step and sub-item starts on its own row.

    The text is first split with ``create_structure`` on ``steps``; each
    resulting block is split again on ``subsections``. Sub-items are joined
    with a single newline and steps with a blank line.

    Args:
        content: Text to restructure.
        steps: Markers of the top-level steps.
        subsections: Markers of the items nested inside a step.

    Returns:
        The restructured text as one string.
    """
    structured = create_structure(content, steps)
    return "\n\n".join(
        # the parameter is `subsections`; the former reference to an
        # undefined `substeps` raised NameError on every call
        "\n".join(create_structure(block, subsections).values())
        for block in structured.values()
    )


def create_sections(content, sections=SECTIONS, structured=['Diagnostic Criteria']):
    """Split ``content`` into sections and re-flow the structured ones.

    Args:
        content: Text block describing one disorder.
        sections: Headings passed to ``get_sections``.
        structured: Headings whose text is additionally passed through
            ``structure_body``. The default list is only read, never mutated.

    Returns:
        dict mapping each heading found to its (possibly restructured) text.
    """
    # pass the `sections` parameter on (the former `steps` was an undefined
    # name) and keep the result in its own variable instead of shadowing it
    parsed = get_sections(content, sections)
    for key, val in parsed.items():
        if key in structured:
            parsed[key] = structure_body(val)
    return parsed

In [10]:
# Walk over every row-initial "Diagnostic Criteria" heading in the extracted
# text. The text between two consecutive headings is treated as one disorder,
# named after the capitalised lines found just before the heading that opens it.
previous = None  # offset of the previous heading; None until the first match
next_name = None  # name extracted in front of the most recent heading
structured_disorders = []

for i in re.finditer(r'^Diagnostic Criteria', content, flags=re.MULTILINE):

    # offset at which this "Diagnostic Criteria" heading starts
    diag_start = i.span()[0]

    # first heading: nothing to emit yet, only remember its name and offset.
    # The name is taken from the 80 characters in front of the heading.
    if previous is None:
        next_name = get_name(content[diag_start-80:diag_start])
        previous = diag_start
        continue

    # the segment being closed is named after the text found in front of the
    # PREVIOUS heading; the text in front of this heading names the next one
    name = next_name 
    next_name = get_name(content[diag_start-80:diag_start])
    
    # segment text runs from the previous heading up to this one, with every
    # occurrence of the segment's own name removed
    text = content[previous:diag_start].replace(name, '')
    disorder = {}
    disorder['body'] = text
    disorder['sections'] = get_sections(text)
    disorder['name'] = name.replace('  ', ' ')
 
    # prepare for next round
    previous = diag_start

    # skip problematic runs
    # NOTE(review): `disorder` always holds exactly three keys at this point
    # ('body', 'sections', 'name'), so this condition is never true; a check
    # on len(disorder['sections']) may have been intended - confirm.
    if len(disorder.keys()) < 2:
        continue
    structured_disorders.append(disorder)

# NOTE(review): a segment is only appended when the NEXT heading is reached,
# so the text following the last "Diagnostic Criteria" match is never added.

In [11]:
# sanity check: number of disorder segments collected by the loop above
len(structured_disorders)

158

### Post-Processing
* some extracted entries have no name; these cases are checked by hand
* some names contain extraction errors (e.g. misread characters)
* some names include too much text and need to be trimmed

In [12]:
# Text snippets used to recognise disorders whose name could not be extracted.
# fix_empty_name_disorders tests each snippet with `in` against the disorder
# body, so the match is verbatim.
# NOTE(review): spelling and truncation (e.g. 'Polysomnograpy', '... with el')
# presumably mirror the extracted PDF text - confirm before "correcting" them.
diag1 = 'Note: A tic is a sudden, rapid, recurrent, nonrhythmic motor movement or vocalization.'
diag2 = 'A. The development of a reversible substance-specific syndrome attributable to recent in'
diag3 = 'A. Daily use of tobacco for at least several weeks.'
diag4 = 'A. Following cessation of use of a hallucinogen, the reexperiencing of one or more of the'
diag5 = 'A. Presence of obsessions, compulsions, or both'
diag6 = 'C. The cognitive deficits do not occur exclusively in the context of a delirium.'
diag7 = 'A. Lack of, or significantly reduced, sexual interest/arousal, as manifested by at least '
diag8 = 'A. Polysomnograpy demonstrates episodes of decreased respiration associated with el'

# disorder name -> snippet that identifies its body text
empty_name_fix = {
    'Tic Disorders': diag1,
    'Other (or Unknown) Substance Intoxication': diag2,
    'Tobacco Withdrawal': diag3,
    'Hallucinogen Persisting Perception Disorder': diag4,
    'Obsessive-Compulsive Disorder': diag5,
    'Mild Neurocognitive Disorder': diag6,
    'Female Sexual Interest/Arousal Disorder': diag7,
    'Sleep-Related Hypoventilation': diag8,
}

In [13]:
# (found, replacement) pairs applied in order by fix_name.
_NAME_FIXES = (
    # misread characters; the first pattern is the four characters l \ / l,
    # written with an explicit backslash escape (the bare '\/' form is an
    # invalid escape sequence and triggers a SyntaxWarning)
    ('l\\/l', 'M'),
    ('IVI', 'M'),
    ('Disinhiblted', 'Disinhibited'),
    ('Dereallzation', 'Derealization'),
    ('-induced', '-Induced'),
    ('Reiated', 'Related'),
    ('Genlto', 'Genito'),
    # text picked up in front of the actual name
    ('A positive family history ', ''),
    (
        'Unspecified Tobacco-Related Disorder Tobacco Use Disorder',
        'Tobacco Use Disorder',
    ),
    (
        'Unspecified Stimulant-Related Disorder Stimulant Use Disorder',
        'Stimulant Use Disorder',
    ),
    (
        'Induced Disorders Unspecified Inhalant-Related Disorder Inhalant Use Disorder',
        'Inhalant Use Disorder',
    ),
    (
        'Disorder Unspecified Hallucinogen-Related Disorder Phencyclidine Use Disorder',
        'Phencyclidine Use Disorder',
    ),
    (
        'Induced Disorders Unspecified Cannabis-Related Disorder Cannabis Use Disorder',
        'Cannabis Use Disorder',
    ),
    (
        'Unspecified Alcohol-Related Disorder Alcohol Use Disorder',
        'Alcohol Use Disorder',
    ),
    (
        'Induced Disorders Unspecified Caffeine-Related Disorder Caffeine Intoxication',
        'Caffeine Intoxication',
    ),
    (
        'Disorder Attention-Deficit/Hyperactivity Disorder',
        'Attention-Deficit/Hyperactivity Disorder Attention-Deficit/Hyperactivity Disorder',
    ),
)


def fix_name(name):
    """Normalise an extracted disorder name so it can be matched to the ToC.

    Everything up to and including the last ``'.'`` is dropped, a fixed list
    of known misreadings and over-long names is substituted in order, and
    leading whitespace is stripped. The substitutions were collected during
    post-processing and are kept here in one place.

    Args:
        name: Raw name as extracted from the text.

    Returns:
        The cleaned name (possibly an empty string).
    """
    name = name[name.rfind('.') + 1:]
    for found, replacement in _NAME_FIXES:
        name = name.replace(found, replacement)
    return name.lstrip()


def fix_empty_name_disorders(disorder):
    """Recover the name of a disorder whose extracted name is empty.

    The disorder body is searched for the identifying snippets listed in
    ``empty_name_fix``; the name belonging to the first snippet found is
    returned. If no snippet occurs in the body an empty string is returned
    (no exception is raised).
    """
    candidates = (
        label
        for label, snippet in empty_name_fix.items()
        if snippet in disorder['body']
    )
    return next(candidates, "")

## Turn the processed disorders into a tree

In [14]:
# Attach every extracted disorder to the single ToC node that carries its name.
for disorder in structured_disorders:
    involved_nodes = []

    # clean up known misreadings and over-long names
    disorder['name'] = fix_name(disorder['name'])

    # an empty name is recovered from the body text, if possible
    if not disorder['name'].strip():
        disorder['name'] = fix_empty_name_disorders(disorder)

    # Look the name up below each top-level ToC entry. The looser matching
    # rules are tried only while nothing has been found so far.
    for subchild in toc_node.children:

        # 1) node name equals the disorder name
        involved_nodes += findall(
            subchild,
            filter_=lambda node: disorder['name'] == node.name
        )
        if len(involved_nodes) == 0:
            # 2) "<parent name> <node name>" equals the disorder name
            involved_nodes += findall(
                subchild,
                filter_=lambda node: disorder['name'] == node.parent.name + " " + node.name
            )
        if len(involved_nodes) == 0:
            # 3) disorder name is contained in parent+node or node+parent
            involved_nodes += findall(
                subchild,
                filter_=lambda node:
                disorder['name'] in node.parent.name + " " + node.name or
                disorder['name'] in node.name + " " + node.parent.name
            )

    # only an unambiguous match is usable
    if len(involved_nodes) == 0:
        print("Error: Disease '" + disorder['name'] + "' was not found in Table of Contents.")
        continue
    if len(involved_nodes) > 1:
        print("Error: Disease '" + disorder['name'] + "' was found more than once in Table of Contents.")
        continue

    node = involved_nodes[0]

    # Turn the sections into child nodes. `sections` is a dict, so it is
    # tested for emptiness directly: the former comparison with [] was always
    # true and replaced the node's children with an empty list whenever no
    # section had been found.
    if disorder.get('sections'):
        node.children = [
            Node(name=key, body=val, type="section")
            for key, val in disorder['sections'].items()
        ]

    # Copy the remaining fields onto the node. The ToC name is kept: the
    # former `pass` fell through to setattr and overwrote node.name with the
    # extracted name.
    for key, val in disorder.items():
        if key == 'name':
            continue
        setattr(node, key, val)

Error: Disease 'Section II' was not found in Table of Contents.
Error: Disease 'Elements of a Diagnosis ' was not found in Table of Contents.


In [15]:
# Write the ToC tree, now carrying the disorder data, to the output file.
with open('../dsm_v/dsm_v.json', 'w+') as dsm_json:
    tree_as_dict = exporter.export(toc_node)
    json.dump(tree_as_dict, dsm_json)