# Parsing ICD-10-PCS XML file

## Importing etree

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
#Change setting to avoid dataframe from truncating
pd.options.display.max_rows = 500
pd.options.display.width = 500
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 500

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()
import warnings        # To suppress warning alert
warnings.filterwarnings('ignore')

## Parsing xml file into etree

In [3]:
# tree = ET.parse("/Users/medicalagent3/pcs2020_xml_parse.xml")
tree = ET.parse("/icd10pcs_tables_2020.xml")
root = tree.getroot()

## Parsing nodes

In [5]:
def parse_icd():
    code_dict = {}
    for n in tqdm(range(len(root.findall("./pcsTable")))):
        node1 = root.findall("./pcsTable[{}]//*[@pos='1']/label".format(n+1))
        node2 = root.findall("./pcsTable[{}]//*[@pos='2']/label".format(n+1))
        node3 = root.findall("./pcsTable[{}]//*[@pos='3']/label".format(n+1))
        node4 = root.findall("./pcsTable[{}]//*[@pos='4']/label".format(n+1))
        node5 = root.findall("./pcsTable[{}]//*[@pos='5']/label".format(n+1))
        node6 = root.findall("./pcsTable[{}]//*[@pos='6']/label".format(n+1))
        node7 = root.findall("./pcsTable[{}]//*[@pos='7']/label".format(n+1))

        nodes = [node1,node2,node3,node4,node5,node6,node7]

        pos1,pos2,pos3,pos4,pos5,pos6,pos7 = [],[],[],[],[],[],[] 

        unique_code = [pos1,pos2,pos3,pos4,pos5,pos6,pos7]

        t1,t2,t3,t4,t5,t6,t7 = [],[],[],[],[],[],[]

        unique_title = [t1,t2,t3,t4,t5,t6,t7]

        for val in range(7):
            nd = nodes[val]
            for i in nd:
                unique_title[val].append(i.text)
                unique_code[val].append(i.attrib['code'])

        for n1,m1 in zip(pos1,t1):
            title = m1
            code = n1
            for n2,m2 in zip(pos2,t2):
                title1 = title + " @" + m2
                code1 = code + n2
                for n3,m3 in zip(pos3,t3):
                    title2 = title1 + " @" +  m3
                    code2 = code1 + n3
                    for n4,m4 in zip(pos4,t4):
                        title3 = title2 + " @" +  m4
                        code3 = code2 + n4
                        for n5,m5 in zip(pos5,t5):
                            title4 = title3 + " @" +  m5
                            code4 = code3 + n5
                            for n6,m6 in zip(pos6,t6):
                                title5 = title4 + " @" +  m6
                                code5 = code4 + n6
                                for n7,m7 in zip(pos7,t7):
                                    ntitle = title5 + " @" +  m7
                                    code6 = code5 + n7
                                    code_dict[code6] = ntitle
    return code_dict

In [6]:
bruteforce_icd10 = parse_icd()

HBox(children=(FloatProgress(value=0.0, max=880.0), HTML(value='')))




## Converting to dataframe

In [14]:
brute_icd_df = pd.DataFrame.from_dict(bruteforce_icd10,orient='index').reset_index().rename(columns={'index':'code',0:'title'})
brute_icd_df

Unnamed: 0,code,title
0,0016070,Medical and Surgical @Central Nervous System and Cranial Nerves @Bypass @Cerebral Ventricle @Open @Autologous Tissue Substitute @Nasopharynx
1,0016071,Medical and Surgical @Central Nervous System and Cranial Nerves @Bypass @Cerebral Ventricle @Open @Autologous Tissue Substitute @Mastoid Sinus
2,0016072,Medical and Surgical @Central Nervous System and Cranial Nerves @Bypass @Cerebral Ventricle @Open @Autologous Tissue Substitute @Atrium
3,0016073,Medical and Surgical @Central Nervous System and Cranial Nerves @Bypass @Cerebral Ventricle @Open @Autologous Tissue Substitute @Blood Vessel
4,0016074,Medical and Surgical @Central Nervous System and Cranial Nerves @Bypass @Cerebral Ventricle @Open @Autologous Tissue Substitute @Pleural Cavity
...,...,...
368457,XW0DXV2,New Technology @Anatomical Regions @Introduction @Mouth and Pharynx @External @Gilteritinib Antineoplastic @New Technology Group 2
368458,XW0DXV3,New Technology @Anatomical Regions @Introduction @Mouth and Pharynx @External @Gilteritinib Antineoplastic @New Technology Group 3
368459,XW0DXV4,New Technology @Anatomical Regions @Introduction @Mouth and Pharynx @External @Gilteritinib Antineoplastic @New Technology Group 4
368460,XXE5XM5,"New Technology @Physiological Systems @Measurement @Circulatory @External @Infection, Whole Blood Nucleic Acid-base Microbial Detection @New Technology Group 5"
