In [15]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re
import pandas as pd

In [3]:
# --- Configuration ---------------------------------------------------------
# Text encoding name shared by the notebook's file I/O.
ENCODING = "utf-8"

# Working directory at the moment this cell is executed.
PATH_WIKI_XML = os.getcwd()

# Input: file name of the Wikipedia XML dump.
FILENAME_WIKI = "enwiki-latest-pages-articles.xml"

# Output CSV file names.
FILENAME_ARTICLES = "articles.csv"
FILENAME_REDIRECT = "articles_redirect.csv"
FILENAME_TEMPLATE = "articles_template.csv"
FILENAME_DRUG = "articles_drugs.csv"
FILENAME_DISEASE = "articles_diseases.csv"

In [97]:
def hms_string(sec_elapsed):
    """Render a duration in seconds as an ``H:MM:SS.ss`` string.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        A string with unpadded hours, minutes zero-padded to two digits and
        seconds zero-padded to width five with two decimals,
        e.g. ``"1:01:01.50"``.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * seconds_per_minute

    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / seconds_per_minute)
    seconds = sec_elapsed % seconds_per_minute

    template = "{}:{:>02}:{:>05.2f}"
    return template.format(hours, minutes, seconds)
def strip_tag_name(t):
    """Return an XML tag name without its ``{namespace}`` prefix.

    ElementTree reports namespaced tags as ``"{uri}local"``; this returns the
    ``local`` part. Tags that carry no namespace are returned unchanged.

    Args:
        t: Tag string, e.g. ``"{http://example.org/ns}page"``.

    Returns:
        The part of ``t`` after the last ``"}"``, or ``t`` itself when it
        contains no ``"}"``.
    """
    # Operate on the argument itself; do not reach for a global element.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t
def _extract_template(s, marker):
    """Return the brace-balanced template starting at the last ``marker`` in ``s``.

    Scans forward from the marker counting ``{{`` / ``}}`` pairs and stops at
    the ``}}`` that closes the template the marker opened, so nested templates
    are included and text after the template is not.

    Args:
        s: Article wikitext.
        marker: Opening text of the template, beginning with ``"{{"``.

    Returns:
        The template text including its opening and closing braces; the text
        from the marker to the end of ``s`` when the braces never balance; or
        ``''`` when ``marker`` does not occur in ``s``.
    """
    beg = s.rfind(marker)
    if beg == -1:
        return ''

    depth = 0
    pos = beg
    last = len(s) - 1
    while pos < last:
        pair = s[pos:pos + 2]
        if pair == '{{':
            depth += 1
            pos += 2
        elif pair == '}}':
            depth -= 1
            pos += 2
            if depth == 0:
                return s[beg:pos]
        else:
            pos += 1

    # Unbalanced braces: fall back to everything after the marker so callers
    # can still search it.
    return s[beg:]


def get_drugbox(s):
    """Extract the ``{{Drugbox ...}}`` template from article wikitext.

    Args:
        s: Article wikitext.

    Returns:
        The text of the last ``{{Drugbox`` template in ``s`` (closing braces
        included), or ``''`` when no such template is present.
    """
    return _extract_template(s, '{{Drugbox')


def get_medical(s):
    """Extract the ``{{Medical resources ...}}`` template from article wikitext.

    Args:
        s: Article wikitext.

    Returns:
        The text of the last ``{{Medical resources`` template in ``s``
        (closing braces included), or ``''`` when no such template is present.
    """
    return _extract_template(s, '{{Medical resources')
def find_cas(s):
    """Return the value of the first ``CAS_number`` field found in ``s``.

    Args:
        s: Wikitext to search (typically a Drugbox template).

    Returns:
        The digits-and-hyphens value following ``CAS_number =`` for the first
        such field, ``''`` when the field is present but has no such value,
        or ``'NOT FOUND'`` when there is no ``CAS_number`` field at all.
    """
    # The capture group isolates the value, so there is no indexing past the
    # end of the match when nothing follows '=' and no stray whitespace
    # consumed by ``\s?`` ends up in the result.
    match = re.search(r'CAS_number\s*?=\s?([0-9]*-?[0-9]*-?[0-9]*)', s)
    if match is None:
        return 'NOT FOUND'
    return match.group(1)
def find_icd10(s):
    """Collect ICD-10 style codes from a line of ``{{ICD10|...}}`` templates.

    Pipe characters and the ``{{ICD10`` opener are removed first, which joins
    the template's pipe-separated pieces (e.g. ``E|10`` becomes ``E10``);
    every word character followed by two to six digits is then returned.

    Args:
        s: Wikitext containing one or more ``{{ICD10|...}}`` templates.

    Returns:
        List of matched code strings in order of appearance (may be empty).
    """
    s = s.replace('|', '')
    s = s.replace('{{ICD10', "")
    # Raw string: ``\w`` and ``\d`` are regex classes, not string escapes.
    icd10 = re.findall(r'\w\d{2,6}', s)
    return icd10

In [5]:
# Input path: the Wikipedia XML dump (bare file name, so it is resolved
# against the working directory when opened).
pathWikiXML = FILENAME_WIKI

# Output paths for the drug and disease CSVs.
pathDrugsArticles = FILENAME_DRUG
pathDiseaseArticles = FILENAME_DISEASE

# Output paths for the article, redirect and template CSVs.
pathArticles = FILENAME_ARTICLES
pathArticlesRedirect = FILENAME_REDIRECT
pathTemplateRedirect = FILENAME_TEMPLATE

In [None]:
# Stream the Wikipedia dump once and write one CSV row for every article whose
# text contains a Drugbox template (CAS number) or a Medical resources
# template (ICD-9 / ICD-10 codes).

# Counters. Only totalCount is updated in this cell; the others are left
# defined in case other cells read them.
totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0

# Per-page state, reset at every <page> start tag.
title = None
page_id = -1
redirect = ''
inrevision = False
ns = 0
icd9 = []
icd10 = []

start_time = time.time()

# newline="" leaves line endings to the csv module, as the csv docs recommend.
with open(pathDrugsArticles, "w", encoding=ENCODING, newline="") as drugsFH, \
        open(pathDiseaseArticles, "w", encoding=ENCODING, newline="") as diseaseFH:
    drugsWriter = csv.writer(drugsFH, quoting=csv.QUOTE_MINIMAL)
    diseaseWriter = csv.writer(diseaseFH, quoting=csv.QUOTE_MINIMAL)

    diseaseWriter.writerow(['id', 'title', 'ICD9', 'ICD10'])
    drugsWriter.writerow(['id', 'title', 'CAS #'])

    root = None
    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        if root is None:
            # The first element reported is the document root.
            root = elem
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                title = ''
                page_id = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision ids
                inrevision = True
            continue

        # 'end' event: the element's text and attributes are complete.
        if tname == 'title':
            title = elem.text
        elif tname == 'id' and not inrevision:
            page_id = int(elem.text)
        elif tname == 'redirect':
            redirect = elem.attrib['title']
        elif tname == 'ns':
            ns = int(elem.text)
        elif tname == 'page':
            totalCount += 1
        elif tname == 'text':
            cont = elem.text
            # <text> can be empty, in which case elem.text is None.
            if isinstance(cont, str):
                if 'Drugbox' in cont:
                    # Pull the Drugbox template and its CAS number.
                    result = get_drugbox(cont)
                    cas = find_cas(result)
                    drugsWriter.writerow([page_id, title, cas])
                if 'Medical resources' in cont:
                    # Pull the Medical resources template and its ICD codes.
                    result = get_medical(cont)
                    # Start from empty lists so codes from a previous article
                    # never leak into this row.
                    icd9 = []
                    icd10 = []
                    tempICD9 = re.findall('{{ICD9.*', result)
                    tempICD10 = re.findall('{{ICD10.*', result)
                    if len(tempICD9) > 0:
                        icd9 = re.findall(r'\w?\d{3}\.?\d?', tempICD9[0])
                    if len(tempICD10) > 0:
                        icd10 = find_icd10(tempICD10[0])
                    diseaseWriter.writerow([page_id, title, icd9, icd10])

        # Free the finished element inside the loop so memory stays bounded
        # while streaming the dump.
        elem.clear()
        if tname == 'page' and root is not elem:
            # Also drop the emptied <page> nodes that pile up under the root.
            root.clear()

elapsed_time = time.time() - start_time
print("Elapsed time: {}".format(hms_string(elapsed_time)))