In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re
import pandas as pd
import numpy as np

In [2]:
PATH_WIKI_XML = os.getcwd()
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
FILENAME_DRUG = 'articles_drugs.csv'
FILENAME_DISEASE = 'articles_diseases.csv'
ENCODING = "utf-8"

In [3]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)
def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t
def get_drugbox(s):
    beg = (s.rfind('{{Drugbox'))
    end  =(s.rfind('\n}}'))
    if( end == -1):
        end = end =(s.rfind('}}\n'))
    if( end == -1):
        end = end =(s.rfind('}}\n<!--'))
    if( end == -1):
        end = end =(s.rfind('}}\n=='))
    if( end == -1):
        end = end =(s.rfind('}}\n\d'))
    s = s[beg: end+2]
    return s
def get_med_cond(s):
    beg = (s.rfind('{{Infobox medical condition'))
    end  =(s.rfind('\n}}'))
    if( end == -1):
        end = end =(s.rfind('}}\n'))
    if( end == -1):
        end = end =(s.rfind('}}\n<!--'))
    if( end == -1):
        end = end =(s.rfind('}}\n=='))
    if( end == -1):
        end = end =(s.rfind('}}\n\d'))
    s = s[beg: end+2]
    return s
def get_medical(s):
    beg = (s.rfind('{{Medical resources'))
    end  =(s.rfind('\n}}'))
    s = s[beg: end+3]
    return s
def find_unii(s):
    s = re.findall(r'UNII\s*?=\s?.*',s)
    if(len(s)>0):   
        s = s[0]
        equal = s.rfind('=')
        #if there is a space after the equal remove it
        if(s[equal+1]==' '):
            s = s[equal+2:]
        else: 
            s = s[equal+1:]
    else:
        s = ''
    return s
def find_icd10(s):
    s = s.replace('|','')
    s = s.replace('{{ICD10',"")
    icd10 = re.findall('\w{1}\d{2,6}',s)
    return icd10

def find_medication(s):
    s = re.findall(r'medication\s*?=.*<',s)
    if(len(s)== 0 ):
        s = []
    else:
        s = s[0]
        s = s[:-1]
        s = s[s.find('=')+2:]
        s = s.replace('[[','')
        s = s.replace(']]','')
        s = s.split(',')
        s
    return s
def pad_array(arr,length):
    for i in range(0,length-len(arr)):
        arr.append(np.NaN)
    return arr
def find_max_medications(df):
    return max([len(x) for x in df[df.medications.map(len) > 0].medications.values])
def get_row_with_medications(df):
    df =  df[df.medications.map(len) > 0]
    df.reset_index(drop=True,inplace=True)
    return df
def get_row_with_unii(df):
    df =  df[df.unii.map(len) > 0]
    df.reset_index(drop=True,inplace=True)
    return df
def set_meds_columns(df,max_col):
    current_meds = df.medications.values
    padded_meds = [pad_array(el,max_col) for el in current_meds]
    cols = [f"medication_{i}" for i in range(max_col)]
    col_df = pd.DataFrame(padded_meds, columns=cols)
    final = pd.concat([df,col_df], axis=1)
    return final
def clean_row(row):
    for i in range (len(row)):
        el = row[i]
        end = el.rfind('<')
        if end !=-1:
            new_el = el[:end]
            row[i]=new_el
        if i>0:
            row[i]= row[i][:]
        end = el.rfind('(')
        if end !=-1:
            new_el = el[:end]
            row[i]=new_el
        if i>0:
            row[i]= row[i][:]   
    return row

In [4]:
pathWikiXML =FILENAME_WIKI
pathArticles = FILENAME_ARTICLES
pathArticlesRedirect =  FILENAME_REDIRECT
pathTemplateRedirect =  FILENAME_TEMPLATE
pathDrugsArticles =FILENAME_DRUG
pathDiseaseArticles = FILENAME_DISEASE

In [None]:
totalCount = 0
title = None
icd9=[]
icd10=[]
#-----------arrays for drugs_df----------------DO NOT CHANGE NAMES
drugs_name = []
drugs_id = []
drugs_unii = []
#----arrays for diseases_df---------------------DO NOT CHANGE NAMES
diseases_name = []
diseases_id = []
diseases_icd9 = []
diseases_icd10 = []
diseases_meds = []
#-------------------------------------------
start_time = time.time()
for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
    tname = strip_tag_name(elem.tag)

    if event == 'start':
        if tname == 'page':
            title = ''
            id = -1
            redirect = ''
            inrevision = False
            ns = 0
        elif tname == 'revision':
            # Do not pick up on revision id's
            inrevision = True
    else:
        if tname == 'title':
            title = elem.text
        elif tname == 'id' and not inrevision:
            id = int(elem.text)
        elif tname == 'redirect':
            redirect = elem.attrib['title']
        elif tname == 'ns':
            ns = int(elem.text)
        elif tname == 'page':
            totalCount += 1
        elif(tname =='text'):
            cont = elem.text
            #check if cont exists
            if(type(cont)==type('alex')):
                #check if the drugbox template exists 
                if (cont.find('Drugbox')>-1 ):
                    #get the drugbox template
                    result = get_drugbox(cont)
                    #get the CAS number
                    unii = find_unii(result)
                    #drugsWriter.writerow([id, title, cas])
                    drugs_name.append(title)
                    drugs_id.append(id)
                    drugs_unii.append(unii)
                    #print('done Drug')
                if (cont.find('Infobox medical condition')>-1 ):
                    #get the drugbox template
                    result = get_med_cond(cont)
                    #get the CAS number
                    meds = find_medication(result)
                    #drugsWriter.writerow([id, title, meds])
                    #print('done condition')
                    if (cont.find('Medical resources')>-1):
                        #get the drugbox template
                        result = get_medical(cont)
                        #get the CAS number
                        
                        tempICD9 = re.findall('{{ICD9.*',result)
                        tempICD10 = re.findall('{{ICD10.*',result)
                        if len(tempICD9)>0:
                            icd9 = re.findall('\w?\d{3}\.?\d?',tempICD9[0])
                        if len(tempICD10)>0:
                            icd10 = find_icd10(tempICD10[0])
                    diseases_name.append(title)
                    diseases_id.append(id )
                    diseases_meds.append(meds)
                    diseases_icd9.append(icd9)
                    diseases_icd10.append(icd10)
            if (id%100000 ==0):
                drugs_df = pd.DataFrame({ "id": drugs_id,"name": drugs_name, "unii": drugs_unii})
                diseases_df = pd.DataFrame({"id":diseases_id , "name": diseases_name , "icd9": diseases_icd9 , "icd10": diseases_icd10, "medications": diseases_meds })
                diseases_with_meds = get_row_with_medications(diseases_df)
                drugs_df.to_csv('new_drugs_with_unii.csv')
                diseases_with_meds.to_csv('new_diseases_with_med.csv')
                print(f'created arrays at id: {id}')
                    #diseaseWriter.writerow([id, title, icd9, icd10])


elem.clear()
elapsed_time = time.time() - start_time
print("Elapsed time: {}".format(hms_string(elapsed_time)))

created arrays at id: 400000
created arrays at id: 600000
created arrays at id: 1300000
created arrays at id: 1400000
created arrays at id: 1500000
created arrays at id: 1700000
created arrays at id: 1800000
created arrays at id: 1900000
created arrays at id: 2000000
created arrays at id: 2100000
created arrays at id: 2200000
created arrays at id: 2300000
created arrays at id: 2600000
created arrays at id: 2800000


In [6]:
drugs_df = pd.DataFrame({ "id": drugs_id,"name": drugs_name, "unii": drugs_unii})
drugs_with_unii = get_row_with_unii(drugs_df)

In [7]:
diseases_df = pd.DataFrame({"id":diseases_id , "name": diseases_name , "icd9": diseases_icd9 , "icd10": diseases_icd10, "medications": diseases_meds })

In [8]:
diseases_with_meds = get_row_with_medications(diseases_df)
diseases_with_meds.medications.apply(clean_row)
max_meds = find_max_medications(diseases_with_meds)
test = set_meds_columns(diseases_with_meds, max_meds)

In [9]:
test

Unnamed: 0,id,name,icd9,icd10,medications,medication_0,medication_1,medication_2,medication_3,medication_4,medication_5,medication_6,medication_7
0,1776,Arthritis,"[710, 719]","[M00, m00, M25, m20]","[Ibuprofen, paracetamol , nan, nan, nan, nan,...",Ibuprofen,paracetamol,,,,,,
1,4531,Bipolar disorder,[324.0],"[Q273, q20, Q280, q20, Q282, q20]","[Lithium , antipsychotics, anticonvulsants, ...",Lithium,antipsychotics,anticonvulsants,,,,,
2,4581,Bacterial vaginosis,[616.1],[N76],"[Clindamycin or metronidazole, nan, nan, nan, ...",Clindamycin or metronidazole,,,,,,,
3,4746,Plague (disease),[020],[A20],"[Gentamicin and a fluoroquinolone, nan, nan, n...",Gentamicin and a fluoroquinolone,,,,,,,
4,5876,Coronary artery disease,[780.0],"[R402, r40]","[Aspirin, beta blockers, Medical use of nitr...",Aspirin,beta blockers,Medical use of nitroglycerin|nitroglycerin,statins,,,,
5,7012,Chagas disease,[243],"[E00, e00]","[Benznidazole, nifurtimox, nan, nan, nan, nan...",Benznidazole,nifurtimox,,,,,,
6,7038,Candidiasis,[112],"[B37, b35]","[Clotrimazole, nystatin, fluconazole, nan, n...",Clotrimazole,nystatin,fluconazole,,,,,
7,10528,Essential tremor,[333.1],"[G250, g20]","[Beta blockers; propranolol, nadolol, timolo...",Beta blockers; propranolol,nadolol,timolol,primidone,anti-epileptics; topiramate,gabapentin,levetiracetam; benzodiazepines,
8,13492,Hyperthyroidism,[240.9],"[E01, e00, E05, e00]","[Beta blockers, methimazole, nan, nan, nan, n...",Beta blockers,methimazole,,,,,,
9,15354,Interstitial cystitis,[595.1],"[N301, n30]","[Ibuprofen, pentosan polysulfate, amitriptyl...",Ibuprofen,pentosan polysulfate,amitriptyline,,,,,
