In [1]:
from tqdm import tqdm
import json
import pandas as pd
import time
from itertools import chain
from bs4 import BeautifulSoup
from urllib.parse import unquote
import html
import re
import os
def project_root(cwd):
    """Return the project root for a notebook whose working directory is ``cwd``.

    If ``cwd`` is a directory named ``code``, its parent is returned;
    otherwise ``cwd`` itself (normalised) is returned.  Only the final path
    component is inspected, so a "code" substring elsewhere in the path
    (e.g. ``.../vscode/...``) is left intact.
    """
    cwd = os.path.normpath(cwd)
    if os.path.basename(cwd) == 'code':
        return os.path.dirname(cwd)
    return cwd


current_directory = project_root(os.getcwd())
data_path = os.path.join(current_directory, 'data')
output_path = os.path.join(current_directory, 'outputs')

In [2]:
# Load multistream_text.json from the outputs folder and report the wall-clock time taken.
start_time = time.time()

multistream_file = os.path.join(output_path, 'multistream_text.json')
with open(multistream_file, mode='r', encoding='utf-8') as jsonf:
    jsondata = json.load(jsonf)

end_time = time.time()
print(f"Loading time: {end_time - start_time:.2f} seconds")

Loading time: 5.13 seconds


In [3]:
# Number of top-level entries loaded into jsondata.
len(jsondata)

53981

In [2]:
# Page id -> QID table for our dewiki data (wiki_id_qid.csv in the outputs folder).
df_qid = pd.read_csv(
    os.path.join(output_path, 'wiki_id_qid.csv')
)
df_qid

Unnamed: 0,page id,qid
0,1,Q734916
1,3,Q1121
2,5,Q160726
3,7,Q569322
4,10,Q200694
...,...,...
3399665,12711401,Q38052393
3399666,12711404,Q1325721
3399667,12711406,Q119789306
3399668,12711411,Q912981


In [3]:
# QID -> CUI / TUI / semantic type table (qid_cui_tui.csv in the outputs folder).
df_cui = pd.read_csv(
    os.path.join(output_path, 'qid_cui_tui.csv')
)
df_cui

Unnamed: 0,entity,CUI,QID,TUI,Semantic type
0,http://www.wikidata.org/entity/Q722,C0042306,Q722,T131,Hazardous or Poisonous Substance
1,http://www.wikidata.org/entity/Q722,C0042306,Q722,T196,"Element, Ion, or Isotope"
2,http://www.wikidata.org/entity/Q9655,C0003842,Q9655,T023,"Body Part, Organ, or Organ Component"
3,http://www.wikidata.org/entity/Q9644,C0022417,Q9644,T030,Body Space or Junction
4,http://www.wikidata.org/entity/Q9377,C0022646,Q9377,T023,"Body Part, Organ, or Organ Component"
...,...,...,...,...,...
792673,http://www.wikidata.org/entity/Q21368774,C5761229,Q21368774,T204,Eukaryote
792674,http://www.wikidata.org/entity/Q50380357,C5761230,Q50380357,T204,Eukaryote
792675,http://www.wikidata.org/entity/Q10502785,C5761232,Q10502785,T004,Fungus
792676,http://www.wikidata.org/entity/Q11611665,C3697327,Q11611665,T201,Clinical Attribute


In [5]:
# Count the distinct values in df_cui's CUI column.
len({cui for cui in df_cui['CUI'].to_list()})

763859

In [6]:
# QID -> MeSH ID table (qid_mesh.csv in the outputs folder).
df_mesh = pd.read_csv(
    os.path.join(output_path, 'qid_mesh.csv')
)
df_mesh

Unnamed: 0,entity,MeSH_ID,QID
0,http://www.wikidata.org/entity/Q31,D001530,Q31
1,http://www.wikidata.org/entity/Q155,D001938,Q155
2,http://www.wikidata.org/entity/Q68,D003201,Q68
3,http://www.wikidata.org/entity/Q144,D004285,Q144
4,http://www.wikidata.org/entity/Q183,D005858,Q183
...,...,...,...
38672,http://www.wikidata.org/entity/Q104992442,C007572,Q104992442
38673,http://www.wikidata.org/entity/Q111267276,C071074,Q111267276
38674,http://www.wikidata.org/entity/Q118240704,C000634046,Q118240704
38675,http://www.wikidata.org/entity/Q7253248,C440216,Q7253248


In [8]:
# Count the distinct values in df_mesh's MeSH_ID column.
len({mesh_id for mesh_id in df_mesh['MeSH_ID'].to_list()})

38607

In [9]:
# QID -> DOID table (qid_doid.csv in the outputs folder).
df_doid = pd.read_csv(
    os.path.join(output_path, 'qid_doid.csv')
)
df_doid

Unnamed: 0,entity,DOID,QID
0,http://www.wikidata.org/entity/Q8277,DOID:2377,Q8277
1,http://www.wikidata.org/entity/Q8285,DOID:437,Q8285
2,http://www.wikidata.org/entity/Q16495,DOID:14351,Q16495
3,http://www.wikidata.org/entity/Q12135,DOID:150,Q12135
4,http://www.wikidata.org/entity/Q36855,DOID:0050211,Q36855
...,...,...,...
10613,http://www.wikidata.org/entity/Q102296686,DOID:0112055,Q102296686
10614,http://www.wikidata.org/entity/Q102293888,DOID:0112066,Q102293888
10615,http://www.wikidata.org/entity/Q102296946,DOID:0112074,Q102296946
10616,http://www.wikidata.org/entity/Q102296910,DOID:0112076,Q102296910


In [10]:
# Count the distinct values in df_doid's DOID column.
len({doid for doid in df_doid['DOID'].to_list()})

10609

In [8]:
# MeSH ID -> CUI / TUI / semantic type table (mesh_cui_tui.csv in the outputs folder).
df_mesh_cui = pd.read_csv(
    os.path.join(output_path, 'mesh_cui_tui.csv')
)
df_mesh_cui

Unnamed: 0,CUI,MeSH_ID,TUI,Semantic type
0,C0000005,D012711,T116,"Amino Acid, Peptide, or Protein"
1,C0000005,D012711,T121,Pharmacologic Substance
2,C0000005,D012711,T130,"Indicator, Reagent, or Diagnostic Aid"
3,C0000039,D015060,T109,Organic Chemical
4,C0000039,D015060,T121,Pharmacologic Substance
...,...,...,...,...
688095,C5779476,D001452,T061,Therapeutic or Preventive Procedure
688096,C5779477,D055876,T070,Natural Phenomenon or Process
688097,C5779478,D009680,T070,Natural Phenomenon or Process
688098,C5779479,D007858,T041,Mental Process


In [9]:
# DOID -> CUI / TUI / semantic type table (doid_cui_tui.csv in the outputs folder).
df_doid_cui = pd.read_csv(
    os.path.join(output_path, 'doid_cui_tui.csv')
)
df_doid_cui

Unnamed: 0,DOID,CUI,TUI,Semantic type
0,DOID:0001816,C0018923,T191,Neoplastic Process
1,DOID:0001816,C0854893,T191,Neoplastic Process
2,DOID:0002116,C0033999,T047,Disease or Syndrome
3,DOID:0014667,C0025517,T047,Disease or Syndrome
4,DOID:0040002,C0004058,T047,Disease or Syndrome
...,...,...,...,...
7060,SYMP:0000617,C0028856,T047,Disease or Syndrome
7061,SYMP:0000618,C0231471,T033,Finding
7062,SYMP:0000619,C0039621,T033,Finding
7063,SYMP:0000620,C0238551,T184,Sign or Symptom


In [10]:
# Mention -> link URL -> page id / QID table (mentions_url_including_redirect.csv in the outputs folder).
df_mentions_qid = pd.read_csv(
    os.path.join(output_path, 'mentions_url_including_redirect.csv')
)
df_mentions_qid

Unnamed: 0,mention,url,page id,error,qid
0,Getreide,https://de.wikipedia.org/wiki/Getreide,1944.0,200.0,Q12117
1,Gattung,https://de.wikipedia.org/wiki/Gattung%20%28Bio...,4003287.0,200.0,Q34740
2,Fingerhirsen,https://de.wikipedia.org/wiki/Fingerhirsen,3719543.0,200.0,Q163915
3,Familie,https://de.wikipedia.org/wiki/Familie%20%28Bio...,1704.0,200.0,Q35409
4,Süßgräser,https://de.wikipedia.org/wiki/S%C3%BC%C3%9Fgr%...,4951.0,200.0,Q43238
...,...,...,...,...,...
468411,Dalvirus,https://de.wikipedia.org/wiki/Dalvirus,0.0,404.0,
468412,Deevirus,https://de.wikipedia.org/wiki/Deevirus,0.0,404.0,
468413,Dobrovirus,https://de.wikipedia.org/wiki/Dobrovirus,0.0,404.0,
468414,Thurisazvirus,https://de.wikipedia.org/wiki/Thurisazvirus,0.0,404.0,


In [11]:
# Keep only the mention rows whose qid value is not missing.
has_qid = df_mentions_qid['qid'].notna()
filtered_df = df_mentions_qid[has_qid]
filtered_df

Unnamed: 0,mention,url,page id,error,qid
0,Getreide,https://de.wikipedia.org/wiki/Getreide,1944.0,200.0,Q12117
1,Gattung,https://de.wikipedia.org/wiki/Gattung%20%28Bio...,4003287.0,200.0,Q34740
2,Fingerhirsen,https://de.wikipedia.org/wiki/Fingerhirsen,3719543.0,200.0,Q163915
3,Familie,https://de.wikipedia.org/wiki/Familie%20%28Bio...,1704.0,200.0,Q35409
4,Süßgräser,https://de.wikipedia.org/wiki/S%C3%BC%C3%9Fgr%...,4951.0,200.0,Q43238
...,...,...,...,...,...
468395,Simultaninfektion,https://de.wikipedia.org/wiki/Simultaninfektion,2695226.0,200.0,Q1243018
468396,Immunfluoreszenz,https://de.wikipedia.org/wiki/Immunfluoreszenz...,833738.0,200.0,Q592324
468397,molekulare Masse,https://de.wikipedia.org/wiki/molekulare%20Masse,31795.0,301.0,Q182854
468400,Gelfiltration,https://de.wikipedia.org/wiki/Gelfiltration,566275.0,200.0,Q854422


In [12]:
# Display df_cui again before building the lookup dictionaries from it.
df_cui

Unnamed: 0,entity,CUI,QID,TUI,Semantic type
0,http://www.wikidata.org/entity/Q722,C0042306,Q722,T131,Hazardous or Poisonous Substance
1,http://www.wikidata.org/entity/Q722,C0042306,Q722,T196,"Element, Ion, or Isotope"
2,http://www.wikidata.org/entity/Q9655,C0003842,Q9655,T023,"Body Part, Organ, or Organ Component"
3,http://www.wikidata.org/entity/Q9644,C0022417,Q9644,T030,Body Space or Junction
4,http://www.wikidata.org/entity/Q9377,C0022646,Q9377,T023,"Body Part, Organ, or Organ Component"
...,...,...,...,...,...
792673,http://www.wikidata.org/entity/Q21368774,C5761229,Q21368774,T204,Eukaryote
792674,http://www.wikidata.org/entity/Q50380357,C5761230,Q50380357,T204,Eukaryote
792675,http://www.wikidata.org/entity/Q10502785,C5761232,Q10502785,T004,Fungus
792676,http://www.wikidata.org/entity/Q11611665,C3697327,Q11611665,T201,Clinical Attribute


In [13]:
# Build plain-dict lookups from the mapping tables so the per-article loop
# does not have to query DataFrames.
qid_dict = df_qid.set_index('page id')['qid'].to_dict()                       # page id -> QID
wikicui_dict = df_cui.groupby('QID')['CUI'].apply(list).to_dict()             # QID -> [CUI, ...]
mesh_dict = df_mesh.set_index('QID')['MeSH_ID'].to_dict()                     # QID -> MeSH ID
doid_dict = df_doid.set_index('QID')['DOID'].to_dict()                        # QID -> DOID
mention_dict = filtered_df.set_index('url')['qid'].to_dict()                  # link URL -> QID
mesh_cui_dict = df_mesh_cui.groupby('MeSH_ID')['CUI'].apply(list).to_dict()   # MeSH ID -> [CUI, ...]
doid_cui_dict = df_doid_cui.groupby('DOID')['CUI'].apply(list).to_dict()      # DOID -> [CUI, ...]

# CUI -> TUI / semantic type.  CUIs reached through the MeSH or DOID tables
# are looked up in these dicts as well, so all three tables must contribute;
# building them from df_cui alone leaves most of those look-ups at 'None'.
# df_cui is concatenated last and keep='last' is used, so every CUI present in
# df_cui keeps exactly the value it had when the dict came from df_cui only.
type_columns = ['CUI', 'TUI', 'Semantic type']
cui_types = (
    pd.concat(
        [frame[type_columns] for frame in (df_doid_cui, df_mesh_cui, df_cui)],
        ignore_index=True,
    )
    .drop_duplicates(subset='CUI', keep='last')
    .set_index('CUI')
)
tui_dict = cui_types['TUI'].to_dict()
st_dict = cui_types['Semantic type'].to_dict()


In [15]:
def extract_text_within_tags(text):
    """Locate every ``<a href="...">label</a>`` link in ``text``.

    Args:
        text: article text that still contains the anchor markup.

    Returns:
        A list of ``(url, label, start, end)`` tuples in document order.
        ``url`` is the raw href value, ``label`` the text between the tags
        (returned verbatim), and ``start``/``end`` are the label's offsets in
        the text obtained by replacing each matched anchor with its label,
        i.e. ``stripped[start:end] == label``.

    The anchors are found with the same pattern the notebook uses to strip
    them from the article text, in a single left-to-right pass.  This keeps
    the offsets aligned with the stripped text, gives repeated identical
    links their own correct positions, and never fails on an anchor that
    lacks an href (such an anchor is simply not matched).
    """
    extracted_texts = []
    markup_removed = 0  # characters of tag markup dropped before the current match
    for match in re.finditer(r'<a href="(.*?)">(.*?)</a>', text):
        url, label = match.groups()
        start = match.start() - markup_removed
        extracted_texts.append((url, label, start, start + len(label)))
        markup_removed += len(match.group(0)) - len(label)
    return extracted_texts

def resolve_concept(qid):
    """Collect the concept identifiers known for ``qid`` and pick one CUI.

    The CUI is chosen by falling through the sources in order and accepting
    a source only when it yields exactly one candidate:

    1. CUIs listed for the QID in ``wikicui_dict``;
    2. if there are none, CUIs of the QID's MeSH ID (``mesh_cui_dict``);
    3. if there are none either, CUIs of the QID's DOID (``doid_cui_dict``).

    An ambiguous source (more than one candidate) stops the search and the
    CUI, TUI and semantic type are all set to the string ``'None'``.

    Returns:
        dict with the keys ``qid, cui, tui, semantic_type, wikidata_cui,
        mesh, mesh_cui, doid, doid_cui`` (in that order).
    """
    # Sorted so the candidate lists come out in the same order on every run;
    # key=str keeps the sort total even if a non-string value is present.
    wikidata_cuis = sorted(set(wikicui_dict.get(qid, [])), key=str)
    mesh_id = mesh_dict.get(qid, 'None')
    mesh_cuis = sorted(set(mesh_cui_dict.get(mesh_id, [])), key=str)
    doid_id = doid_dict.get(qid, 'None')
    doid_cuis = sorted(set(doid_cui_dict.get(doid_id, [])), key=str)

    if len(wikidata_cuis) == 1:
        cui = wikidata_cuis[0]
    elif not wikidata_cuis and len(mesh_cuis) == 1:
        cui = mesh_cuis[0]
    elif not wikidata_cuis and not mesh_cuis and len(doid_cuis) == 1:
        cui = doid_cuis[0]
    else:
        cui = 'None'

    if cui == 'None':
        tui = 'None'
        semantic_type = 'None'
    else:
        tui = tui_dict.get(cui, 'None')
        semantic_type = st_dict.get(cui, 'None')

    return {
        "qid": qid,
        "cui": cui,
        "tui": tui,
        "semantic_type": semantic_type,
        "wikidata_cui": wikidata_cuis,
        "mesh": mesh_id,
        "mesh_cui": mesh_cuis,
        "doid": doid_id,
        "doid_cui": doid_cuis,
    }


jsonarray = []

for page in tqdm(jsondata):
    decoded_text = html.unescape(page['text'])

    # One record per link found in the article, annotated with its concept ids.
    updated_mentions = []
    for url, mention, start, end in extract_text_within_tags(decoded_text):
        link = 'https://de.wikipedia.org/wiki/' + url
        updated_mentions.append({
            "mention": mention,
            "start_index": start,
            "end_index": end,
            "mention_link": link,
            **resolve_concept(mention_dict.get(link, 'None')),
        })

    # replace html tags with extracted strings
    # NOTE(review): the offsets above are computed on decoded_text, while the
    # stored text is additionally percent-decoded.  If the prose itself
    # contains %XX sequences the two can drift apart - confirm whether
    # unquote() is still needed here.
    decoded_tag = re.sub(r'<a href=".*?">(.*?)</a>', r'\1', unquote(decoded_text))

    jsonarray.append({
        "id": page['id'],
        "url": page['url'],
        "title": page['title'],
        "text": decoded_tag,
        **resolve_concept(qid_dict.get(int(page['id']), 'None')),
        "mentions": updated_mentions,
    })

  0%|          | 0/53981 [00:00<?, ?it/s]

100%|██████████| 53981/53981 [04:33<00:00, 197.45it/s]


In [16]:
def serialize_sets(obj):
    """``default=`` hook for ``json.dump``/``json.dumps`` that encodes sets as lists.

    Args:
        obj: a value the JSON encoder could not serialise on its own.

    Returns:
        A list with the elements of ``obj`` when it is a ``set`` or
        ``frozenset``.

    Raises:
        TypeError: for any other type.  A ``default`` hook must raise rather
            than hand the same object back; returning it unchanged makes the
            encoder fail with a misleading "Circular reference detected".
    """
    if isinstance(obj, (set, frozenset)):
        return list(obj)

    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
    
# Write the assembled records to WikiMed-DE.json.  json.dump streams the
# encoded output into the file, so the whole document never has to exist as a
# single in-memory string; the bytes written are the same as with json.dumps.
with open(os.path.join(output_path, 'WikiMed-DE.json'), 'w', encoding='utf-8') as jsonf:
    json.dump(jsonarray, jsonf, indent=4, ensure_ascii=False, default=serialize_sets)

### WikiMed-DE-BEL

In [51]:
# Inspect the 'text' field of the first loaded record.
jsondata[0]['text']

{'id': '1159055',
 'url': 'https://de.wikipedia.org/wiki?curid=1159055',
 'title': 'Foniohirse',
 'text': 'Die Foniohirse ("Digitaria exilis"), auch Hungerreis, Hungerhirse oder Acha genannt, ist eine Getreideart aus der Gattung der Fingerhirsen ("Digitaria") in der Familie der Süßgräser (Poaceae), von der es zahlreiche Landsorten gibt. Zur gleichen Gattung gehört das auch als „Schwarzer Fonio“ bezeichnete Iburu ("Digitaria iburua").\nBeschreibung.\nDie Foniohirse ist eine einjährige, aufrechte, krautige Pflanze, die Wuchshöhen von 35 bis 75 Zentimetern erreicht. Sie hat kurze Laubblätter. Die Fingerähren bestehen aus zwei bis fünf schmalen Teilähren, die bis zu 15 Zentimeter lang werden. Die Ährchen sind einblütig. Die Karyopsen sind mit ein bis 1,5 Millimeter sehr klein; die Farbe reicht von weiß über gelblich bis purpurn.\nFonio reift nach verschiedenen Angaben schneller als alle anderen Getreidearten. Manche Varietäten können bereits sechs bis acht Wochen nach der Aussaat geerntet 

In [30]:
def extract_text_within_tags(text):
    """Locate every ``<a href="...">label</a>`` link in ``text``.

    Args:
        text: article text that still contains the anchor markup.

    Returns:
        A list of ``(url, label, start, end)`` tuples in document order.
        ``url`` is the raw href value, ``label`` the text between the tags
        (returned verbatim), and ``start``/``end`` are the label's offsets in
        the text obtained by replacing each matched anchor with its label,
        i.e. ``stripped[start:end] == label``.

    The anchors are found with the same pattern the notebook uses to strip
    them from the article text, in a single left-to-right pass.  This keeps
    the offsets aligned with the stripped text, gives repeated identical
    links their own correct positions, and never fails on an anchor that
    lacks an href (such an anchor is simply not matched).
    """
    extracted_texts = []
    markup_removed = 0  # characters of tag markup dropped before the current match
    for match in re.finditer(r'<a href="(.*?)">(.*?)</a>', text):
        url, label = match.groups()
        start = match.start() - markup_removed
        extracted_texts.append((url, label, start, start + len(label)))
        markup_removed += len(match.group(0)) - len(label)
    return extracted_texts
import re
def resolve_concept(qid):
    """Collect the concept identifiers known for ``qid`` and pick one CUI.

    The CUI is chosen by falling through the sources in order and accepting
    a source only when it yields exactly one candidate:

    1. CUIs listed for the QID in ``wikicui_dict``;
    2. if there are none, CUIs of the QID's MeSH ID (``mesh_cui_dict``);
    3. if there are none either, CUIs of the QID's DOID (``doid_cui_dict``).

    An ambiguous source (more than one candidate) stops the search and the
    CUI, TUI and semantic type are all set to the string ``'None'``.

    Returns:
        dict with the keys ``qid, cui, tui, semantic_type, wikidata_cui,
        mesh, mesh_cui, doid, doid_cui`` (in that order).
    """
    # Sorted so the candidate lists come out in the same order on every run;
    # key=str keeps the sort total even if a non-string value is present.
    wikidata_cuis = sorted(set(wikicui_dict.get(qid, [])), key=str)
    mesh_id = mesh_dict.get(qid, 'None')
    mesh_cuis = sorted(set(mesh_cui_dict.get(mesh_id, [])), key=str)
    doid_id = doid_dict.get(qid, 'None')
    doid_cuis = sorted(set(doid_cui_dict.get(doid_id, [])), key=str)

    if len(wikidata_cuis) == 1:
        cui = wikidata_cuis[0]
    elif not wikidata_cuis and len(mesh_cuis) == 1:
        cui = mesh_cuis[0]
    elif not wikidata_cuis and not mesh_cuis and len(doid_cuis) == 1:
        cui = doid_cuis[0]
    else:
        cui = 'None'

    if cui == 'None':
        tui = 'None'
        semantic_type = 'None'
    else:
        tui = tui_dict.get(cui, 'None')
        semantic_type = st_dict.get(cui, 'None')

    return {
        "qid": qid,
        "cui": cui,
        "tui": tui,
        "semantic_type": semantic_type,
        "wikidata_cui": wikidata_cuis,
        "mesh": mesh_id,
        "mesh_cui": mesh_cuis,
        "doid": doid_id,
        "doid_cui": doid_cuis,
    }


jsonarray = []
for page in tqdm(jsondata):
    decoded_text = html.unescape(page['text'])
    # Article text with every anchor replaced by its label.
    # NOTE(review): mention offsets are computed on decoded_text, while this
    # text is additionally percent-decoded; prose containing %XX sequences can
    # make the two drift apart (such spans are rejected by the alignment check
    # below) - confirm whether unquote() is still needed.
    decoded_tag = re.sub(r'<a href=".*?">(.*?)</a>', r'\1', unquote(decoded_text))

    updated_mentions = []
    for url, mention, start, end in extract_text_within_tags(decoded_text):
        link = 'https://de.wikipedia.org/wiki/' + url
        concept = resolve_concept(mention_dict.get(link, 'None'))

        # Keep only mentions that resolved to a single CUI.
        if concept["cui"] == 'None':
            continue
        # Both neighbouring characters must exist.  The upper bound is
        # len(decoded_tag), not len(decoded_tag) - 1: decoded_tag[end] is valid
        # for every end < len(decoded_tag), so a mention followed by the very
        # last character of the text must not be dropped.
        if not (start > 0 and end < len(decoded_tag)):
            continue
        # Reject spans that do not actually cover the mention text.
        if decoded_tag[start:end] != mention:
            continue
        # Keep mentions preceded by whitespace and not followed by a letter.
        if decoded_tag[start - 1].isspace() and not decoded_tag[end].isalpha():
            updated_mentions.append({
                "mention": mention,
                "start_index": start,
                "end_index": end,
                "mention_link": link,
                **concept,
            })

    jsonarray.append({
        "id": page['id'],
        "url": page['url'],
        "title": page['title'],
        "text": decoded_tag,
        **resolve_concept(qid_dict.get(int(page['id']), 'None')),
        "mentions": updated_mentions,
    })
            


100%|██████████| 53981/53981 [04:22<00:00, 205.47it/s]


In [31]:
def serialize_sets(obj):
    """``default=`` hook for ``json.dump``/``json.dumps`` that encodes sets as lists.

    Args:
        obj: a value the JSON encoder could not serialise on its own.

    Returns:
        A list with the elements of ``obj`` when it is a ``set`` or
        ``frozenset``.

    Raises:
        TypeError: for any other type.  A ``default`` hook must raise rather
            than hand the same object back; returning it unchanged makes the
            encoder fail with a misleading "Circular reference detected".
    """
    if isinstance(obj, (set, frozenset)):
        return list(obj)

    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
    
# Write the filtered records to WikiMed-DE-BEL.json.  json.dump streams the
# encoded output into the file, so the whole document never has to exist as a
# single in-memory string; the bytes written are the same as with json.dumps.
with open(os.path.join(output_path, 'WikiMed-DE-BEL.json'), 'w', encoding='utf-8') as jsonf:
    json.dump(jsonarray, jsonf, indent=4, ensure_ascii=False, default=serialize_sets)

In [33]:
# Inspect the first assembled output record.
jsonarray[0]

{'id': '1159055',
 'url': 'https://de.wikipedia.org/wiki?curid=1159055',
 'title': 'Foniohirse',
 'text': 'Die Foniohirse ("Digitaria exilis"), auch Hungerreis, Hungerhirse oder Acha genannt, ist eine Getreideart aus der Gattung der Fingerhirsen ("Digitaria") in der Familie der Süßgräser (Poaceae), von der es zahlreiche Landsorten gibt. Zur gleichen Gattung gehört das auch als „Schwarzer Fonio“ bezeichnete Iburu ("Digitaria iburua").\nBeschreibung.\nDie Foniohirse ist eine einjährige, aufrechte, krautige Pflanze, die Wuchshöhen von 35 bis 75 Zentimetern erreicht. Sie hat kurze Laubblätter. Die Fingerähren bestehen aus zwei bis fünf schmalen Teilähren, die bis zu 15 Zentimeter lang werden. Die Ährchen sind einblütig. Die Karyopsen sind mit ein bis 1,5 Millimeter sehr klein; die Farbe reicht von weiß über gelblich bis purpurn.\nFonio reift nach verschiedenen Angaben schneller als alle anderen Getreidearten. Manche Varietäten können bereits sechs bis acht Wochen nach der Aussaat geerntet 