In [1]:
!pip install wikipedia
!pip install lxml



In [4]:
import json
import os
import re

import requests
import wikipedia
from bs4 import BeautifulSoup

In [3]:
def extract_infobox(term, timeout=30):
    """Scrape the infobox of an English Wikipedia article into a dict.

    The page ``https://en.wikipedia.org/wiki/<term>`` is downloaded and the
    first ``<table>`` carrying the ``infobox`` class is read row by row.

    Args:
        term: Article name appended verbatim to the wiki URL.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the notebook forever.

    Returns:
        A dict mapping the text of each row's ``<th>`` to the flattened text
        of the same row's ``<td>``. Bracketed reference markers such as
        ``[1]`` are dropped, every text fragment is followed by one space and
        every ``<br>`` becomes a newline. Rows lacking either a ``<th>`` or a
        ``<td>`` are ignored. An empty dict is returned when the page has no
        infobox table.

    Note:
        Exceptions raised by ``requests.get`` (including timeouts) are not
        caught here and propagate to the caller.
    """
    url = "https://en.wikipedia.org/wiki/" + term
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", {"class": "infobox"})
    if table is None:
        return {}

    info = {}
    for row in table.find_all("tr"):
        header = row.find("th")
        cell = row.find("td")
        if header is None or cell is None:
            continue
        parts = []
        for node in cell.descendants:
            if isinstance(node, str):
                # Drop reference markers like "[1]"; the trailing space keeps
                # neighbouring text fragments from running together.
                parts.append(re.sub(r"\[.*?\]", "", node.strip()) + " ")
            elif node.name == "br":
                parts.append("\n")
        info[header.text] = "".join(parts)
    return info

In [4]:
def get_wiki_sections(body):
    """Split plain-text article content into its top-level sections.

    Sections are delimited by the marker ``'\\n\\n\\n== '``. Deeper headings
    (``=== ... ===``) do not match the marker and therefore stay inside the
    text of their parent section.

    Args:
        body: Full article text.

    Returns:
        A dict whose first entry, ``'Description'``, holds everything before
        the first top-level heading. Every following entry maps a heading
        title (trailing ``=`` signs and whitespace removed) to the text below
        that heading with leading newlines stripped. A heading that is not
        followed by a newline (no content at all) is skipped.
    """
    intro, *chunks = body.split('\n\n\n== ')
    sections = {'Description': intro}
    for chunk in chunks:
        header, newline, text = chunk.partition('\n')
        if not newline:
            # Heading line only, nothing underneath it.
            continue
        # Only the heading line itself is cut off; later occurrences of the
        # same words (e.g. an identically named sub-heading) are left intact.
        title = header.rstrip().rstrip('=').rstrip()
        sections[title] = text.lstrip('\n')
    return sections

In [5]:
def extract_wikipedia(topic):
    """Collect the main fields of one Wikipedia article into a dict.

    Args:
        topic: Title (or search phrase) handed to ``wikipedia.page``.

    Returns:
        A dict with the keys ``name``, ``categories``, ``body``, ``links``,
        ``references``, ``summary``, ``sections`` (the body split by
        ``get_wiki_sections``) and ``infobox`` (scraped by
        ``extract_infobox``).
    """
    page = wikipedia.page(topic)
    content = page.content

    return {
        "name": page.title,
        "categories": page.categories,
        "body": content,
        "links": page.links,
        "references": page.references,
        "summary": page.summary,
        "sections": get_wiki_sections(content),
        # Use the title of the page that was actually loaded rather than the
        # raw topic, so the infobox belongs to the same article as the
        # other fields.
        "infobox": extract_infobox(page.title),
    }

In [11]:
def extraction(wiki, name):
    """Build a two-level tree of extracted Wikipedia articles.

    Args:
        wiki: Mapping of group label -> iterable of topics; every topic is
            passed to ``extract_wikipedia``.
        name: Label stored on the root node.

    Returns:
        A root dict ``{"name", "body", "children"}`` whose children are one
        node per group (same three keys), each holding the extracted
        articles of that group in iteration order. ``body`` is always the
        empty string on the root and on group nodes.
    """
    return {
        "name": name,
        "body": "",
        "children": [
            {
                "name": group,
                "body": "",
                "children": [extract_wikipedia(topic) for topic in topics],
            }
            for group, topics in wiki.items()
        ],
    }

In [6]:
# Psychotherapy topics to scrape: group label -> list of Wikipedia topics.
therapies = {
    "psychodynamic and humanistic psychotherapies": [
        "psychoanalysis", "psychodynamic psychotherapy", "humanistic psychotherapy",
    ],
    "behavioral and cognitive-behavior psychotherapies": [
        "behaviour therapy", "cognitive therapy", "cognitive behavioral therapy",
    ],
    "other modes of clinical intervention": [
        "group therapy", "couples therapy", "family therapy",
        "community psychology", "self-help",
    ],
}

In [18]:
# Scrape every topic listed in `therapies` and store the resulting tree as JSON.
therapy_data = extraction(therapies, 'basic_therapies')
# Create the output directory first so a fresh checkout does not fail in open().
os.makedirs('data', exist_ok=True)
with open('data/basic_therapies.json', 'w', encoding='utf-8') as f:
    json.dump(therapy_data, f)

In [16]:
# Drug topics to scrape: group label -> list of Wikipedia topics.
silver_lining = {
    "psychotic drugs": ["Aripiprazole", "Quetiapine"],
    "Bipolar disorder drugs": ["Lithium salts"],
    "anxiety disorder drugs": ["Alprazolam", "Clonazepam", "Trazodone"],
    "depressive disorder drugs": ["Venlafaxine"],
}

# Scrape every topic listed in `silver_lining` and store the resulting tree as JSON.
data = extraction(silver_lining, 'silver_lining_drugs')
# Create the output directory first so a fresh checkout does not fail in open().
os.makedirs('data', exist_ok=True)
with open('data/silver_lining_drugs.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)