In [5]:
import wikipediaapi  # pip install wikipedia-api
import pandas as pd
import concurrent.futures
from tqdm import tqdm

def wiki_scrape(topic_name, verbose=True):
    """Scrape a Wikipedia page and every page it links to.

    The topic page is fetched first; each of its outgoing links is then
    fetched on a pool of 5 worker threads. Pages whose text is 20 characters
    or shorter, and pages whose title starts with one of the blacklisted
    prefixes, are dropped from the result.

    Args:
        topic_name: Title of the Wikipedia page to start from.
        verbose: When true, show a tqdm progress bar while links are fetched.

    Returns:
        A pandas DataFrame with the columns ``page``, ``text``, ``link``,
        ``categories`` and ``topic`` (one row per retained page), or ``None``
        when the topic page does not exist.
    """
    def wiki_link(link):
        """Fetch one linked page; return its record, or None if unavailable."""
        try:
            page = wiki_api.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
        except Exception:
            # Best effort: a single failing link must not abort the scrape.
            # Narrower than a bare `except:` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return None
        return None

    # NOTE(review): newer wikipedia-api releases appear to require a
    # `user_agent` argument here -- confirm against the installed version.
    wiki_api = wikipediaapi.Wikipedia(language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI)
    page_name = wiki_api.page(topic_name)
    if not page_name.exists():
        print('page {} does not exist'.format(topic_name))
        return None
    page_links = list(page_name.links.keys())
    progress = tqdm(desc='Links Scraped', unit='', total=len(page_links)) if verbose else None
    # The topic page itself is always the first record.
    sources = [{'page': topic_name, 'text': page_name.text, 'link': page_name.fullurl,
                'categories': list(page_name.categories.keys())}]
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_link = {executor.submit(wiki_link, link): link for link in page_links}
            for future in concurrent.futures.as_completed(future_link):
                data = future.result()
                if progress is not None:
                    progress.update(1)
                if data:
                    sources.append(data)
    finally:
        # Close the bar even if the scrape is interrupted.
        if progress is not None:
            progress.close()
    blacklist = ('Template', 'Help:', 'Category:', 'Portal:', 'Wikipedia:', 'Talk:')
    sources = pd.DataFrame(sources)
    # Per-row text length. `len(sources['text'])` would be the number of rows,
    # which turns the filter into an all-or-nothing switch.
    long_enough = sources['text'].str.len() > 20
    blacklisted = sources['page'].str.startswith(blacklist)
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the unfiltered one.
    sources = sources[long_enough & ~blacklisted].copy()
    # Drop the first 9 characters of every category key; assumes each key
    # carries the 'Category:' prefix (9 characters).
    prefix_len = len('Category:')
    sources['categories'] = sources.categories.apply(
        lambda names: [name[prefix_len:] for name in names])
    sources['topic'] = topic_name
    print ('Wikipedia pages scraped:', len(sources))
    return sources

In [6]:
# Scrape the GERD article and the pages it links to (network-bound; the run logged below took about 1m45s).
wiki_data = wiki_scrape('Gastroesophageal reflux disease')

Links Scraped: 100%|██████████| 338/338 [01:45<00:00,  3.20/s]

Wikipedia pages scraped: 329





In [7]:
# Preview the first ten scraped pages.
wiki_data.head(10)

Unnamed: 0,page,text,link,categories,topic
0,Gastroesophageal reflux disease,"Gastroesophageal reflux disease (GERD), also k...",https://en.wikipedia.org/wiki/Gastroesophageal...,"[All articles with unsourced statements, Artic...",Gastroesophageal reflux disease
1,Abdominal angina,Abdominal angina is abdominal pain after eatin...,https://en.wikipedia.org/wiki/Abdominal_angina,"[All articles with unsourced statements, Artic...",Gastroesophageal reflux disease
2,Achalasia,"Esophageal achalasia, often referred to simply...",https://en.wikipedia.org/wiki/Esophageal_achal...,"[All articles with unsourced statements, Artic...",Gastroesophageal reflux disease
3,Achlorhydria,"Achlorhydria, also known as hypochlorhydria, r...",https://en.wikipedia.org/wiki/Achlorhydria,"[All articles needing additional references, A...",Gastroesophageal reflux disease
4,Abdominopelvic cavity,The abdominopelvic cavity is a body cavity tha...,https://en.wikipedia.org/wiki/Abdominopelvic_c...,"[Abdomen, Wikipedia articles with TA98 identif...",Gastroesophageal reflux disease
5,Accessory digestive gland,The human digestive system consists of the gas...,https://en.wikipedia.org/wiki/Human_digestive_...,"[All articles to be expanded, All articles wit...",Gastroesophageal reflux disease
6,Acute liver failure,Acute liver failure is the appearance of sever...,https://en.wikipedia.org/wiki/Acute_liver_failure,"[All articles with unsourced statements, Artic...",Gastroesophageal reflux disease
7,Adenomyomatosis,Adenomyomatosis is a benign condition characte...,https://en.wikipedia.org/wiki/Adenomyomatosis,"[Gallbladder, Hepatology]",Gastroesophageal reflux disease
8,Adhesion (medicine),Adhesions are fibrous bands that form between ...,https://en.wikipedia.org/wiki/Adhesion_(medicine),"[Abdominal pain, All accuracy disputes, All ar...",Gastroesophageal reflux disease
9,Acute pancreatitis,Acute pancreatitis (AP) is a sudden inflammati...,https://en.wikipedia.org/wiki/Acute_pancreatitis,"[All articles with incomplete citations, All a...",Gastroesophageal reflux disease


In [8]:
import pandas as pd
import re
import spacy
import neuralcoref

# Load spaCy's small English pipeline; entity_pairs and refine_ent below use this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')
# Register neuralcoref on the pipeline; entity_pairs reads `._.coref_resolved` from the parsed doc.
neuralcoref.add_to_pipe(nlp)


def entity_pairs(text, coref=True):
    """Extract subject / relation / object triples from free text.

    The text is normalised, optionally coreference-resolved, and split into
    sentences. Each sentence is re-parsed on its own, its named entities and
    noun chunks are merged into single tokens, and a triple is emitted only
    when the sentence has exactly one object (``obj``/``dobj``) and exactly
    one subject (``subj``/``nsubj``).

    Args:
        text: Raw text to analyse.
        coref: When true, re-parse the coreference-resolved text
            (``doc._.coref_resolved``) before sentence splitting.

    Returns:
        A pandas DataFrame with the columns ``subject``, ``relation``,
        ``object``, ``subject_type`` and ``object_type``. Rows with any empty
        field are dropped.
    """
    text = re.sub(r'\n+', '.', text)  # turn each run of newlines into a period
    text = re.sub(r'\[\d+\]', ' ', text)  # blank out bracketed numbers such as "[12]"
    doc = nlp(text)
    if coref:
        doc = nlp(doc._.coref_resolved)  # resolve coreference clusters
    # `.text` instead of `.string`: identical once stripped, and `.string`
    # is no longer available in spaCy v3.
    sentences = [sent.text.strip() for sent in doc.sents]  # split text into sentences
    ent_pairs = []
    for sentence in sentences:
        sent = nlp(sentence)
        # Collect entity and noun-chunk spans, dropping overlaps, and merge
        # each into a single token so it behaves as one node.
        spans = spacy.util.filter_spans(list(sent.ents) + list(sent.noun_chunks))
        with sent.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        dep = [token.dep_ for token in sent]
        # Only handle sentences with exactly one object and one subject.
        if (dep.count('obj') + dep.count('dobj')) != 1 \
                or (dep.count('subj') + dep.count('nsubj')) != 1:
            continue
        for token in sent:
            if token.dep_ not in ('obj', 'dobj'):  # identify object nodes
                continue
            subjects = [w for w in token.head.lefts
                        if w.dep_ in ('subj', 'nsubj')]  # identify subject nodes
            if not subjects:
                continue
            subject = subjects[0]
            # identify relationship by root dependency
            roots = [w for w in token.ancestors if w.dep_ == 'ROOT']
            if roots:
                root = roots[0]
                relation = root
                # add adposition or particle to relationship; guard the
                # lookup because nbor(1) raises IndexError when the root is
                # the last token of the sentence
                if root.i + 1 < len(sent):
                    following = root.nbor(1)
                    if following.pos_ in ('ADP', 'PART'):
                        relation = ' '.join((str(root), str(following)))
            else:
                relation = 'unknown'
            subject, subject_type = refine_ent(subject, sent)
            obj, object_type = refine_ent(token, sent)
            ent_pairs.append([str(subject), str(relation), str(obj),
                              str(subject_type), str(object_type)])
    # Discard triples in which any field came out empty.
    filtered_ent_pairs = [sublist for sublist in ent_pairs
                          if not any(str(x) == '' for x in sublist)]
    pairs = pd.DataFrame(filtered_ent_pairs, columns=['subject',
                         'relation', 'object', 'subject_type',
                         'object_type'])
    print('Entity pairs extracted:', str(len(filtered_ent_pairs)))
    return pairs


def refine_ent(ent, sent):
    """Clean up an entity token and report its type.

    Args:
        ent: The token to refine (here a merged entity / noun-chunk token).
        sent: The parsed sentence that contains ``ent``; only its length is
            used, to bound the look-ahead from ``ent``.

    Returns:
        A ``(ent, ent_type)`` tuple.

        * When ``ent`` has no entity type, ``ent_type`` is ``'NOUN_CHUNK'``
          and ``ent`` is a string rebuilt from its words, without stop words
          and without the parts of speech listed in ``unwanted_tokens``.
        * When ``ent`` is a single-word ``NOMINAL`` / ``CARDINAL`` /
          ``ORDINAL`` entity, ``ent`` is a string made of the token and the
          tokens following it, up to (not including) the first verb or
          punctuation token, or up to the end of the sentence.
        * Otherwise ``ent`` is returned unchanged.
    """
    unwanted_tokens = (
        'PRON',  # pronouns
        'PART',  # particle
        'DET',  # determiner
        'SCONJ',  # subordinating conjunction
        'PUNCT',  # punctuation
        'SYM',  # symbol
        'X',  # other
        )
    ent_type = ent.ent_type_  # get entity type
    if ent_type == '':
        ent_type = 'NOUN_CHUNK'
        # Re-parse the merged token's text so its individual words can be
        # filtered by part of speech and stop-word status.
        ent = ' '.join(str(t.text) for t in nlp(str(ent))
                       if t.pos_ not in unwanted_tokens and not t.is_stop)
    elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and ' ' not in str(ent):
        pieces = []
        for offset in range(len(sent) - ent.i):
            neighbour = ent.nbor(offset)  # offset 0 is `ent` itself
            if neighbour.pos_ in ('VERB', 'PUNCT'):
                break
            pieces.append(str(neighbour))
        # Assign after the loop so the collected words are kept even when the
        # sentence ends before any verb or punctuation token is reached.
        ent = ' '.join(pieces).strip()
    return ent, ent_type

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [None]:
# Extract entity pairs from the text of the row labelled 0 (the seed topic page in the preview shown earlier).
pairs = entity_pairs(wiki_data.loc[0,'text'])