In [35]:
import mwparserfromhell as mwparser
import gzip
import json

In [36]:
import mwapi
# Open an API session against English Wikipedia; the user_agent string is sent
# with every request made through this session.
session = mwapi.Session("https://en.wikipedia.org", user_agent="<jeffrey.arnold@gmail.com>")

In [37]:
# Ask the query API for the revision content (main slot) of the "Data science" page.
rev = session.get(action="query", titles="Data science", prop='revisions', rvprop='content', rvslots='main')

In [38]:
# Take the first page in the response, then its first revision, and read the
# '*' entry of the main slot (presumably the raw wikitext -- confirm against the API docs).
text = list(rev['query']['pages'].values())[0]['revisions'][0]['slots']['main']['*']

In [39]:
# Parse the page text and list the wikilinks that match "Category:".
parsed = mwparser.parse(text)
list(parsed.ifilter_wikilinks(matches="Category:"))

['[[Category:Information science]]',
 '[[Category:Computer occupations]]',
 '[[Category:Computational fields of study]]',
 '[[Category:Data analysis]]']

In [41]:
import re

def clean_template_name(x):
    """Return a normalised form of ``x.name``.

    The name is converted to ``str``, lower-cased, stripped of surrounding
    whitespace, and every space or hyphen is replaced by an underscore.
    """
    normalized = str(x.name).lower().strip()
    for separator in (" ", "-"):
        normalized = normalized.replace(separator, "_")
    return normalized
    
def is_image(x):
    """Check whether ``x.title`` begins with a file-namespace prefix.

    ``File:``, ``Image:`` and ``Media:`` are recognised case-insensitively at
    the very start of ``str(x.title)``. Returns the regex match object on
    success and ``None`` otherwise, so the result is meant to be used for its
    truthiness.
    """
    title = str(x.title)
    return re.match('^(?:File|Image|Media):', title, flags=re.I)

def get_images(x):
    """Yield each wikilink inside ``x`` that ``is_image`` accepts.

    ``x`` must provide ``ifilter_wikilinks()`` (as parsed wikicode does).

    The previous version tested and yielded the container ``x`` itself on
    every iteration instead of the individual link, so it never returned the
    image links.
    """
    for link in x.ifilter_wikilinks():
        if is_image(link):
            yield link

def is_category(x):
    """Check whether ``x.title`` begins with ``Category:`` (case-insensitive).

    Returns the regex match object when the prefix is present at the start of
    ``str(x.title)`` and ``None`` otherwise.
    """
    title = str(x.title)
    return re.match('^Category:', title, flags=re.I)

def get_categories(x):
    """Yield each wikilink inside ``x`` that ``is_category`` accepts.

    ``x`` must provide ``ifilter_wikilinks()`` (as parsed wikicode does).

    The previous version tested and yielded the container ``x`` itself on
    every iteration instead of the individual link, so it never returned the
    category links.
    """
    for link in x.ifilter_wikilinks():
        if is_category(link):
            yield link
            
def is_ref(x):
    """Return True when ``x`` is a reference, in either tag or template form.

    A node counts as a reference if ``is_tag`` accepts it with the pattern
    ``ref$`` or ``is_template`` accepts it with the pattern ``(ref|note)$``.
    The template check only runs when the tag check fails.
    """
    tag_hit = is_tag(x, "ref$")
    return bool(tag_hit or is_template(x, pattern="(ref|note)$"))

def is_tag(x, pattern=None):
    """Test whether ``x`` is a ``Tag`` node, optionally filtering by tag name.

    Returns ``False`` for non-tag nodes and ``True`` for tag nodes when no
    ``pattern`` is given. With a ``pattern``, returns the result of
    ``re.match`` (case-insensitive, anchored at the start) against
    ``str(x.tag)`` -- a match object or ``None``.
    """
    if not isinstance(x, mwparser.nodes.Tag):
        return False
    if pattern is None:
        return True
    return re.match(pattern, str(x.tag), re.I)

def is_heading(x):
    """Return True when ``x`` is a mwparserfromhell ``Heading`` node."""
    heading_type = mwparser.nodes.Heading
    return isinstance(x, heading_type)

def is_wikilink(x):
    """Return True when ``x`` is a mwparserfromhell ``Wikilink`` node."""
    wikilink_type = mwparser.nodes.Wikilink
    return isinstance(x, wikilink_type)

def is_template(x, pattern=None):
    """Test whether ``x`` is a ``Template`` node, optionally filtering by name.

    Returns ``False`` for non-template nodes and ``True`` for template nodes
    when no ``pattern`` is given. With a ``pattern``, returns the result of
    ``re.match`` (case-insensitive, anchored at the start) against the name
    produced by ``clean_template_name`` -- a match object or ``None``.
    """
    if not isinstance(x, mwparser.nodes.Template):
        return False
    if pattern is None:
        return True
    return re.match(pattern, clean_template_name(x), re.I)

class WikicodeConverter:
    """Convert wiki markup to plain text and record reference/template offsets.

    Parameters
    ----------
    parser : optional
        Object stored on the instance as ``_parser``. When omitted, a new
        ``mwparser.parser.Parser()`` is created for this instance.
        NOTE(review): ``convert`` parses with ``mwparser.parse`` and does not
        consult ``_parser``.
    tags_keep : iterable of str, optional
        Lower-case tag names that are replaced by a ``<tag>`` placeholder.
    tags_remove : iterable of str, optional
        Lower-case tag names that are dropped entirely.
    templates_keep : iterable of str, optional
        Regex fragments (joined with ``|`` and matched from the start of the
        cleaned template name); matching templates become ``{{name}}``.
    templates_remove : iterable of str, optional
        Regex fragments, same matching rule; matching templates are dropped.
    headings_remove : iterable of str, optional
        Lower-case, stripped heading titles whose sections are skipped.
    """

    def __init__(self, parser=None, tags_keep=None, tags_remove=None,
                 templates_keep=None, templates_remove=None, headings_remove=None):
        # Defaults are created per instance. Using ``[]`` / ``Parser()`` in the
        # signature would share one object between every converter.
        self._parser = mwparser.parser.Parser() if parser is None else parser
        self.tags_keep = list(tags_keep) if tags_keep is not None else []
        self.tags_remove = list(tags_remove) if tags_remove is not None else []
        self.templates_keep = list(templates_keep) if templates_keep is not None else []
        self.templates_remove = list(templates_remove) if templates_remove is not None else []
        self.headings_remove = list(headings_remove) if headings_remove is not None else []

    def _strip_tag(self, x):
        """Return the text that replaces tag node ``x``, or None to drop it."""
        tag = str(x.tag).lower().strip()
        if tag in self.tags_keep:
            # ``tag`` is already the plain name; the old ``tag.tag`` raised
            # AttributeError on the string.
            return f"<{tag}>"
        if tag in self.tags_remove:
            return None
        stripped = x.__strip__()
        return None if stripped is None else str(stripped)

    def _strip_template(self, x):
        """Return the text that replaces template node ``x``, or None to drop it."""
        name = clean_template_name(x)
        if (self.templates_keep and
                re.match('|'.join(self.templates_keep), name)):
            return "{{" + name + "}}"
        # An empty list would join to the pattern "" which matches every name,
        # so only consult the removal patterns when there are some.
        if (self.templates_remove and
                re.match("|".join(self.templates_remove), name)):
            return None
        stripped = x.__strip__()
        return None if stripped is None else str(stripped)

    def _span(self, i, text, label=None):
        """Build a ``{'start', 'end', 'label'}`` dict for ``text`` placed at offset ``i``.

        When ``text`` is None the span covers the single character before
        ``i`` (clamped so that it is never empty or negative).
        """
        if text is None:
            start = max(0, i - 1)
            end = max(1, i)
        else:
            start = i
            # ``end`` is an absolute offset like ``start``, not a length.
            end = i + len(text)
        return {'start': start, 'end': end, 'label': label}

    def convert(self, content):
        """Convert Wiki markup to plain text.

        Returns a dict with:

        * ``'text'``: the concatenated plain text,
        * ``'references'``: character offsets in ``'text'`` where a reference
          was replaced by a single space,
        * ``'templates'``: span dicts (see ``_span``) locating each template's
          replacement text, labelled with the cleaned template name.
        """
        wikicode = mwparser.parse(content)
        texts = []
        references = []
        templates = []
        # Running character offset into the output text.
        tok = 0
        sections = wikicode.get_sections(flat=True, include_lead=True, include_headings=True)
        for section in sections:
            # Skip whole sections whose heading is in ``headings_remove``.
            headings = section.filter_headings()
            if headings and str(headings[0].title).strip().lower() in self.headings_remove:
                continue
            for node in section.nodes:
                nodestr = None
                # References must be checked before tags and templates since
                # they come in both forms.
                if is_ref(node):
                    nodestr = " "
                    references.append(tok)
                elif is_tag(node):
                    if str(node.tag).lower() in ("table", "img"):
                        nodestr = None
                    else:
                        nodestr = self._strip_tag(node)
                elif is_template(node):
                    nodestr = self._strip_template(node)
                    # Always leave at least one character so the template has
                    # a non-empty span in the output.
                    if nodestr is None or not len(nodestr):
                        nodestr = " "
                    templates.append(self._span(tok, nodestr, clean_template_name(node)))
                elif is_wikilink(node) and (is_category(node) or is_image(node)):
                    # Category and file links contribute no text. The
                    # wikilink guard keeps the title checks away from nodes
                    # that have no real ``title`` attribute.
                    pass
                elif is_heading(node):
                    nodestr = None
                else:
                    stripped = node.__strip__(normalize=True)
                    # Nodes with no plain-text form return None; skip them
                    # instead of emitting the literal string "None".
                    nodestr = None if stripped is None else str(stripped)
                if nodestr is not None:
                    tok += len(nodestr)
                    texts.append(nodestr)
        return {'text': ''.join(texts), 'references': references, 'templates': templates}


In [42]:
# NOTE(review): `templates_keep` is assigned here but is not passed to the
# converter below, so the converter uses its own default for that option.
templates_keep = []
# Drop image/table tags, infobox/reflist/notelist templates, and the trailing
# housekeeping sections of the article.
converter = WikicodeConverter(tags_remove=["img", "table"],
                              templates_remove=["infobox", "reflist", "notelist"],
                              headings_remove=["see also", "bibliography", "references", "external links"])

In [43]:
# Result is a dict with 'text', 'references' and 'templates' keys.
cleaned = converter.convert(text)

In [51]:
import spacy
# Load the 'en_core_web_sm' pipeline and run it over the cleaned plain text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(cleaned['text'])

In [47]:
from spacy.tokens import Token

# Register a `reference` extension attribute (default False) on tokens;
# force=True overwrites an existing registration so the cell can be re-run.
Token.set_extension('reference', default=False, force=True)
# Work on a copy: the loop below pops offsets as it consumes them, and popping
# from cleaned['references'] directly would leave that list empty for every
# later cell that reads it.
references = list(cleaned['references'])
# Walk the tokens backwards so the last pending offset is always the one to
# match; flag the nearest non-space token starting at or before it.
for tok in reversed(doc):
    if references and not tok.is_space and tok.idx <= references[-1]:
        # Several references can follow the same token; consume them all.
        while references and tok.idx <= references[-1]:
            references.pop()
        tok._.set('reference', True)

In [48]:
# Show every token whose `reference` extension attribute is truthy.
for tok in doc:
    if tok._.get('reference'):
        print(tok, tok._.get('reference'))

. True
. True
, True
, True
" True
. True
" True
. True
. True
. True
, True
. True
" True
. True
. True
. True
. True
) True
, True
. True
, True
" True
method True
" True
, True
Analytics True
. True
. True
. True
Analytics True
. True
. True
” True
advantage True
. True
. True
, True
. True
. True
. True
. True
. True
. True
, True
, True
. True
. True
. True
. True
. True
" True


In [49]:
# For each reference offset, show the last non-space token that starts at or
# before it. spaCy tokens expose `is_space`; `is_whitespace` does not exist.
for i in cleaned['references']:
    preceding = [tok for tok in doc if tok.idx <= i and not tok.is_space]
    # Guard against an offset that precedes every non-space token.
    print(i, preceding[-1] if preceding else None)

In [None]:
# The trailing-whitespace attribute on a spaCy token is `whitespace_`.
# NOTE(review): relies on `tok` left over from an earlier cell.
tok.whitespace_

In [None]:
# Grab the first token of the document for interactive inspection.
tok = doc[0]

In [None]:
# Display the `is_space` flag of the token selected above.
tok.is_space

In [None]:
# Display every token in the document (output grows with document length).
[tok for tok in doc]