In [57]:
import tqdm
import pandas as pd
from lxml import etree
import resource

import mwapi
import mwtypes
import requests

In [58]:
XML_FILE = "xml_data/sample.xml"
CSV_OUTPUT = "csv_data/sample-output.csv"

In [59]:
def strip_tag_name(t):
    t = t.tag
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

def score2sum(score_doc, weights):
    weighted_sum = 0
    for cl, proba in score_doc['probability'].items():
        weighted_sum += weights[cl] * proba
    return weighted_sum

def fetch_wp10_score(rev_id):
    response = requests.get('https://ores.wikimedia.org/v3/scores/enwiki/{0}/wp10'.format(rev_id))
    return response.json()['enwiki']['scores'][str(rev_id)]['wp10']['score']

In [107]:
class PageParser(object):
    def __init__(self):
        self.isPage = False
        self.pages = []
        self.current_page = {}
        self.ores_weights = {'Stub': 1, 'Start': 2, 'C': 3, 'B': 4, 'GA': 5, 'FA': 6}
        self.session = mwapi.Session("https://en.wikipedia.org")
        self.rvlimit = 100
    
    def handle(self, event, element):
        stipped_elem = strip_tag_name(element)
        if event == "start" and stipped_elem == "page":
            self.isPage = True
        elif event == "end" and stipped_elem == "page":
            self.isPage = False
            self.finalize_page()
        
        if event == "end" and self.isPage:
            self.parse_page_element(element)
    
    def parse_page_element(self, element):
        stripped_tag = strip_tag_name(element)
        if element.text:
            texts = [element.text.strip()] + [child.tail.strip() for child in element if child.tail]
            self.current_page[stripped_tag] = " ".join(texts)

    def get_revision(self):
        for response_doc in self.session.get(action='query', prop='revisions',
                                             titles=self.current_page["title"],
                                             rvprop=['ids', 'timestamp'],
                                             rvlimit=self.rvlimit, rvdir="older", 
                                             formatversion=2, continuation=True):
            rev_docs = pd.DataFrame(response_doc['query']['pages'][0]['revisions'])
            rev = rev_docs[rev_docs.timestamp == self.current_page["timestamp"]]
            if len(rev) == 1:
                rev_id = rev["revid"].values[0]
                return score2sum(fetch_wp10_score(rev_id), self.ores_weights)
        return None
    
    def finalize_page(self):
        self.current_page["ORES"] = self.get_revision()
        self.pages.append(self.current_page)
        self.current_page = {}

In [108]:
page_parser = PageParser()

Sending requests with default User-Agent.  Set 'user_agent' on mwapi.Session to quiet this message.


In [None]:
for event, elem in tqdm.tqdm_notebook(etree.iterparse(XML_FILE, events=('start', 'end'))):
    page_parser.handle(event, elem)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [9]:
df.to_csv(CSV_OUTPUT)