In [62]:
import requests
from bs4 import BeautifulSoup
import os
import time
from keras.utils import get_file
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax

import subprocess
import mwparserfromhell
import json
import re

In [63]:
index = requests.get('https://dumps.wikimedia.org/enwiki/').text
index

'<html>\r\n<head><title>Index of /enwiki/</title></head>\r\n<body bgcolor="white">\r\n<h1>Index of /enwiki/</h1><hr><pre><a href="../">../</a>\r\n<a href="20181101/">20181101/</a>                                          21-Dec-2018 01:35                   -\r\n<a href="20181120/">20181120/</a>                                          02-Jan-2019 01:27                   -\r\n<a href="20181201/">20181201/</a>                                          21-Jan-2019 01:34                   -\r\n<a href="20181220/">20181220/</a>                                          02-Feb-2019 01:28                   -\r\n<a href="20190101/">20190101/</a>                                          09-Jan-2019 03:53                   -\r\n<a href="20190120/">20190120/</a>                                          25-Jan-2019 04:47                   -\r\n<a href="20190201/">20190201/</a>                                          13-Feb-2019 22:50                   -\r\n<a href="latest/">latest/</a>             

In [64]:
soup_index = BeautifulSoup(index, 'html.parser')
soup_index

<html>
<head><title>Index of /enwiki/</title></head>
<body bgcolor="white">
<h1>Index of /enwiki/</h1><hr/><pre><a href="../">../</a>
<a href="20181101/">20181101/</a>                                          21-Dec-2018 01:35                   -
<a href="20181120/">20181120/</a>                                          02-Jan-2019 01:27                   -
<a href="20181201/">20181201/</a>                                          21-Jan-2019 01:34                   -
<a href="20181220/">20181220/</a>                                          02-Feb-2019 01:28                   -
<a href="20190101/">20190101/</a>                                          09-Jan-2019 03:53                   -
<a href="20190120/">20190120/</a>                                          25-Jan-2019 04:47                   -
<a href="20190201/">20190201/</a>                                          13-Feb-2019 22:50                   -
<a href="latest/">latest/</a>                                            13

In [65]:
dumps = [a['href'] for a in soup_index.find_all('a') 
             if a.has_attr('href') and a.text[:-1].isdigit()]
dumps

['20181101/',
 '20181120/',
 '20181201/',
 '20181220/',
 '20190101/',
 '20190120/',
 '20190201/']

In [66]:
for dump_url in sorted(dumps, reverse=True):
    print(dump_url)
    dump_html = index = requests.get('https://dumps.wikimedia.org/enwiki/' + dump_url).text
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    pages_xml = [a['href'] for a in soup_dump.find_all('a') 
                 if a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]
    if pages_xml:
        break
    time.sleep(0.8)

20190201/


In [67]:
pages_xml

['/enwiki/20190201/enwiki-20190201-pages-articles.xml.bz2']

In [68]:
pages_xml[0]

'/enwiki/20190201/enwiki-20190201-pages-articles.xml.bz2'

In [69]:
wikipedia_dump = pages_xml[0].rsplit('/')[-1]
wikipedia_dump

'enwiki-20190201-pages-articles.xml.bz2'

In [70]:
wikipedia_dump = pages_xml[0].rsplit('/')[-1]
url = url = 'https://dumps.wikimedia.org/' + pages_xml[0] 
path = get_file(wikipedia_dump, url)

In [71]:
url

'https://dumps.wikimedia.org//enwiki/20190201/enwiki-20190201-pages-articles.xml.bz2'

In [72]:
path

'C:\\Users\\GAO\\.keras\\datasets\\enwiki-20190201-pages-articles.xml.bz2'

In [73]:
def process_article(title, text):
    rotten = [(re.findall('\d\d?\d?%', p), re.findall('\d\.\d\/\d+|$', p), p.lower().find('rotten tomatoes')) for p in text.split('\n\n')]
    rating = next(((perc[0], rating[0]) for perc, rating, idx in rotten if len(perc) == 1 and idx > -1), (None, None))
    wikicode = mwparserfromhell.parse(text)
    film = next((template for template in wikicode.filter_templates() 
                 if template.name.strip().lower() == 'infobox film'), None)
    if film:
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                      for param in film.params
                      if param.value.strip_code().strip()
                     }
        links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
        return (title, properties, links) + rating

In [78]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._movies = []
        self._curent_tag = None

    def characters(self, content):
        if self._curent_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text'):
            self._curent_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._curent_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            movie = process_article(**self._values)
            if movie:
                self._movies.append(movie)

In [79]:
import bz2
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)

In [80]:
with bz2.BZ2File(path, "r") as raw_data:
    try:
        for line in raw_data:
            parser.feed(line)
    except KeyboardInterrupt:
        print("Sorry")

Sorry


In [81]:
with open('generated/wp_movies.ndjson', 'wt') as fout:
    for movie in handler._movies:
         fout.write(json.dumps(movie) + '\n')