In [None]:
! pip install requests
! pip install tensorflow==2.0.0-beta1 --user --ignore-installed
! pip install keras
! pip install mwparserfromhell
! pip install ipynb

In [None]:
import requests
from bs4 import BeautifulSoup # parses HTML
from keras.utils import get_file
import os # file system management
import sys
import xml.sax # parses XML
import re
import bz2
import subprocess # processes bz2 compressed files line by line
import mwparserfromhell # parsing Wiki Code
import pandas as pd
import matplotlib.pyplot as plt
import gc
import json
from multiprocessing.dummy import Pool as Threadpool
from itertools import chain # chains List of lists to single list
from multiprocessing import Pool 
import tqdm # tracks progress
from functools import partial # sends keyword arguments in map
from timeit import default_timer as timer
from ipynb.fs.full.functions import find_people
from json import JSONEncoder

Access a dump (snapshot) of the entire Wikipedia

In [None]:
base_url = 'https://dumps.wikimedia.org/enwiki/'
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the dump index.
index_response = requests.get(base_url, timeout = 30)
index_response.raise_for_status()
index = index_response.text

Use BeautifulSoup to create a parse tree that can be used to extract data from HTML, and find the links on the page

In [None]:
soup_index = BeautifulSoup(index, 'html.parser')

# Collect the target of every anchor on the index page that carries an href
dumps = []
for anchor in soup_index.find_all('a'):
    if anchor.has_attr('href'):
        dumps.append(anchor['href'])
dumps

Choose a complete dump and find all available files in the dump

In [None]:
dump_url = base_url + '20190701/'
# Retrieve the html. Fail loudly on an HTTP error (e.g. a dump date that is
# no longer hosted) instead of parsing the error page into an empty file list,
# and never wait forever on a stalled connection.
dump_response = requests.get(dump_url, timeout = 30)
dump_response.raise_for_status()
dump_html = dump_response.text
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')
# Find list elements with the class file
soup_dump.find_all('li', {'class': 'file'})[:3]

Limit to only the files whose name contains 'pages-articles-multistream', which hold only the most recent version of each article

In [None]:
# Each kept entry is (file name, remaining whitespace-separated tokens of the listing line)
files = []

for entry in soup_dump.find_all('li', {'class': 'file'}):
    listing = entry.text
    tokens = listing.split()
    # Only the multistream article dumps are of interest
    if 'pages-articles-multistream' in listing:
        files.append((tokens[0], tokens[1:]))

files[:5]

Further filter for partitioned files with 'xml-p'

In [None]:
# Keep the names of the partitioned dumps (their names contain '.xml-p')
files_to_download = []
for name, _details in files:
    if '.xml-p' in name:
        files_to_download.append(name)
files_to_download[-5:]

Check if file already exists and if not download file

In [None]:
# Display the current working directory of the notebook kernel
os.getcwd()

In [None]:
# Directory in which keras' get_file caches downloads by default
# (~/.keras/datasets). The trailing separator is kept on purpose because
# paths are built by plain string concatenation (`keras_home + file`).
keras_home = os.path.join(os.path.expanduser('~'), '.keras', 'datasets', '')
data_paths = []
file_info = []

# Iterate through each file
for file in files_to_download:
    path = keras_home + file

    # Download the partition only if it is not already cached
    if not os.path.exists(path):
        print('Downloading')
        # get_file returns the location it actually saved to; use that for
        # the size lookup rather than assuming where the file ended up
        path = get_file(file, dump_url + file)

    data_paths.append(path)

    # Find the file size in MB
    file_size = os.stat(path).st_size / 1e6

    # Find the number of articles: names end in '...-p<first>p<last>.bz2'
    file_articles = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])

    # Label every partition the same way (the short 'p<first>p<last>.bz2'
    # suffix) whether it was just downloaded or already on disk
    file_info.append((file.split('-')[-1], file_size, file_articles))

Sort by file size to see the largest files

In [None]:
# Order the partitions by size in MB (tuple position 1), biggest first, and show the top five
by_size_desc = sorted(file_info, key = lambda info: info[1], reverse = True)
by_size_desc[:5]

Find out how many partitions there are in total

In [None]:
# file_info holds one entry per partition file, so its length is the partition count
print(f'There are {len(file_info)} partitions.')

Put this info into a dataframe to plot the file sizes

In [None]:
%matplotlib inline
file_df = pd.DataFrame(file_info, columns = ['file', 'size (MB)', 'articles']).set_index('file')
file_df['size (MB)'].plot.bar(color = 'red', figsize = (12, 6));

Find out the total file size

In [None]:
# The column is in MB, so dividing the sum by 1e3 reports GB
print(f"The total size of files on disk is {file_df['size (MB)'].sum() / 1e3} GB")

Parsing Step 1: Extract info from an article's wikitext

In [None]:
def process_article(title, text, timestamp, template = 'Infobox person'):
    """Extract facts about a person from a Wikipedia article that uses `template`.

    Parameters
    ----------
    title : article title, returned unchanged as the first tuple element.
    text : wikitext of the article.
    timestamp : accepted so the handler's collected values can be passed
        straight through; not used here.
    template : name of the infobox template the article must contain.

    Returns
    -------
    None when the article has no template named `template`, otherwise the tuple
    ``(title, birth_date, spouse, nested, gender, nationality, awards)``:

    * birth_date  - unnamed parameters of the last template matching
      'birth date', or 'unknown' when there is none.
    * spouse      - one parameter list per marriage/married template; when the
      article has none, the infobox 'spouse' field followed by its
      parenthesised parts.
    * nested      - 'nested' if `spouse` came from marriage templates,
      'not_nested' if it came from the infobox field.
    * gender      - 'female' if an actresses category is present, else 'male'.
    * nationality - captured from the 21st-century acting categories, or
      'unknown' if any of them does not fit the expected pattern.
    * awards      - first parameter of each awards template plus every
      category containing 'winner'.
    """

    def _name(node):
        # Normalised template name, compared the same way everywhere
        return node.name.strip_code().strip().lower()

    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)

    # `matches` is applied to the whole template text, so confirm by name
    infoboxes = [x for x in wikicode.filter_templates(matches = template)
                 if _name(x) == template.lower()]
    if not infoboxes:
        return None

    # Extract unnested information from infobox (empty values are dropped)
    properties = {}
    for param in infoboxes[0].params:
        value = param.value.strip_code().strip()
        if value:
            properties[param.name.strip_code().strip()] = value

    # Extract Birth Date: keep positional parameters, skip named ones
    birth_templates = wikicode.filter_templates(matches = 'birth date')
    if birth_templates:
        birth_date = [str(x) for x in birth_templates[-1].params if '=' not in x]
    else:
        birth_date = 'unknown'

    # Extract spouse. An article without marriage templates yields an empty
    # list rather than an exception, so test for emptiness explicitly to
    # reach the infobox fallback.
    marriage_templates = (wikicode.filter_templates(matches = 'marriage')
                          + wikicode.filter_templates(matches = 'married'))
    marriages = [x.params for x in marriage_templates
                 if _name(x) in ('marriage', 'married')]
    if marriages:
        spouse = [[str(y) for y in params] for params in marriages]
        nested = 'nested'
    else:
        infobox_spouse = str(properties.get('spouse'))
        spouse = [infobox_spouse] + re.findall(r'\((.*?)\)', infobox_spouse)
        nested = 'not_nested'

    # Extract internal wikilinks - filter for categories they belong to
    link_titles = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
    wikilinks = [str(x) for x in link_titles if 'Category:' in x]
    label = [x for x in wikilinks if '21st-century' in x and 'act' in x]

    # Extract gender and nationality
    if 'actresses' in str(label):
        gender = 'female'
        category_pattern = 'Category:21st-century (.*) actresses'
    else:
        gender = 'male'
        category_pattern = 'Category:21st-century (.*) male actors'
    try:
        nationality = [re.match(category_pattern, x).group(1) for x in label]
    except AttributeError:
        # re.match returned None: a category did not fit the pattern
        nationality = 'unknown'

    # Extract nested and unnested awards; an awards template without
    # parameters has nothing to contribute and must not raise
    awards = [str(x.params[0]) for x in wikicode.filter_templates(matches = 'awards')
              if _name(x) == 'awards' and x.params]
    awards += [str(x) for x in wikilinks if 'winner' in x]

    return (title, birth_date, spouse, nested, gender, nationality, awards)

Parsing Step 2: Set up an XML handler to parse the dump using SAX

In [None]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX handler that collects people from the pages of a Wikipedia XML dump.

    For every page the text of its <title>, <timestamp> and <text> elements is
    recorded. When the page closes and its article text mentions a
    '21st-century ... actors/actresses' phrase, the recorded values are handed
    to `process_article`; truthy results are appended to `_people`.
    """

    # Page pre-filter, compiled once: '21st-century', then anything, then
    # whitespace followed by 'actors' or 'actresses'.
    _ACTOR_PATTERN = re.compile(
        r'21st\-century\s+(?:(?!\s+actors")(?:.|\n))*\s+(?:actors|actresses)')

    def __init__(self):
        super().__init__()
        self._buffer = None        # chunks of the element currently being captured
        self._values = {}          # latest title / text / timestamp seen
        self._current_tag = None   # name of the element being captured, if any
        self._people = []          # results returned by process_article
        self._article_count = 0    # pages that passed the pre-filter
        self._non_matches = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            # SAX may deliver one run of text in several chunks (for example
            # around entities such as &amp;), so concatenate them without a
            # separator to restore the original text.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so text of unrelated elements is not collected
            self._current_tag = None
            self._buffer = None

        if name == 'page' and self._ACTOR_PATTERN.search(self._values.get('text', '')):
            self._article_count += 1
            # Search through the page to see if the page is a person
            people = process_article(**self._values, template = 'Infobox person')
            # Append to the list of people
            if people:
                self._people.append(people)

Parsing Step 3: Set up parser to decompress one line from compressed file at a time, pass to XML Handler to process and write to json

In [None]:
def find_people(data_path, limit = None, save = True,
                partition_dir = 'C:\\Users\\efan\\wiki\\partitions\\'):
    """Find all the people in one bz2-compressed Wikipedia XML dump partition.

    Parameters
    ----------
    data_path : path of the compressed partition file.
    limit : optional int. As soon as at least `limit` people have been
        collected, the list gathered so far is returned and nothing is saved.
    save : if True, the people are written to `<partition_dir>/<name>.ndjson`
        (one JSON document per line), where <name> is taken from the
        partition's file name.
    partition_dir : output directory; created if it does not exist.

    Returns
    -------
    The list of people when `limit` was reached, otherwise None.
    """

    # Object for handling xml
    handler = WikiXmlHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    with open(data_path, 'rb') as compressed:
        # Decompress through the external bzcat tool when it is installed and
        # fall back to the standard-library bz2 module otherwise
        try:
            bzcat = subprocess.Popen(['bzcat'],
                                     stdin = compressed,
                                     stdout = subprocess.PIPE)
            lines = bzcat.stdout
        except FileNotFoundError:
            bzcat = None
            lines = bz2.open(compressed, 'rb')

        try:
            # Feed the decompressed dump to the parser one line at a time
            for line in lines:
                try:
                    parser.feed(line)
                except StopIteration:
                    break

                # Optional limit
                if limit is not None and len(handler._people) >= limit:
                    return handler._people
        finally:
            # Release the pipe/stream and reap the child process, including
            # on the early return above and when parsing raises
            lines.close()
            if bzcat is not None:
                if bzcat.poll() is None:
                    bzcat.kill()
                bzcat.wait()

    if save:
        os.makedirs(partition_dir, exist_ok = True)
        # Create file name based on partition name
        p_str = data_path.split('-')[-1].split('.')[-2]
        out_dir = os.path.join(partition_dir, f'{p_str}.ndjson')

        # Open the file
        with open(out_dir, 'w') as fout:
            # Write as json
            for person in handler._people:
                fout.write(json.dumps(person) + '\n')

        print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')

    # Memory management
    del handler
    del parser
    gc.collect()
    return None

In [None]:
# Full path of every downloaded partition file found in the keras cache directory
partitions = []
for cached_name in os.listdir(keras_home):
    if 'xml-p' in cached_name:
        partitions.append(keras_home + cached_name)
len(partitions), partitions[-1]

Run a CPU Count

In [None]:
# Number of CPUs reported by the system (None if it cannot be determined)
os.cpu_count()

Process all the partitions with a progress bar

In [None]:
# Create a pool of workers to execute processes
pool = Pool(processes = 4)

start = timer()
results = []

# Map (service, tasks), applies function to each partition
for x in tqdm.tqdm_notebook(pool.imap_unordered(find_people, partitions), total = len(partitions)):
    results.append(x)

pool.close()
pool.join()

end = timer()
print(f'{end - start} seconds elapsed.')