In [1]:
#!pip install beautifulsoup4 lxml
#!wget https://dumps.wikimedia.org/enwiki/20190420/enwiki-20190420-pages-articles-multistream1.xml-p10p30302.bz2
#!bunzip2 enwiki-20190420-pages-articles-multistream1.xml-p10p30302.bz2    

In [2]:
# Count lines, words, and bytes in the XML data file
# (`wc` prints the three counts in that order, followed by the file name).
!wc enwiki-20190420-pages-articles-multistream1.xml-p10p30302

 4586335 74563548 642819893 enwiki-20190420-pages-articles-multistream1.xml-p10p30302


In [3]:
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
def get_pages_from_xml_file(filename, start_tag='<page>', end_tag='</page>'):
    """Yields each page from the specified XML data file.

    The file is scanned line by line, so the whole dump is never held in
    memory by this function. A page begins on a line containing `start_tag`
    and ends on the next line containing `end_tag` (which may be the same
    line); both boundary lines are included in the yielded string.

    Args:
        filename: Path to the (decompressed) XML file.
        start_tag: Substring marking the first line of a page.
        end_tag: Substring marking the last line of a page.

    Yields:
        str: The raw lines of one page, joined together, from the start-tag
        line through the end-tag line inclusive.

    Notes:
        * Lines outside a page (including a stray `end_tag` seen before any
          `start_tag`) are ignored.
        * A new `start_tag` seen while a page is still open discards the
          unfinished page and starts over.
        * A page left unterminated at end of file is not yielded.
    """
    page = None
    # Wikipedia dumps are UTF-8; do not rely on the platform default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            if start_tag in line:
                page = [line]
            elif page is not None:
                page.append(line)
            else:
                # Not inside a page: skip headers, footers and stray end tags.
                continue
            # Checked for every line that belongs to a page, so a page whose
            # start and end tags share one line is still emitted.
            if end_tag in line:
                yield ''.join(page)
                page = None

In [5]:
# Name of the decompressed dump file read by this notebook.
filename = 'enwiki-20190420-pages-articles-multistream1.xml-p10p30302'
# Exhaust the page generator so every page's raw XML is held in one list.
pages = [page_xml for page_xml in get_pages_from_xml_file(filename)]

In [6]:
def get_title_from_page_xml(page_xml):
    """Extract the title from the XML of a single page.

    The XML string is parsed with BeautifulSoup's 'lxml' parser and the text
    content of the first element matching the `title` selector is returned.
    """
    title_element = BeautifulSoup(page_xml, 'lxml').select_one('title')
    return title_element.text

In [7]:
len(pages)  # Show the number of pages extracted from the dump file

19822

In [8]:
# Extract the title of every page; `%time` reports how long the full pass takes.
%time titles = [get_title_from_page_xml(page) for page in pages]

CPU times: user 58.1 s, sys: 1.69 s, total: 59.8 s
Wall time: 1min 1s


In [9]:
[*titles[:5], '...', *titles[-5:]]  # Preview: first five titles, a marker, last five titles

['AccessibleComputing',
 'Anarchism',
 'AfghanistanHistory',
 'AfghanistanGeography',
 'AfghanistanPeople',
 '...',
 'The Lord of the Rings/One Ring',
 'Tax Freedom Day',
 'Tax',
 'Transhumanism',
 'TARDIS']

In [10]:
print(pages[0])  # Print the raw XML of the first page

  <page>
    <title>AccessibleComputing</title>
    <ns>0</ns>
    <id>10</id>
    <redirect title="Computer accessibility" />
    <revision>
      <id>854851586</id>
      <parentid>834079434</parentid>
      <timestamp>2018-08-14T06:47:24Z</timestamp>
      <contributor>
        <username>Godsy</username>
        <id>23257138</id>
      </contributor>
      <comment>remove from category for seeking instructions on rcats</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Computer accessibility]]

{{R from move}}
{{R from CamelCase}}
{{R unprintworthy}}</text>
      <sha1>42l0cvblwtb4nnupxm6wo000d27t6kf</sha1>
    </revision>
  </page>



In [11]:
def get_text_from_page_xml(page):
    """Extract the body text from the XML of a single page.

    The XML string is parsed with BeautifulSoup's 'lxml' parser and the text
    content of the first element matching the `text` selector is returned.
    """
    text_element = BeautifulSoup(page, 'lxml').select_one('text')
    return text_element.text

In [12]:
# Sanity-check the text extraction on the first page.
page = pages[0]
# `soup` is not used in this cell; a later cell inspects it.
soup = BeautifulSoup(page, 'lxml')
text = get_text_from_page_xml(page)
# Display the extracted text together with its Python type.
text, type(text)

('#REDIRECT [[Computer accessibility]]\n\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}',
 str)

In [13]:
# Collect one {'title', 'text'} record per page; the list is reset on every
# run of this cell, so re-running it does not duplicate records.
data = []
for page in pages:
    row = dict(
        title=get_title_from_page_xml(page),
        text=get_text_from_page_xml(page),
    )
    data.append(row)

In [30]:
# Show the top-level children of the parsed document. The 'lxml' parser wraps
# the page fragment in <html><body>…</body></html>, so the list holds that one tag.
soup.contents

[<html><body><page>
 <title>AccessibleComputing</title>
 <ns>0</ns>
 <id>10</id>
 <redirect title="Computer accessibility"></redirect>
 <revision>
 <id>854851586</id>
 <parentid>834079434</parentid>
 <timestamp>2018-08-14T06:47:24Z</timestamp>
 <contributor>
 <username>Godsy</username>
 <id>23257138</id>
 </contributor>
 <comment>remove from category for seeking instructions on rcats</comment>
 <model>wikitext</model>
 <format>text/x-wiki</format>
 <text xml:space="preserve">#REDIRECT [[Computer accessibility]]
 
 {{R from move}}
 {{R from CamelCase}}
 {{R unprintworthy}}</text>
 <sha1>42l0cvblwtb4nnupxm6wo000d27t6kf</sha1>
 </revision>
 </page>
 </body></html>]

In [16]:
# Turn the list of per-page records into a dataframe and preview its first rows.
df = pd.DataFrame(data=data)
df.head(5)

Unnamed: 0,text,title
0,#REDIRECT [[Computer accessibility]]\n\n{{R fr...,AccessibleComputing
1,{{redirect2|Anarchist|Anarchists|the fictional...,Anarchism
2,#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,AfghanistanHistory
3,#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,AfghanistanGeography
4,#REDIRECT [[Demographics of Afghanistan]]\n\n{...,AfghanistanPeople


### Challenge

#### 1. Create a Pandas dataframe containing the title and text of each page.

* Implement the `get_text_from_page_xml` function above.
* Re-create the dataframe with the text field filled in.

#### 2. Identify the five pages that have the _longest_ text.

* Find the length of each page's `<text>...</text>` element and add it to your dataframe.
* Sort the dataframe by text length, descending.
* What are the titles of the five longest articles?

In [31]:
!pip install wikitextparser

Collecting wikitextparser
[?25l  Downloading https://files.pythonhosted.org/packages/02/e0/2061fa2c41f925e36b5dbb4130e7d425671bde40e967367cdcbec875f1fb/wikitextparser-0.24.3-py3-none-any.whl (87kB)
[K    100% |████████████████████████████████| 92kB 4.2MB/s ta 0:00:011
Collecting regex (from wikitextparser)
[?25l  Downloading https://files.pythonhosted.org/packages/11/d9/e37129676d508adf833fb3e3c3fbcb4e5a10183cf45b6c7edbaa57b4a1f2/regex-2019.04.14.tar.gz (644kB)
[K    100% |████████████████████████████████| 645kB 5.2MB/s ta 0:00:011
[?25hBuilding wheels for collected packages: regex
  Building wheel for regex (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/jacobcrabb/Library/Caches/pip/wheels/ae/35/86/47caa8baa5e9340dcb02a719f64a7091900e28af7368d35731
Successfully built regex
Installing collected packages: regex, wikitextparser
Successfully installed regex-2019.4.14 wikitextparser-0.24.3
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You sho

In [79]:
import wikitextparser as wtp

In [80]:
# Pick the page to parse explicitly rather than relying on a `row` value left
# over from another cell (after the record-building loop, `row` is a dict).
# The last page in the dataframe is the 'TARDIS' article.
row = df.index[-1]
parsed = wtp.parse(df.loc[row, 'text'])
# List every template (`{{...}}`) found in that page's wikitext.
parsed.templates

[Template('{{short description|Fictional time-travelling device}}'),
 Template('{{Other uses}}'),
 Template('{{Use British English|date=November 2012}}'),
 Template('{{Use dmy dates|date=November 2012}}'),
 Template('{{Infobox fictional artifact\n| name            = TARDIS\n| image           = Tardis BBC Television Center.jpg\n| caption         = TARDIS prop used between 2010 and 2017.\n| source          = [[Doctor Who]]\n| source_type     = TV\n| company         = [[BBC]]\n| first           = [[An Unearthly Child]]\n| date            = 1963\n| creator         = {{unbulleted list|[[Sydney Newman]]|[[C. E. Webber]]|[[Donald Wilson (writer and producer)|Donald Wilson]]}}\n| episode_creator = [[Anthony Coburn]]\n| genre           = [[Science fiction]]\n| type            = [[Time travel|Time machine]]/[[spacecraft]]\n| uses            = Travels through time and space\n| traits          = Can change its outer dimensions and inner layout, impregnable, telepathic \n| affiliation     = [[Time 

In [81]:

# Strip every template (`{{...}}`) from the wikitext of the page at index 1.
# The templates are parsed from that same page, so the result does not depend
# on which page an earlier cell happened to parse.
article_index = 1
article_text = df.loc[article_index, 'text']
for template in wtp.parse(article_text).templates:
    # Nested templates disappear together with their parent; replacing an
    # already-removed string is then a harmless no-op.
    article_text = article_text.replace(str(template), "")
# Write back with a single .loc assignment instead of chained indexing
# (`df.text[1] = ...`), which pandas does not guarantee will update `df`.
df.loc[article_index, 'text'] = article_text

In [82]:
# Display the text stored under index label 1 (after the template-stripping cell above).
df.text[1]

'\n\n\n\n\n\n\'\'\'Anarchism\'\'\' is an [[anti-authoritarian]] [[political philosophy]] that advocates [[Self-governance|self-governed]] societies based on voluntary, [[cooperative]] institutions and the rejection of [[Hierarchy|hierarchies]] those societies view as unjust. These institutions are often described as [[Stateless society|stateless societies]], although several authors have defined them more specifically as distinct institutions based on non-hierarchical or [[Free association (communism and anarchism)|free associations]]. Anarchism holds the [[Sovereign state|state]] to be undesirable, unnecessary and harmful.\n\nAnarchism is often considered a [[Far-left politics|far-left]]   ideology and much of its [[Anarchist economics|economics]] and [[Anarchist law|legal philosophy]] reflect [[Libertarian socialism|anti-authoritarian interpretations]] of [[Anarcho-communism|communism]], [[Collectivist anarchism|collectivism]], [[Anarcho-syndicalism|syndicalism]], [[Mutualism (econom

In [62]:
# Summarise the dataframe: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19822 entries, 0 to 19821
Data columns (total 2 columns):
text     19822 non-null object
title    19822 non-null object
dtypes: object(2)
memory usage: 309.8+ KB
