In [109]:
# Count lines, words, characters in XML data file
!wc enwiki-20190420-pages-articles-multistream1.xml-p10p30302

 4586335 74563548 642819893 enwiki-20190420-pages-articles-multistream1.xml-p10p30302


In [110]:
from bs4 import BeautifulSoup
import pandas as pd

In [111]:
def get_pages_from_xml_file(filename, start_tag='<page>', end_tag='</page>',
                            encoding='utf-8'):
    """Yields each page from the specified XML data file.

    The file is scanned line by line, so only one page is held in memory
    at a time.  A page starts on a line containing `start_tag` and ends on
    the next line containing `end_tag`; both lines are included in the
    yielded string.

    Args:
        filename: Path of the XML file to read.
        start_tag: Marker that opens a page.
        end_tag: Marker that closes a page.
        encoding: Text encoding used to decode the file.  Defaults to
            UTF-8 instead of the platform-dependent default of `open`.

    Yields:
        str: The raw XML of one page, from its start-tag line through its
        end-tag line.
    """
    page = None  # Lines of the page being collected; None while between pages
    with open(filename, encoding=encoding) as f:
        for line in f:
            if start_tag in line:
                page = [line]
            elif page is not None:
                page.append(line)
            # Checked separately from the branches above so that a page whose
            # start and end tags share one line is still yielded, and so that
            # a closing tag seen outside any page is skipped instead of
            # failing on `None.append`.
            if page is not None and end_tag in line:
                yield ''.join(page)
                page = None

In [112]:
# Wikipedia dump chunk to analyse
filename = 'enwiki-20190420-pages-articles-multistream1.xml-p10p30302'
# Drain the page generator so every page's XML is available as a list
pages = [*get_pages_from_xml_file(filename)]

In [113]:
def get_title_from_page_xml(page_xml):
    """Extracts the string inside the first <title> element of a page's XML."""
    parsed_page = BeautifulSoup(page_xml, 'lxml')
    title_tag = parsed_page.select_one('title')
    return title_tag.get_text()

In [114]:
len(pages)  # Show the number of pages

19822

In [115]:
%time titles = [get_title_from_page_xml(page) for page in pages]

CPU times: user 1min 2s, sys: 3.63 s, total: 1min 5s
Wall time: 1min 8s


In [116]:
titles[:5] + ['...'] + titles[-5:]  # Show the first and last 5 titles

['AccessibleComputing',
 'Anarchism',
 'AfghanistanHistory',
 'AfghanistanGeography',
 'AfghanistanPeople',
 '...',
 'The Lord of the Rings/One Ring',
 'Tax Freedom Day',
 'Tax',
 'Transhumanism',
 'TARDIS']

In [117]:
print(pages[0])  # Print the first page

  <page>
    <title>AccessibleComputing</title>
    <ns>0</ns>
    <id>10</id>
    <redirect title="Computer accessibility" />
    <revision>
      <id>854851586</id>
      <parentid>834079434</parentid>
      <timestamp>2018-08-14T06:47:24Z</timestamp>
      <contributor>
        <username>Godsy</username>
        <id>23257138</id>
      </contributor>
      <comment>remove from category for seeking instructions on rcats</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Computer accessibility]]

{{R from move}}
{{R from CamelCase}}
{{R unprintworthy}}</text>
      <sha1>42l0cvblwtb4nnupxm6wo000d27t6kf</sha1>
    </revision>
  </page>



In [118]:
def get_text_from_page_xml(page):
    """Extracts the string inside the first <text> element of a page's XML."""
    parsed_page = BeautifulSoup(page, 'lxml')
    text_tag = parsed_page.select_one('text')
    return text_tag.get_text()

In [119]:
# Try the text extractor on the first page and show the value and its type.
# (The helper builds its own soup, so no separate parse is needed here.)
page = pages[0]
text = get_text_from_page_xml(page)
text, type(text)

('#REDIRECT [[Computer accessibility]]\n\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}',
 str)

In [120]:
# One record per page, holding its title and its wikitext
data = []
for page in pages:
    data.append(dict(
        title=get_title_from_page_xml(page),
        text=get_text_from_page_xml(page),
    ))

In [121]:
# Turn the list of {'title': ..., 'text': ...} records into a dataframe,
# one row per page
df = pd.DataFrame(data)
# Preview the first rows
df.head()

Unnamed: 0,text,title
0,#REDIRECT [[Computer accessibility]]\n\n{{R fr...,AccessibleComputing
1,{{redirect2|Anarchist|Anarchists|the fictional...,Anarchism
2,#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,AfghanistanHistory
3,#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,AfghanistanGeography
4,#REDIRECT [[Demographics of Afghanistan]]\n\n{...,AfghanistanPeople


### Challenge

#### 1. Create a Pandas dataframe containing the title and text of each page.

* Implement the `get_text_from_page_xml` function above.
* Re-create the dataframe with the text field filled in.

#### 2. Identify the five pages that have the _longest_ text.

* Find the length of each page's `<text>...</text>` element and add it to your dataframe.
* Sort the dataframe by text length in descending order.
* What are the titles of the five longest articles?

In [122]:
# Character count of every page's text, in dataframe row order
text_length = [len(df.text[label]) for label in range(len(df.text))]

In [123]:
# Create the column directly from the text: `.str.len()` gives the number of
# characters in each row's text, so the column holds real lengths from the
# start instead of a placeholder string.
df['text_length'] = df['text'].str.len()

In [124]:
df.columns

Index(['text', 'title', 'text_length'], dtype='object')

In [125]:
# Assign the whole list as the column in one step.  Writing element by
# element through `df.text_length[row] = ...` is chained assignment, which
# may modify a temporary copy and leave `df` unchanged.
df['text_length'] = text_length

In [126]:
# Index labels of the five rows with the greatest text length, longest first
high_values = df['text_length'].sort_values(ascending=False).index[:5]

In [127]:
df.loc[high_values]

Unnamed: 0,text,title,text_length
10165,{{short description|Wikipedia list page}}\n{{s...,List of compositions by Johann Sebastian Bach,432964
15129,{{about|the country}}\n{{pp-semi-indef}}\n{{pp...,Pakistan,376957
15287,{{Redirect|Philippine|the town in the Netherla...,Philippines,357950
9299,{{Short description|Overview of relations}}{{U...,Foreign relations of India,317725
8838,{{short description|Aspect of history}}\n{{pp-...,History of India,316816


In [128]:
# Select the rows whose index labels are in `high_values` into their own frame
longest_entries_df = df.loc[high_values]

In [129]:
# Replace the original row labels with a fresh 0, 1, 2, ... index
# (drop=True discards the old labels instead of keeping them as a column)
longest_entries_df = longest_entries_df.reset_index(drop=True)

In [130]:
longest_entries_df

Unnamed: 0,text,title,text_length
0,{{short description|Wikipedia list page}}\n{{s...,List of compositions by Johann Sebastian Bach,432964
1,{{about|the country}}\n{{pp-semi-indef}}\n{{pp...,Pakistan,376957
2,{{Redirect|Philippine|the town in the Netherla...,Philippines,357950
3,{{Short description|Overview of relations}}{{U...,Foreign relations of India,317725
4,{{short description|Aspect of history}}\n{{pp-...,History of India,316816


# Cleaning up the wiki text

In [131]:
# a bit messy and hard to read
print(longest_entries_df.text[0][:2000])

{{short description|Wikipedia list page}}
{{see also|List of compositions by Johann Sebastian Bach by BWV number}}
{{Very long|date=November 2018}}
[[File:Canon BWV 1076.jpg|thumb|upright=1.21| ''Canon triplex a 6'': first printed in 1747 (below), it appears on both versions of the portrait [[Elias Gottlob Haussmann|Haussmann]] made of Bach (1746, 1748 – above). In the 19th-century [[Bach Gesellschaft]] edition the canon was published in Volume 45<sup>1</sup>, p. 138. In 1950 the piece was assigned the number 1076 in [[Wolfgang Schmieder|Schmieder]]'s catalogue of Bach's works (BWV). The 1998 edition of that catalogue (BWV<sup>2a</sup>) mentions Haussmann's paintings as original sources for the work (p. 438), and likewise the Bach digital website gives a description of both paintings as sources for the piece (linked from Bach digital Work page {{BDW|1262}}).]]
{{Lists of compositions by Johann Sebastian Bach}}
[[Johann Sebastian Bach]] composed [[Bach cantata|cantatas]], [[List of mote

In [132]:
#bring in a parsing library
import wikitextparser as wtp


def remove_templates(wikitext):
    """Returns the given wikitext with its {{...}} template calls deleted.

    Every occurrence of each template string found by the parser is
    replaced with the empty string; all other markup is left untouched.
    """
    # Longest first: an enclosing template is always longer than anything
    # nested inside it, so it is stripped as a whole before its inner
    # templates are looked for, whatever order the parser lists them in.
    template_strings = sorted(
        (str(template) for template in wtp.parse(wikitext).templates),
        key=len,
        reverse=True,
    )
    cleaned = wikitext
    for template_string in template_strings:
        cleaned = cleaned.replace(template_string, "")
    return cleaned


# Clean every row and assign the whole column at once.  Writing through
# `longest_entries_df.text[i] = ...` is chained assignment, which may update
# a temporary copy instead of the dataframe.
longest_entries_df['text'] = longest_entries_df['text'].map(remove_templates)

In [133]:
#a sampling of what it looks like now
#much easier to read
print(longest_entries_df.text[0][:2000])




[[File:Canon BWV 1076.jpg|thumb|upright=1.21| ''Canon triplex a 6'': first printed in 1747 (below), it appears on both versions of the portrait [[Elias Gottlob Haussmann|Haussmann]] made of Bach (1746, 1748 – above). In the 19th-century [[Bach Gesellschaft]] edition the canon was published in Volume 45<sup>1</sup>, p. 138. In 1950 the piece was assigned the number 1076 in [[Wolfgang Schmieder|Schmieder]]'s catalogue of Bach's works (BWV). The 1998 edition of that catalogue (BWV<sup>2a</sup>) mentions Haussmann's paintings as original sources for the work (p. 438), and likewise the Bach digital website gives a description of both paintings as sources for the piece (linked from Bach digital Work page ).]]

[[Johann Sebastian Bach]] composed [[Bach cantata|cantatas]], [[List of motets by Johann Sebastian Bach|motets]], [[Bach's church music in Latin|masses, Magnificats]], [[Passions (Bach)|Passions]], [[List of oratorios by Johann Sebastian Bach|oratorios]], [[List of chorale harmonisa