In [8]:
import os
import time
import concurrent.futures
from collections import Counter
import re
from bs4 import BeautifulSoup

# Analyzing Wikipedia Pages


In this guided project, we'll be analyzing 54 megabytes worth of articles to figure out patterns in the Wikipedia writing and content presentation style. The articles were scraped by hitting random pages on Wikipedia, then downloading the contents using the requests package. The scraping code is in this folder, in the scrape_random.py file

Our main goals will be to:

- Extract only the text from the Wikipedia pages, and remove all HTML and Javascript markup.
- Remove common page headers and footers from the Wikipedia pages.
- Figure out what tags are the most common in Wikipedia pages.
- Figure out patterns in the text.

In [9]:
wiki_list = ['wiki/'+el for el in os.listdir('wiki')]
print(len(wiki_list))

with open('wiki/Yarkant_County.html', encoding='UTF-8') as f:
    for line in f:
        print(line)


999
<!DOCTYPE html>

<html class="client-nojs" lang="en" dir="ltr">

<head>

<meta charset="UTF-8"/>

<title>Yarkant County - Wikipedia</title>

<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>

<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Yarkant_County","wgTitle":"Yarkant County","wgCurRevisionId":757597549,"wgRevisionId":757597549,"wgArticleId":879098,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages containing links to subscription-only content","Articles with Chinese-language external links","CS1 maint: Unrecognized language","Pages using ISBN magic links","Articles containing simplified Chinese-language text","Articles containing Uyghur-language text","Coordinates on Wikidata","Pages using infobox set

Now that we know the file structure, and the structure of a single file, we can read in all of the files. This will get us started in our explorations.

In [10]:
def full_content_list(file): #should return the content of a file
    with open(file) as f:
        return f.read()

start = time.time()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=2)
content = pool.map(full_content_list, wiki_list)
content = list(content)

print(time.time() - start)
# print(content[1])

articles = [el.strip('wiki/''.html') for el in wiki_list]
print(articles[1]) #We've created a list that contains the article names


0.3208339214324951
Valentin_Yanin


We ran a series of experiments and found out that in this case threads with 2 workers give better speed in comparison with processes and threads with more than 2 workers. Approximately 0.2 seconds versus 0.5 seconds and 0.3 seconds, respectively.


Now that we've read in the data files, we can remove the extraneous markup that's outside the div#content tag that most of the content seems to be inside. We can use the BeautifulSoup package for this. BeautifulSoup enables us to extract all of the content inside a specific tag.

Using the BeautifulSoup package, we'll parse each wiki article, then extract the div with id content and everything inside it.

In [11]:
def content_list(file):
    soup = BeautifulSoup(file, 'html.parser')
    return str(soup.find_all("div", id="content")[0])

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
parsed = pool.map(content_list, content)
parsed = list(parsed)

print(time.time() - start)
print(parsed[1])

27.389646291732788
<div class="mw-body" id="content" role="main">
<a id="top"></a>
<div id="siteNotice"><!-- CentralNotice --></div>
<div class="mw-indicators">
</div>
<h1 class="firstHeading" id="firstHeading" lang="en">Valentin Yanin</h1>
<div class="mw-body-content" id="bodyContent">
<div id="siteSub">From Wikipedia, the free encyclopedia</div>
<div id="contentSub"></div>
<div class="mw-jump" id="jump-to-nav">
					Jump to:					<a href="#mw-head">navigation</a>, 					<a href="#p-search">search</a>
</div>
<div class="mw-content-ltr" dir="ltr" id="mw-content-text" lang="en"><p><b>Valentin Lavrentievich Yanin</b> (<a href="/wiki/Russian_language" title="Russian language">Russian</a>: <span lang="ru" xml:lang="ru">Валентин Лаврентьевич Янин</span>, born 6 February 1929 in <a href="/wiki/Kirov,_Kirov_Oblast" title="Kirov, Kirov Oblast">Vyatka</a>) is a leading <a href="/wiki/Russia" title="Russia">Russian</a> historian who has authored 700 books and articles. He has also edited a number 


This operation is quite slow and requires significant CPU resources. It looks like using as many available processors as possible doesn't speed things up very much.




In [12]:


def count_tags(html):
    soup = BeautifulSoup(html, 'html.parser')
    tags = {}
    for tag in soup.find_all():
        tags[tag.name] = tags.get(tag.name, 0) + 1
    return tags

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
tags_list = pool.map(count_tags, parsed)
tags_list = list(tags_list)

tags = {}
for dict_ in tags_list:
    for key in dict_:
        tags[key] = tags.get(key, 0) + dict_[key]

print(time.time() - start)
print(tags)


13.833491802215576
{'b': 14455, 'sup': 11157, 'sub': 151, 'blockquote': 58, 'a': 161065, 'li': 85779, 'tr': 27300, 'img': 6701, 'br': 4986, 'wbr': 85, 'h6': 1, 'p': 7998, 'ol': 858, 'mstyle': 2, 'hr': 51, 'div': 28581, 'math': 2, 'annotation': 2, 'big': 75, 'caption': 200, 's': 10, 'semantics': 2, 'dd': 1376, 'del': 2, 'table': 4010, 'samp': 2, 'rb': 16, 'span': 67350, 'dt': 334, 'ul': 10972, 'h4': 117, 'noscript': 999, 'q': 76, 'small': 3272, 'mrow': 2, 'dl': 457, 'h1': 999, 'center': 64, 'u': 51, 'pre': 1, 'audio': 2, 'map': 2, 'rp': 32, 'source': 2, 'th': 14472, 'ruby': 16, 'cite': 3563, 'i': 18246, 'bdi': 4, 'mo': 2, 'h2': 4045, 'area': 39, 'font': 40, 'h3': 777, 'abbr': 3665, 'h5': 4, 'td': 57673, 'code': 108, 'rt': 16, 'strong': 599}


We ran a series of experiments and found out that in this case processes with 2 workers give better speed in comparison with threads. Approximately 13 seconds versus 27 seconds respectively.



For now, let's find the most frequently used words in all files. Only words longer than 5 characters will be counted.

In [None]:

def word_counter(html): 
    soup = BeautifulSoup(html, 'html.parser')
    words = {}
    text = soup.get_text()
    text = text.lower()
    text = re.findall("\w{5,}", text)
#     text = text.split(" ")
#     text = [w for w in text if len(w) >= 5]
    words = {}
    for word in text:
        words[word] = words.get(word, 0) + 1
    return words

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
words_list = pool.map(word_counter, parsed)
words_list = list(words_list)

words = {}
for dict_ in words_list:
    for key in dict_:
        words[key] = words.get(key, 0) + dict_[key]

print(time.time() - start)
words

14.15831470489502


{'kingdombuildings': 1,
 '67461': 1,
 '757428306': 1,
 'empireknights': 1,
 'motto': 13,
 '761408194': 1,
 'пушкина': 1,
 '768381931': 1,
 'additionally': 9,
 'carbonell': 1,
 'optioned': 2,
 'pietro': 7,
 'tripiṭaka': 2,
 'avogadro': 1,
 'busca': 1,
 'nubicus': 1,
 'tuždimъ': 1,
 'reappears': 2,
 'intequam': 1,
 'reviewer': 4,
 'gulbenkian': 1,
 'taungbyu': 1,
 'bentley': 8,
 'planners': 1,
 'talesh': 2,
 'wetpaint': 1,
 'curtain': 2,
 'stovsky': 1,
 'zhejiang': 11,
 'kriss': 1,
 'rencontre': 1,
 'pratincola': 1,
 'posidonia': 3,
 '2015pages': 2,
 'playerss': 4,
 'yeldon': 1,
 'okehampton': 1,
 'crandon': 1,
 'hardly': 3,
 'nargestan': 1,
 'skinnigrove': 1,
 'korchvandan': 1,
 'shipley': 1,
 'gurudarshan': 1,
 'aggression': 3,
 'jharsuguda': 2,
 'renera': 1,
 'saskia': 1,
 'valdenuño': 1,
 'expectations': 9,
 '42028': 2,
 'poses': 1,
 'maisons': 2,
 'reviewed': 5,
 'n81009469': 1,
 'северная': 1,
 'strassburg': 1,
 'amazonaws': 1,
 'ottavio': 1,
 'finland1993': 1,
 'bærum': 1,
 'fadak

Only selecting the top 10 words from each article speeds up performance quite a bit.


For now, let's answer the question: what articles are most commonly linked to from our articles?
Only articles name longer than 5 characters will be counted.

In [34]:

def count_tags(html):
    soup = BeautifulSoup(html, 'html.parser')
    articles = {}
    for tag in soup.find_all('a'):
        if len(tag.text) > 5:
            articles[tag.text] = articles.get(tag.text, 0) + 1
    return articles

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
links_list = pool.map(count_tags, parsed)
links_list = list(links_list)

links = {}
for dict_ in links_list:
    for key in dict_:
        links[key] = links.get(key, 0) + dict_[key]

print(time.time() - start)

14.515275478363037


We ran a series of experiments and found out that in this case processes with 2 workers give better speed in comparison with threads. Approximately 13 seconds versus 27 seconds respectively.

In [35]:
links_dct = links.items() 
sorted(links_dct, key=lambda i: i[1], reverse=True)[:100]

[('navigation', 1000),
 ('Categories', 999),
 ('search', 999),
 ('expanding it', 477),
 ('All stub articles', 407),
 ('the original', 251),
 ('Coordinates', 229),
 ('Learn how and when to remove this template message', 221),
 ('Coordinates on Wikidata', 200),
 ('Living people', 159),
 ('United States', 119),
 ('4 References', 114),
 ('improve this article', 111),
 ('adding citations to reliable sources', 102),
 ('citation needed', 98),
 ('3 References', 96),
 ('Archived', 94),
 ('Authority control', 91),
 ('4 External links', 88),
 ('West German', 84),
 ('Wikipedia articles with VIAF identifiers', 84),
 ('WorldCat Identities', 84),
 ('5 References', 74),
 ('Time zone', 74),
 ('verification', 72),
 ("Articles with 'species' microformats", 71),
 ('Scientific classification', 69),
 ('Articles with hCards', 66),
 ('Internet Movie Database', 64),
 ('Pages using deprecated coordinates format', 63),
 ('Labour', 62),
 ('All articles with unsourced statements', 61),
 ('Liberal Democrat', 58),
 