# Python Web Scraping
In this lab, you are to continue to build on the Dr. Who popularity solution.  What remains
is to evaluate the popularity of each Dr. Who actor by
using the page views of the actor’s Wikipedia page as a proxy for their popularity.

## Using the Names + BeautifulSoup to Get the Stats
Using the exact same principles used to collect the list of Dr. Who actors,
we now need to collect the 30-day page view stat for each actor.

The pseudocode for this activity is roughly as follows:

1. Explore the HTML underlying an example Wikipedia stats page:
https://en.wikipedia.org/w/index.php?title=Jodie_Whittaker&action=info
Look (**hard**) for a pattern that will allow you to capture the Page views in the past 30 days.
It turns out there is a perfect pattern you should be able to exploit.
2. For each actor, combine the actor name with the Wikipedia URL string as a parameter
 - Fetch the stats web page by GET(ting) the URL just constructed
 - Parse the returned HTML using Beautiful Soup
 - Find the stats using your previously observed exploitable pattern
 - remove any noise from the stats string number
 - convert stats string to integer via int()
 - track the actor’s stat using a list or dictionary
3. Sort the actor stats in descending order
4. print the top 5

Have a beer – you deserve it!

In [1]:
from requests.exceptions import HTTPError
import requests
from bs4 import BeautifulSoup
import re

# Entertainment Weekly page that who_actors() scrapes for the Dr. Who actor
# names (taken from the title attribute of its slide images).
EW_URL = 'http://ew.com/tv/doctor-who-actors/'

def simple_get(url, *args, **kwargs):
    """
    Issue an HTTP GET request for `url` and return the response object.

    Any extra positional and keyword arguments are forwarded unchanged to
    `requests.get` (for example `params=` or `timeout=`).

    Returns:
        The `requests.Response` for the request. Callers read `.text` and
        `.headers` from it themselves; no content-type filtering is done here.

    Raises:
        requests.exceptions.HTTPError: the server answered with an error
            status, as reported by `raise_for_status()`.
        Exception: any other failure raised while making the request
            (connection error, timeout, ...). It is printed and re-raised.
    """
    try:
        resp = requests.get(url, *args, **kwargs)
        # Does nothing for a successful response; raises HTTPError otherwise.
        resp.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        # Bare raise re-raises the active exception unchanged.
        raise
    except Exception as err:
        print(f'Other error occurred: {err}')
        raise

    return resp

def who_actors(url):
    """
    Scrape the list of Dr. Who actor names from the slideshow page at `url`.

    The page is fetched with a 5 second timeout. Actor names are taken from
    the `title` attribute of each slide image, which looks like:

        Slide 10: Sixth Doctor: Colin Baker

    Returns:
        A list of actor names (str) in the order the slides appear.

    Raises:
        ValueError: the response is not HTML, or a slide title does not have
            the expected "Slide N: <role>: <actor>" shape.
        Exception: whatever `simple_get` raises for a failed request.
    """
    resp = simple_get(url, timeout=5)

    # Sanity check: is this HTML? Done with an explicit raise (rather than
    # assert) so the check still runs under `python -O`, and with .get() so a
    # missing header is reported as a clear error instead of a KeyError.
    content_type = resp.headers.get('Content-Type', '')
    if not re.search('html', content_type, re.IGNORECASE):
        raise ValueError(f'Expected an HTML response, got Content-Type {content_type!r}')

    soup = BeautifulSoup(resp.text, 'html.parser')

    # Selects only the <img> tags whose title starts like "Slide 10: S...".
    slide_title_re = re.compile(r'^Slide\s+\d+:\s+[A-Z]')

    # Starts the same way; however, after the first colon the [^:]+[:]\s+ part
    # says "gobble up all (one or more) characters that are not a colon until
    # you run into a colon that is followed by one or more spaces". Everything
    # after that is captured in the group named <actor>.
    actor_re = re.compile(r'^Slide\s+\d+:[^:]+[:]\s+(?P<actor>.*)$')

    actor_list = []
    for img in soup.find_all('img', title=slide_title_re):
        title = img['title']
        m = actor_re.search(title)
        if m is None:
            # The page layout is not what this scraper was written for.
            raise ValueError(f'Unexpected slide title format: {title!r}')
        actor_list.append(m.group('actor'))

    return actor_list

'''
    # PHASE 2:
    # Collect the stats from Wikipedia
    # for each who actor
'''

def who_stats(dr_who):
    """
    Return the 30-day page view count for a Wikipedia article.

    Args:
        dr_who: the Wikipedia page title of the actor, with spaces already
            replaced by underscores (e.g. 'Jodie_Whittaker').

    Returns:
        The page view count as an int.

    Raises:
        ValueError: the info page has no <div class="mw-pvi-month"> element,
            or its text is not a number once the commas are removed.
        Exception: whatever `simple_get` raises for a failed request.
    """
    url = 'https://en.wikipedia.org/w/index.php'

    # Navigation to the info page is done through query params. A timeout is
    # passed (same value who_actors uses) because requests waits indefinitely
    # for a response when none is given.
    resp = simple_get(url, params={'title': dr_who, 'action': 'info'}, timeout=5)

    # resp.text is the decoded payload; requests picks the encoding for us.
    soup = BeautifulSoup(resp.text, 'html.parser')

    # By inspection of the page, the stat we seek is easy to find:
    #   <div class="mw-pvi-month">58,243</div>
    # Only a find (not find_all) is needed since a single tag carries
    # class="mw-pvi-month".
    div = soup.find('div', class_='mw-pvi-month')
    if div is None:
        # Explicit raise rather than assert so the check survives `python -O`.
        raise ValueError(f'No page view stat found on the info page for {dr_who!r}')

    # The text may contain thousands separators, which int() cannot parse.
    return int(div.text.replace(',', ''))

def main():
    """
    Rank the Dr. Who actors by Wikipedia page views and print the top 5.

    Fetches the actor names from EW_URL, looks up each actor's page view
    stat via who_stats(), then prints the five highest in descending order.
    """
    # Phase 1: the actor names, as listed on the EW page.
    doctors = who_actors(EW_URL)

    # Phase 2: one Wikipedia lookup per actor. EW separates name parts with
    # spaces; the Wikipedia page title uses underscores instead.
    views_by_actor = {
        name: who_stats(name.replace(' ', '_'))
        for name in doctors
    }

    # Phase 3: order the (name, views) pairs from most to least viewed.
    ranking = sorted(views_by_actor.items(), key=lambda pair: pair[1], reverse=True)

    print("Drum roll please...\nThe top 5 Dr. Who actors are:")
    for name, views in ranking[:5]:
        print(f'\t{name} : {views}')

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Drum roll please...
The top 5 Dr. Who actors are:
	David Tennant : 120349
	Matt Smith : 113168
	Jodie Whittaker : 94588
	John Hurt : 87538
	Christopher Eccleston : 86257
