In [14]:
import json, time, urllib.parse, csv # For storing data and manipulating it

import requests # For API requests
import pandas as pd # For data manipulation

In [3]:
#########
#
#    CONSTANTS
#

# Base URL shared by every 'pageviews' request made against the Wikimedia REST API
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# Path suffix for a 'per-article' pageviews request. It is a format string: every {placeholder}
# is replaced with a concrete value just before the request is sent
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# Wikimedia asks API users to identify themselves with a "unique ID" so that they can be contacted
# if something goes wrong - for example when the code exceeds the request limits
REQUEST_HEADERS = {'User-Agent': '<ecorpron@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022'}

# The Pageviews API asks for no more than 100 requests per second, so every request is preceded by
# a short pause: one hundredth of a second minus the latency we assume the round trip already costs
API_LATENCY_ASSUMED = 0.002       # roughly 2ms of API and network latency
API_THROTTLE_WAIT = (1.0 / 100.0) - API_LATENCY_ASSUMED

# Read the cleaned dinosaur genera spreadsheet; its `name` column supplies the article titles
# that the request loops below iterate over
df = pd.read_csv(
    filepath_or_buffer="data/dinosaur_genera.cleaned.SEPT.2022 - dinosaur_genera.cleaned.SEPT.2022.csv"
)
ARTICLE_TITLES = df.name
# Values substituted into the placeholders of API_REQUEST_PER_ARTICLE_PARAMS when a request URL is built.
# There is one entry per placeholder. Only the article name varies from request to request, so the
# remaining entries can stay fixed - although they *could* be changed if a different query is needed.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = dict(
    project="en.wikipedia.org",
    access="desktop",
    agent="user",
    article="",                 # filled in before each request
    granularity="monthly",
    start="2015010100",
    end="2022093000",
)

In [4]:
#########
#
#    PROCEDURES/FUNCTIONS
#

# Performs a single pageviews API request for one article and returns the decoded JSON response
def request_pageviews_per_article(article_title = None, 
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request the pageviews of a single article.

    Parameters:
        article_title: title of the article; spaces are converted to underscores and the
            result is URL encoded before it is placed in the request URL.
        endpoint_url: base URL of the pageviews API.
        endpoint_params: format string with one placeholder per key of request_template.
        request_template: dict of values for the placeholders. It is NOT modified; the
            encoded title is written into a private copy.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body of the response, or None when no title was given or when the
        request or the JSON decoding raised an exception (the exception is printed).
        The HTTP status is not checked, so an error response from the API is returned
        as-is; callers should verify that the keys they need are present.
    """
    # Make sure we have an article title
    if not article_title: return None
    
    # Titles are supposed to have spaces replaced with "_" and be URL encoded. safe='' also
    # encodes "/", which would otherwise be read as an additional path segment of the URL
    article_title_encoded = urllib.parse.quote(article_title.replace(' ','_'), safe='')
    
    # Fill in a copy of the template: request_template is usually a shared module-level dict
    # (it is the default argument), so writing into it would leak state between calls
    request_params = dict(request_template)
    request_params['article'] = article_title_encoded
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and let the caller decide how to handle a missing response
        print(e)
        json_response = None
    return json_response

In [4]:
print('Starting API requests for desktop views for desktop dinosaur articles')

# Collect every monthly record before touching the output file, so that a failure part-way
# through the requests does not leave a truncated or empty file behind
fileToBe = []

# Loops through each article.
# Skips the first one since it appears to be a dead link
for article in ARTICLE_TITLES[1:]:
    # retrieves the view information
    views = request_pageviews_per_article(article)
    
    # A failed request yields None and an API error body has no 'items' entry;
    # report the article and carry on instead of aborting the whole run
    if not isinstance(views, dict) or 'items' not in views:
        print('No pageview data returned for article: '+str(article))
        continue
    
    # Each item holds one month of data for the article; keep them all
    fileToBe.extend(views['items'])

# Creates a new file with specified name, even if it already exists, and saves the data as json
with open('dino_monthly_desktop_start201501-end202209.json', 'w') as outfile:
    json.dump(fileToBe, outfile, indent = 4)
print('API requests finished, file created with name '+'dino_monthly_desktop_start201501-end202209.json')

starting
ending


In [5]:
# Parameter sets for the two mobile access types. Both are derived from the shared template so the
# project, agent, granularity and date range always match the other requests; only 'access' differs
# Sets values for mobile-app view data
articlePageviewsParamsMobileApp = dict(ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE, access="mobile-app", article="")
# Sets values for mobile-web view data
articlePageviewsParamsMobileWeb = dict(ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE, access="mobile-web", article="")


def _combine_monthly_views(app_items, web_items):
    """Return one record per month with the app and web view counts added together.

    Records are matched on their 'timestamp' value instead of their list position, so a month
    that is present for only one access type is still counted and cannot shift the months that
    follow it. Input records are copied, not modified. The result is ordered by timestamp.

    Assumes every record carries a 'timestamp' string (YYYYMMDDHH) next to 'views', as described
    in the Pageviews API documentation. Each combined record keeps the remaining fields
    (including 'access') of the first record seen for its month.
    """
    combined = {}
    for record in list(app_items) + list(web_items):
        month = record['timestamp']
        if month in combined:
            combined[month]['views'] += record['views']
        else:
            combined[month] = dict(record)
    return [combined[month] for month in sorted(combined)]


print('Starting API requests for mobile views for all dinosaur articles')

# Collect everything before opening the output file, so that a failure part-way through the
# requests does not leave a truncated or empty file behind
fileToBe = []

# Runs through all articles
for article in ARTICLE_TITLES[1:]:
    # Gets the views for both mobile access ways
    appViews = request_pageviews_per_article(article, request_template = articlePageviewsParamsMobileApp)
    webViews = request_pageviews_per_article(article, request_template = articlePageviewsParamsMobileWeb)
    
    # A failed request yields None and an API error body has no 'items' entry; treat both as "no data"
    appItems = appViews.get('items', []) if isinstance(appViews, dict) else []
    webItems = webViews.get('items', []) if isinstance(webViews, dict) else []
    if not appItems or not webItems:
        print('Incomplete mobile pageview data returned for article: '+str(article))
    
    # Goes through each month and adds the view data together
    fileToBe.extend(_combine_monthly_views(appItems, webItems))

# Saves the combined view data in a json file
with open('dino_monthly_mobile_start201501-end202209.json', 'w') as outfile:
    json.dump(fileToBe, outfile, indent = 4)
print('API requests finished, file created with name '+'dino_monthly_mobile_start201501-end202209.json')

starting
ending


In [6]:
# Sets values for cumulative views. Derived from the shared template so the project, agent,
# granularity and date range always match the other requests; only 'access' differs
articlePageviewsParamsAll = dict(ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE, access="all-access", article="")

print('Starting API requests for all views on dinosaur articles')

# Collect everything before opening the output file, so that a failure part-way through the
# requests does not leave a truncated or empty file behind
fileToBe = []

# Runs through all articles
for article in ARTICLE_TITLES[1:]:
    # Gets the views for current article
    views = request_pageviews_per_article(article, request_template = articlePageviewsParamsAll)
    
    # A failed request yields None and an API error body has no 'items' entry;
    # report the article and carry on instead of aborting the whole run
    if not isinstance(views, dict) or 'items' not in views:
        print('No pageview data returned for article: '+str(article))
        continue
    
    # resets total to zero
    total = 0
    
    # Runs through each month of data
    # NOTE(review): the running total is only meaningful if 'items' is in chronological order - confirm
    for month in views['items']:
        # sums all previous months with current month
        total += month['views']
        # saves into current month viewing
        month['views'] = total
        fileToBe.append(month)

# Saves cumulative view data in JSON file
with open('dino_monthly_cumulative_start201501-end202209.json', 'w') as outfile:
    json.dump(fileToBe, outfile, indent = 4)
print('API requests finished, file created with name '+'dino_monthly_cumulative_start201501-end202209.json')

starting
ending
