In [1]:
import json, time, urllib.parse, csv # For storing data and manipulating it

import requests # For API requests
import pandas as pd # For data manipulation

In [2]:
# Read the two input tables from the data/ folder:
#   dfPol - the politicians-by-country listing
#   dfPop - the population-by-country listing
politicians_csv = "data/politicians_by_country_SEPT.2022.csv"
population_csv = "data/population_by_country_2022.csv"
dfPol, dfPop = (pd.read_csv(csv_path) for csv_path in (politicians_csv, population_csv))

In [3]:
# Tag every country row of dfPop with the region it belongs to.
#
# Rows whose "Geography" value is written entirely in upper case (e.g. NORTHERN AFRICA) are
# treated as region headers for the rows that follow them. Each remaining row inherits the
# most recent header above it, and the header rows themselves are removed. Rows that come
# before the first header get an empty region string.
is_region_header = dfPop["Geography"].astype(str).str.isupper()

# Guard so this cell can be re-run safely: once the header rows have been dropped there is
# nothing left to propagate, and recomputing would blank out every region.
if is_region_header.any() or "Region" not in dfPop.columns:
    # Keep the header text only on header rows, then carry it forward onto the rows below
    dfPop["Region"] = dfPop["Geography"].where(is_region_header).ffill().fillna("")
    dfPop = dfPop[~is_region_header].reset_index(drop = True)
print(dfPop.head())

  Geography  Population (millions)           Region
0   Algeria                   44.9  NORTHERN AFRICA
1     Egypt                  103.5  NORTHERN AFRICA
2     Libya                    6.8  NORTHERN AFRICA
3   Morocco                   36.7  NORTHERN AFRICA
4     Sudan                   46.9  NORTHERN AFRICA


In [4]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# The ORES API endpoint used for article quality scoring
# NOTE(review): Wikimedia has been moving ORES users to Lift Wing - confirm this endpoint still
# responds before doing a full re-run.
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping the ORES request parameters onto the URL path
API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"

# We'll assume that there needs to be some throttling for these requests - we should always be nice
# to a free data resource. The same wait is used for both the Wikipedia and the ORES requests.
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making
# the request. These headers are defined exactly once: a second, later definition in this cell used
# to overwrite them with the unfilled '<uwnetid@uw.edu>' template placeholder.
# TODO: add a contact email address to the User-Agent string.
REQUEST_HEADERS = {
    'User-Agent': 'ecorpron, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# The English Wikipedia article titles to look up: the "name" column of the politicians table
ARTICLE_TITLES = dfPol["name"]

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"

# This template lists the basic parameters for making a page info request
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the revision
}
#
# The current ML models for English wikipedia are:
#   "articlequality"
#   "articletopic"
#   "damaging"
#   "version"
#   "draftquality"
#   "drafttopic"
#   "goodfaith"
#   "wp10"
#
# The specific documentation on these is scattered so if you want to use one you'll have to look around.
#

In [5]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS,
                                 timeout = 30.0):
    """Request the page info record for a single article title.

    Parameters:
        article_title: the title to look up; a falsy value (None, "") short-circuits to None.
        endpoint_url: the API URL the GET request is sent to.
        request_template: dict of query parameters; it is copied, never modified.
        headers: HTTP headers sent with the request.
        timeout: seconds to wait for the server before giving up on the request.

    Returns:
        The decoded JSON response, or None when no title was given or when the request or
        the JSON decoding raised (the exception is printed, not re-raised).
    """
    # Make sure we have an article title
    if not article_title: return None
    
    # Work on a copy so the shared module-level template (also the default argument) is not
    # left holding the title of whichever article was requested last
    params = dict(request_template)
    params['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # without a timeout a stalled connection would hang the whole notebook run
        response = requests.get(endpoint_url, headers=headers, params=params, timeout=timeout)
        json_response = response.json()
    except Exception as e:
        # best effort: report the problem and let the caller handle the missing response
        print(e)
        json_response = None
    return json_response

def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False,
                                   timeout = 30.0):
    """Request the ORES score record for a single article revision.

    Parameters:
        article_revid: the revision id to score; a falsy value (None, 0, "") short-circuits to None.
        endpoint_url: base URL of the scoring service.
        endpoint_params: format string appended to endpoint_url and filled from the template.
        request_template: dict supplying the fields of endpoint_params; it is copied, never modified.
        headers: HTTP headers sent with the request.
        features: when true, "?features=true" is appended to the request URL.
        timeout: seconds to wait for the server before giving up on the request.

    Returns:
        The decoded JSON response, or None when no revision id was given or when the request
        or the JSON decoding raised (the exception is printed, not re-raised).
    """
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into a copy of the template, so the shared module-level template
    # (also the default argument) is not changed by the call
    url_fields = dict(request_template)
    url_fields['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**url_fields)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # without a timeout a stalled connection would hang the whole notebook run
        response = requests.get(request_url, headers=headers, timeout=timeout)
        json_response = response.json()
    except Exception as e:
        # best effort: report the problem and let the caller handle the missing response
        print(e)
        json_response = None
    return json_response


In [6]:
# For every article title: look up its latest revision id on Wikipedia, then ask ORES for the
# predicted quality class of that revision. Rows are collected in a list and turned into a
# DataFrame once at the end instead of growing the frame one cell at a time.
article_records = []
articlesWithNoScore = []    # titles for which no quality prediction could be obtained

print("Starting Article Access")
for title in ARTICLE_TITLES:
    # The page info request returns None when it fails; treat that like a page without revisions
    info = request_pageinfo_per_article(title)
    pages = ((info or {}).get("query") or {}).get("pages") or {}
    # only one title is requested at a time, so the first page entry is the one we asked for
    page = next(iter(pages.values()), {})
    revId = page.get("lastrevid")

    # request_ores_score_per_article returns None for a missing revision id or a failed request
    rating = None
    score = request_ores_score_per_article(revId)
    if score is not None:
        try:
            revision_scores = next(iter(score["enwiki"]["scores"].values()))
            rating = revision_scores["articlequality"]["score"].get("prediction")
        except (KeyError, TypeError, AttributeError, StopIteration):
            # the response did not have the expected score structure (e.g. an error payload)
            rating = None
    if rating is None:
        articlesWithNoScore.append(title)

    article_records.append({"article_title": title, "revision_id": revId, "article_quality": rating})

# dtype=object keeps revision ids as plain integers even when some rows have no revision
dfArticles = pd.DataFrame(article_records,
                          columns = ["article_title", "revision_id", "article_quality"],
                          dtype = object)
print("Finished Article Access")
print(dfArticles.head())

Starting Article Access
Finished Article Access
           article_title revision_id article_quality
0        Shahjahan Noori  1099689043              GA
1  Abdul Ghafar Lakanwal   943562276           Start
2         Majah Ha Adrif   852404094           Start
3      Haroon al-Afghani  1095102390               B
4            Tayyab Agha  1104998382           Start


In [7]:
# Attach the country of each article. dfArticles was built in the same row order as dfPol, so
# rows are paired by index label; this single aligned assignment replaces a per-row copy loop.
dfArticles["Country"] = dfPol.loc[dfArticles.index, "country"]
print(dfArticles.head())

           article_title revision_id article_quality      Country
0        Shahjahan Noori  1099689043              GA  Afghanistan
1  Abdul Ghafar Lakanwal   943562276           Start  Afghanistan
2         Majah Ha Adrif   852404094           Start  Afghanistan
3      Haroon al-Afghani  1095102390               B  Afghanistan
4            Tayyab Agha  1104998382           Start  Afghanistan


In [8]:
# Add the region and population of each article's country, looked up by country name in dfPop.
# Articles whose country is not present in dfPop keep the defaults ("" and 0.0).
dfArticles["Region"] = ""
dfArticles["Population"] = 0.0

# One lookup row per country name; if a name is listed more than once the last listing wins
pop_lookup = dfPop.drop_duplicates(subset = "Geography", keep = "last").set_index("Geography")

has_population = dfArticles["Country"].isin(pop_lookup.index)
matched_countries = dfArticles.loc[has_population, "Country"]
dfArticles.loc[has_population, "Region"] = matched_countries.map(pop_lookup["Region"])
dfArticles.loc[has_population, "Population"] = matched_countries.map(pop_lookup["Population (millions)"])

# Countries in the population table that have no article at all, in population-table order
countries_with_articles = set(dfArticles["Country"])
no_match = [country for country in dfPop["Geography"] if country not in countries_with_articles]
print(no_match[0:10])
print(dfArticles.head())

['Western Sahara', 'Mauritius', 'Mayotte', 'Reunion', 'Sao Tome and Principe', 'eSwatini', 'Canada', 'United States', 'Curacao', 'Guadeloupe']
           article_title revision_id article_quality      Country      Region  \
0        Shahjahan Noori  1099689043              GA  Afghanistan  SOUTH ASIA   
1  Abdul Ghafar Lakanwal   943562276           Start  Afghanistan  SOUTH ASIA   
2         Majah Ha Adrif   852404094           Start  Afghanistan  SOUTH ASIA   
3      Haroon al-Afghani  1095102390               B  Afghanistan  SOUTH ASIA   
4            Tayyab Agha  1104998382           Start  Afghanistan  SOUTH ASIA   

   Population  
0        41.1  
1        41.1  
2        41.1  
3        41.1  
4        41.1  


In [10]:
# Save the list of population-table countries that have no article, one name per line.
# The encoding is given explicitly so non-ASCII country names are written the same on every platform.
with open(r'wp_countries-no_match.txt', 'w', encoding = 'utf-8') as fp:
    fp.write('\n'.join(no_match))

# DataFrame.dropna returns a new frame and leaves the original untouched, so the result has to
# be kept; otherwise rows with missing values are still written to the CSV.
dfArticles = dfArticles.dropna()
dfArticles.to_csv('wp_politicians_by_country.csv')