### Imports

In [1]:
import os
import time
import json
import requests
import urllib.parse
import pandas as pd
from typing import Any, Dict, List, Optional

### Constants

1. API endpoints and models

In [2]:
# The LiftWing ORES API endpoint and prediction model
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

# The basic English Wikipedia API endpoint
API_HEADER_AGENT = 'User-Agent'
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

2. Throttling and rate limits

In [3]:
# Assumed latency and throttling for LiftWing API
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0 * 60.0) / 5000.0) - API_LATENCY_ASSUMED  # LiftWing key authorizes 5000 requests per hour

# Assumed throttling for Wikipedia API (100 requests/second)
API_THROTTLE_WAIT_ENWIKI = (1.0 / 100.0) - API_LATENCY_ASSUMED

3. Request headers

In [4]:
# LiftWing API request headers with authorization
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "rollk@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}

# Wikipedia API headers
REQUEST_HEADERS = {
    'User-Agent': 'rollk@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2024'
}

# Template for request parameters (to be filled with actual values)
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address': "rollk@uw.edu",         # Your email address should go here
    'access_token': "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI0NTk0YTFjNGJlMmIyMjVlOGY5NjBiMDg3NDhjNGRmNSIsImp0aSI6IjVhMDY2M2E4OWE2MjU4YzViOTlhOGY0Zjc3YjMyMGU2NmZiNzlhOTU1ZTE2YTA5Mzk4MzM0ZDhiYjRjMzdhZWE3MjMzYzc3MjNkYTdkZjVhIiwiaWF0IjoxNzI4Nzg1NDI3LjczMjMwMywibmJmIjoxNzI4Nzg1NDI3LjczMjMwNywiZXhwIjozMzI4NTY5NDIyNy43MzAyNDQsInN1YiI6Ijc2NzAyOTE1IiwiaXNzIjoiaHR0cHM6Ly9tZXRhLndpa2ltZWRpYS5vcmciLCJyYXRlbGltaXQiOnsicmVxdWVzdHNfcGVyX3VuaXQiOjUwMDAsInVuaXQiOiJIT1VSIn0sInNjb3BlcyI6WyJiYXNpYyJdfQ.anuC2cjC67SIvr9H2Mjktp6TqxJbGFyjP1wxWvb7lv7PkP30c8VIyadng0iN8HXkQxpQb4A4j3N2JmiXXBxzNKBtf94WuAp9rCh0dgR-Gmj6CdqoWGrNglKJB_dFK4NboJi826shrrl7Zd2IRfpSzXzFsTCNvb6LiNp9n4-lYo_e3QPspyt6-YIYIU2D9J_Fnli2IdBPohH7Wm9Rj8AlcwgjURN_Msn44p56WI0QCCS5Z8FAEEZYhHMZT4-d_q8semz7dsr6Va_B9IvmJ8a-L7JVsMo1IxtZ1DW7V-XIkB-dEzDFNQk41nH680IhtHFqhOpBYab0217-qDlXVB2ySSpIjFLmAntx0uUk5urdXZY6Kmu521E6Shxb8LVRD9jCHvkXMy75EnAhIexYuw4aO6wB5Qr6Zwp4NHCfpg05dU9mYR5FUHeFEw2hxKX4JtXU6riZGRl60AZDcaYOK0yw2IOQ_X9QK1l32Cb7IiikQb1pWCg2tKvBsvleHYUi4l2GI8ehwsKG2Hn8S2JjsCUQzdJvUEOVqW15RBLxbZVrsipzgue8eHHF7C01iqaC9B8yYg0QtAVpbTrpM8u_UrC0E8tIyLRajSPy7Tzhhz4REYjpFKIS8HD3LYALZfjB-DbMy4w9O1xI6VZvM0mcoXGZ2X7BX3mIp0AIBW3OTpHlcEs"           # The access token you create will need to go here
}

4. Article revisions and page information

In [5]:
# Template for ORES request payload (required data)
ORES_REQUEST_DATA_TEMPLATE = {
    "lang": "en",     # Required that it's English - we're scoring English Wikipedia revisions
    "rev_id": "",     # This request requires a revision id
    "features": True
}

# Additional page properties to request from Wikipedia API (optional)
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"

# Template for making requests to the Wikipedia API for page information
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # Single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

5. ORES User authentication variables

In [4]:
# Variables to store user credentials for API requests
USERNAME = "DowntonCrabby"
ACCESS_TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI0NTk0YTFjNGJlMmIyMjVlOGY5NjBiMDg3NDhjNGRmNSIsImp0aSI6IjVhMDY2M2E4OWE2MjU4YzViOTlhOGY0Zjc3YjMyMGU2NmZiNzlhOTU1ZTE2YTA5Mzk4MzM0ZDhiYjRjMzdhZWE3MjMzYzc3MjNkYTdkZjVhIiwiaWF0IjoxNzI4Nzg1NDI3LjczMjMwMywibmJmIjoxNzI4Nzg1NDI3LjczMjMwNywiZXhwIjozMzI4NTY5NDIyNy43MzAyNDQsInN1YiI6Ijc2NzAyOTE1IiwiaXNzIjoiaHR0cHM6Ly9tZXRhLndpa2ltZWRpYS5vcmciLCJyYXRlbGltaXQiOnsicmVxdWVzdHNfcGVyX3VuaXQiOjUwMDAsInVuaXQiOiJIT1VSIn0sInNjb3BlcyI6WyJiYXNpYyJdfQ.anuC2cjC67SIvr9H2Mjktp6TqxJbGFyjP1wxWvb7lv7PkP30c8VIyadng0iN8HXkQxpQb4A4j3N2JmiXXBxzNKBtf94WuAp9rCh0dgR-Gmj6CdqoWGrNglKJB_dFK4NboJi826shrrl7Zd2IRfpSzXzFsTCNvb6LiNp9n4-lYo_e3QPspyt6-YIYIU2D9J_Fnli2IdBPohH7Wm9Rj8AlcwgjURN_Msn44p56WI0QCCS5Z8FAEEZYhHMZT4-d_q8semz7dsr6Va_B9IvmJ8a-L7JVsMo1IxtZ1DW7V-XIkB-dEzDFNQk41nH680IhtHFqhOpBYab0217-qDlXVB2ySSpIjFLmAntx0uUk5urdXZY6Kmu521E6Shxb8LVRD9jCHvkXMy75EnAhIexYuw4aO6wB5Qr6Zwp4NHCfpg05dU9mYR5FUHeFEw2hxKX4JtXU6riZGRl60AZDcaYOK0yw2IOQ_X9QK1l32Cb7IiikQb1pWCg2tKvBsvleHYUi4l2GI8ehwsKG2Hn8S2JjsCUQzdJvUEOVqW15RBLxbZVrsipzgue8eHHF7C01iqaC9B8yYg0QtAVpbTrpM8u_UrC0E8tIyLRajSPy7Tzhhz4REYjpFKIS8HD3LYALZfjB-DbMy4w9O1xI6VZvM0mcoXGZ2X7BX3mIp0AIBW3OTpHlcEs"

## Functions

##### Utility Functions

In [7]:
def prep_article_title(article_title: str) -> str:
    """
    Prepares the article title for use in a URL by replacing spaces with underscores
    and URL-encoding the title to ensure it is safe for use in API requests.

    Parameters
    ----------
    article_title : str
        The title of the Wikipedia article to be formatted.

    Returns
    -------
    str
        The formatted article title with spaces replaced by underscores and URL-encoded.
    """
    # Replace spaces in the article title with underscores,
    # as Wikipedia URLs use underscores instead of spaces
    prepped_title = article_title.replace(' ', '_')

    # URL-encode the article title to ensure it is safe for use in a URL
    # (e.g., handling special characters)
    encoded_title = urllib.parse.quote(prepped_title)

    return encoded_title

#### Requesting information from Wiki

##### For a single article

In [23]:
# Function to get the current revision ID of an article
def get_revision_id(page_title: str) -> Optional[int]:
    """
    Retrieves the latest revision ID for a given Wikipedia article.

    Parameters:
    -----------
    page_title : str
        The title of the Wikipedia page for which to retrieve the latest revision ID.

    Returns:
    --------
    Optional[int]
        The latest revision ID of the page, or None if not found.

    Raises:
    -------
    Exception:
        If the request to the Wikipedia API fails.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": page_title,
        "prop": "info"
    }
    
    try:
        response = requests.get(API_ENWIKIPEDIA_ENDPOINT, headers=REQUEST_HEADERS, params=params)
        response_data = response.json()
        
        # Extract page info
        page_data = list(response_data['query']['pages'].values())[0]
        return page_data.get('lastrevid', None)
    except Exception as e:
        print(f"Error occurred while retrieving revision ID for {page_title}: {e}")
        return None


# Function to get the ORES quality prediction
def get_ores_quality_prediction(revision_id: int) -> Optional[str]:
    """
    Retrieves the ORES quality prediction for a given revision ID.

    Parameters:
    -----------
    revision_id : int
        The revision ID of the Wikipedia article for which to retrieve the ORES quality prediction.

    Returns:
    --------
    Optional[str]
        The predicted quality class (e.g., 'Stub', 'Start', 'C', 'B', 'GA', 'FA'), or None if not found.

    Raises:
    -------
    Exception:
        If the request to the ORES API fails or the prediction cannot be retrieved.
    """
    if revision_id:
        ores_url = ORES_API_ENDPOINT.format("enwiki", revision_id)
        
        try:
            response = requests.get(ores_url, headers=REQUEST_HEADERS)
            response_data = response.json()
            
            # Return the predicted quality class
            return response_data['enwiki']['scores'][str(revision_id)]['wp10']['score']['prediction']
        except KeyError:
            print(f"KeyError: Quality prediction not available for revision ID {revision_id}")
            return None
        except Exception as e:
            print(f"Error occurred while retrieving ORES prediction for revision ID {revision_id}: {e}")
            return None
    return None


# Function to apply the above functions to get article quality prediction
def get_article_quality(row: Dict[str, Any]) -> Optional[str]:
    """
    Retrieves the ORES quality prediction for a given Wikipedia article (row).

    Parameters:
    -----------
    row : Dict[str, Any]
        A row from the dataset, expected to contain a 'name' field with the article title.

    Returns:
    --------
    Optional[str]
        The predicted quality class for the article, or None if not found.
    
    Workflow:
    ---------
    1. Get the latest revision ID using the article's title.
    2. Use the revision ID to get the ORES quality prediction.
    """
    
    # Step 1: Get revision ID
    revision_id = get_revision_id(row['name'])
    
    # Throttle requests to avoid exceeding the API rate limits
    time.sleep(0.05)
    
    # Step 2: Get ORES quality prediction
    return get_ores_quality_prediction(revision_id)

def request_pageinfo_per_article(
    article_title: Optional[str] = None,
    endpoint_url: str = API_ENWIKIPEDIA_ENDPOINT,
    request_template: Dict[str, Any] = PAGEINFO_PARAMS_TEMPLATE,
    headers: Dict[str, str] = REQUEST_HEADERS
) -> Optional[Dict[str, Any]]:
    """
    Makes a request to the Wikipedia API to retrieve page information for a given article.

    Parameters:
    -----------
    article_title : Optional[str]
        The title of the Wikipedia article. This can be passed as a function parameter 
        or pre-populated in the request_template. If not provided, the request will fail.
    
    endpoint_url : str
        The Wikipedia API endpoint to send the request to (default is API_ENWIKIPEDIA_ENDPOINT).
    
    request_template : Dict[str, Any]
        The template for the parameters to be sent in the request.
    
    headers : Dict[str, str]
        The headers for the request, which must include the 'User-Agent' field containing a valid email address.
    
    Returns:
    --------
    Optional[Dict[str, Any]]
        The JSON response from the Wikipedia API as a dictionary, or None in case of an error.

    Raises:
    -------
    Exception:
        If the article title is missing, if 'User-Agent' is missing from headers, or if a placeholder email is used.
    """

    # Ensure the article title is included in the request
    if article_title:
        request_template['titles'] = encoded_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    if 'uwnetid@uw' in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # Make the request to the Wikipedia API
    try:
        # Throttle to avoid exceeding the API request limits
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(f"Error occurred: {e}")
        json_response = None

    return json_response

def request_ores_score_per_article(
    article_revid: Optional[int] = None,
    email_address: Optional[str] = None,
    access_token: Optional[str] = None,
    endpoint_url: str = API_ORES_LIFTWING_ENDPOINT,
    model_name: str = API_ORES_EN_QUALITY_MODEL,
    request_data: Dict[str, Any] = ORES_REQUEST_DATA_TEMPLATE,
    header_format: Dict[str, str] = REQUEST_HEADER_TEMPLATE,
    header_params: Dict[str, str] = REQUEST_HEADER_PARAMS_TEMPLATE
) -> Optional[Dict[str, Any]]:
    """
    Makes a request to the ORES API to retrieve an article quality score for a given Wikipedia article revision.

    Parameters:
    -----------
    article_revid : Optional[int]
        The revision ID of the Wikipedia article to score. Required for the ORES request.
    
    email_address : Optional[str]
        The user's email address to include in the API request headers for identification. Required.
    
    access_token : Optional[str]
        The access token for authentication with the ORES API. Required.
    
    endpoint_url : str
        The LiftWing ORES API endpoint (default is API_ORES_LIFTWING_ENDPOINT).
    
    model_name : str
        The name of the ORES model to use (default is API_ORES_EN_QUALITY_MODEL).
    
    request_data : Dict[str, Any]
        The data payload to be sent in the request, including revision ID and features flag.
    
    header_format : Dict[str, str]
        The template for constructing request headers, with placeholders for email and access token.
    
    header_params : Dict[str, str]
        The dictionary containing values (email_address and access_token) to fill into the header template.

    Returns:
    --------
    Optional[Dict[str, Any]]
        The JSON response from the ORES API as a dictionary, or None in case of an error.

    Raises:
    -------
    Exception:
        If the revision ID, email address, or access token is missing.
    """

    # Ensure all required fields are provided
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision ID (rev_id) to score articles.")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value.")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value.")
    
    # Format the request URL and headers
    request_url = endpoint_url.format(model_name=model_name)
    
    headers = {key: header_format[key].format(**header_params) for key in header_format}
    
    # Make the request to the ORES API
    try:
        # Throttle to avoid exceeding the API request limits
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(f"Error occurred: {e}")
        json_response = None

    return json_response

##### For multiple article/batch requests
significantly speeds things up

In [22]:
# Function to get the latest revision IDs for a batch of Wikipedia article titles
def fetch_revision_ids_for_batch(article_titles):
    """
    Fetches the latest revision IDs for a batch of Wikipedia articles using the Wikipedia API.

    Args:
        article_titles (list): A list of article titles to query.

    Returns
        dict: A dictionary mapping article titles to their respective revision IDs. 
        If an article is missing or a revision ID is not available, None is returned for that title.
    """
    try:
        # Join the titles with "|" to form a single request
        titles_query_str = "|".join(article_titles)

        # Define the parameters for the Wikipedia API query
        query_params = {
            "action": "query",
            "format": "json",
            "titles": titles_query_str,
            "prop": "info"
        }

        # Send the request to the Wikipedia API
        response = requests.get(API_ENWIKIPEDIA_ENDPOINT, headers=REQUEST_HEADERS, params=query_params)
        response_data = response.json()

        # Initialize dictionary to hold revision IDs
        article_revision_ids = {}

        # Process the response to extract revision IDs
        for page_id, page_data in response_data['query']['pages'].items():
            title = page_data.get('title', 'Unknown')
            if 'missing' in page_data:
                print(f"Article missing for title: {title}")
                article_revision_ids[title] = None  # Mark missing pages
            else:
                # Store the revision ID if available
                revision_id = page_data.get('lastrevid', None)
                if revision_id is None:
                    print(f"Revision ID missing for title: {title}")
                article_revision_ids[title] = revision_id

        return article_revision_ids

    except Exception as e:
        print(f"Error occurred while retrieving batch revision IDs: {e}")
        return {title: None for title in article_titles}  # Return None for all titles in case of failure


# Function to process all article titles in batches and retrieve their revision IDs
def fetch_revision_ids_for_all_articles(article_titles, batch_size=50):
    """
    Retrieves the revision IDs for a list of Wikipedia articles in batches to optimize API requests.

    Args:
        article_titles (list): A list of article titles for which to fetch revision IDs.
        batch_size (int, optional): The number of articles to query in each batch (default is 50).

    Returns:
        dict: A dictionary mapping article titles to their respective revision IDs.
    """
    
    all_revision_ids = {}

    # Loop over the article titles in batches
    for batch_start in range(0, len(article_titles), batch_size):
        batch_titles = article_titles[batch_start:batch_start + batch_size]
        batch_revision_ids = fetch_revision_ids_for_batch(batch_titles)
        all_revision_ids.update(batch_revision_ids)

        # Delay to avoid overwhelming the API with requests
        time.sleep(0.1)

    return all_revision_ids

## Data Loading and Cleaning

### Load the provided politicans and population datasets

In [10]:
pol_df = pd.read_csv('politicians_by_country_AUG2024.csv')
pop_df = pd.read_csv('population_by_country_AUG2024.csv')

Display the first few rows of each dataframe to inspect structure

In [11]:
pol_df.head() 

Unnamed: 0,name,url,country
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan


In [12]:
pop_df.head()

Unnamed: 0,Geography,Population
0,WORLD,8009.0
1,AFRICA,1453.0
2,NORTHERN AFRICA,256.0
3,Algeria,46.8
4,Egypt,105.2


### Clean & Standardize the data
We need to clean and standardize the country names in both datasets to ensure the entries are consistent.

We will also exclude cumulative regional rows (like "AFRICA", "NORTHERN AFRICA") from the country-specific analysis but retain them in a separate dataframe for regional-level analysis.

In [13]:
# Clean the population dataset
# Separate out regions (all caps) and countries
regions_df = pop_df[pop_df['Geography'].str.isupper()]
countries_df = pop_df[~pop_df['Geography'].str.isupper()]

# Rename columns to have a consistent naming format
countries_df = countries_df.rename(columns={'Geography': 'country', 'Population': 'population_millions'})
regions_df = regions_df.rename(columns={'Geography': 'region', 'Population': 'population_millions'})

# Standardize the column names for the politicians dataset as well
pol_df = pol_df.rename(columns={'country': 'country'})

Let’s view the intermediate cleaned dataframes

In [14]:
# lets look at the contries to see if there are any glaring issues: 
pol_df["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Guinea-Bissau',
       'Bolivia', 'Bosnia Herzegovina', 'Botswana', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Myanmar', 'Burundi', 'Cambodia', 'Cameroon',
       'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Congo DR', 'Costa Rica',
       'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Djibouti',
       'Dominican Republic', 'Timor Leste', 'Ecuador', 'Egypt',
       'United Arab Emirates', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany',
       'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guyana',
       'Haiti', 'Honduras', 'Hungary', 'India', 'Indonesia', 'Iran',
       'Iraq', 'Israel', 'Italy', "Cote d'Ivoire

In [15]:
countries_df["country"].unique()

array(['Algeria', 'Egypt', 'Libya', 'Morocco', 'Sudan', 'Tunisia',
       'Western Sahara', 'Benin', 'Burkina Faso', 'Cape Verde',
       "Cote d'Ivoire", 'Gambia', 'Ghana', 'Guinea', 'GuineaBissau',
       'Liberia', 'Mali', 'Mauritania', 'Niger', 'Nigeria', 'Senegal',
       'Sierra Leone', 'Togo', 'Burundi', 'Comoros', 'Djibouti',
       'Eritrea', 'Ethiopia', 'Kenya', 'Madagascar', 'Malawi',
       'Mauritius', 'Mayotte', 'Mozambique', 'Reunion', 'Rwanda',
       'Seychelles', 'Somalia', 'South Sudan', 'Tanzania', 'Uganda',
       'Zambia', 'Zimbabwe', 'Angola', 'Cameroon',
       'Central African Republic', 'Chad', 'Congo', 'Congo DR',
       'Equatorial Guinea', 'Gabon', 'Sao Tome and Principe', 'Botswana',
       'eSwatini', 'Lesotho', 'Namibia', 'South Africa', 'Canada',
       'United States', 'Belize', 'Costa Rica', 'El Salvador',
       'Guatemala', 'Honduras', 'Mexico', 'Nicaragua', 'Panama',
       'Antigua and Barbuda', 'Bahamas', 'Barbados', 'Cuba', 'Curacao',
       'Do

The one thing I'm noticing is that in the politicians df there is a country 'Korean'- which I'm not sure if that means Korea (North) because there is an n at the end of it. That might be something to keep an eye on. 

In [16]:
regions_df["region"].unique()

array(['WORLD', 'AFRICA', 'NORTHERN AFRICA', 'WESTERN AFRICA',
       'EASTERN AFRICA', 'MIDDLE AFRICA', 'SOUTHERN AFRICA',
       'NORTHERN AMERICA', 'LATIN AMERICA AND THE CARIBBEAN',
       'CENTRAL AMERICA', 'CARIBBEAN', 'SOUTH AMERICA', 'ASIA',
       'WESTERN ASIA', 'CENTRAL ASIA', 'SOUTH ASIA', 'SOUTHEAST ASIA',
       'EAST ASIA', 'EUROPE', 'NORTHERN EUROPE', 'WESTERN EUROPE',
       'EASTERN EUROPE', 'SOUTHERN EUROPE', 'OCEANIA'], dtype=object)

### Matching and merging the politicians & population datasets
We will now merge the politicans and population datasets, and log any any countries from the politicans dataset that don't have a corresponding population entry.

In [17]:
# Merge the politicians dataset with the country population dataset
# Performing a left join to retain all politicians even if there is no population match
merged_df = pd.merge(pol_df, countries_df, how='left', on='country')

# Identify countries from the politicians dataset that don't have a match in the population data
no_match_countries = merged_df[merged_df['population_millions'].isna()]['country'].unique()

# Save the no-match countries to a text file
no_match_file_path = r'C:\Users\clark.roll\python coding\code_personal\Data512\Homework\Assignment 2\wp_countries-no_match.txt'
with open(no_match_file_path, 'w') as file:
    for country in no_match_countries:
        file.write(f"{country}\n")

# Save the cleaned and merged dataset as a CSV
merged_df.to_csv('wp_politicians_by_country.csv', index=False)


Lets look at the merged dataframe

In [18]:
merged_df

Unnamed: 0,name,url,country,population_millions
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,42.4
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,42.4
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,42.4
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,42.4
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,42.4
...,...,...,...,...
7150,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,16.7
7151,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,16.7
7152,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,16.7
7153,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,16.7


It looks like the 'korean' entry that I was hesitant about in the politicians dataframe was logged as no matches, so that takes care of that.

If that were a particularly important entry, then we could lookup the correct information and manually adjust it, but I think this will be fine for this project

## Getting Article Quality Predictions via ORES API

Lets start by prepping to make requests. 

We know that we need our article titles to be URL encoded so lets take care of that now by adding a column to the dataframe called `article_title` that is the URL encoded name

In [19]:
merged_df['article_title'] = merged_df['name'].apply(prep_article_title)

In [20]:
merged_df

Unnamed: 0,name,url,country,population_millions,article_title
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,42.4,Majah_Ha_Adrif
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,42.4,Haroon_al-Afghani
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,42.4,Tayyab_Agha
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,42.4,Khadija_Zahra_Ahmadi
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,42.4,Aziza_Ahmadyar
...,...,...,...,...,...
7150,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,16.7,Josiah_Tongogara
7151,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,16.7,Langton_Towungana
7152,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,16.7,Sengezo_Tshabangu
7153,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,16.7,Herbert_Ushewokunze


#### Making requests 1 article at a time

You can run the code below to get the quality predictions **BUT** it will take **~1-2 hours to run**.

Instead I would suggest running as a batch, which is explained after this

In [None]:
# Apply the ORES prediction process
merged_df['revision_id'] = merged_df['name'].apply(get_revision_id)
merged_df['article_quality'] = merged_df['revision_id'].apply(get_ores_quality_prediction)

# Log any articles for which the ORES score could not be retrieved
ores_errors = merged_df[merged_df['article_quality'].isna()]
ores_error_rate = len(ores_errors) / len(merged_df)

print(f"ORES request error rate: {ores_error_rate * 100:.2f}%")

#### Batch requests

**OR** you can use batch processing to reduce the runtime by an order of magnitude

In [25]:
# Apply the batch request function to get revision IDs for all articles
all_article_titles = merged_df['article_title'].tolist()
all_revision_ids = fetch_revision_ids_for_all_articles(all_article_titles)

Revision ID missing for title: Ezatullah_%28Nangarhar%29
Revision ID missing for title: Mohammad_Gul_%28Helmand_Council%29
Revision ID missing for title: Mohammad_Khan_%28athlete%29
Revision ID missing for title: Ali_Mohammad_%28politician%29
Revision ID missing for title: Abdul_Zahir_%28Konar_Education_Minister%29
Revision ID missing for title: Ziauddin_%28Afghan_militia_leader%29
Revision ID missing for title: Refo_%C3%87apari
Revision ID missing for title: Themistokli_G%C3%ABrmenji
Revision ID missing for title: A%C4%87if_Had%C5%BEiahmetovi%C4%87
Revision ID missing for title: Kurt_Ag%C3%AB_Kadiu
Revision ID missing for title: Alfred_Karamu%C3%A7o
Revision ID missing for title: Ali_K%C3%ABlcyra
Revision ID missing for title: Elez_Ko%C3%A7i
Revision ID missing for title: Xhelal_Kopr%C3%ABncka
Revision ID missing for title: Xhelal_Sve%C3%A7la
Revision ID missing for title: Cha%C3%A2bane_A%C3%AFt_Abderrahim
Revision ID missing for title: A%C3%AFssa_Bekkai
Revision ID missing for title:

In [26]:
all_revision_ids

{'Ezatullah_%28Nangarhar%29': None,
 'Mohammad_Gul_%28Helmand_Council%29': None,
 'Mohammad_Khan_%28athlete%29': None,
 'Abdul Baqi Turkistani': 1231655023,
 'Abdul Ghani Ghani': 1227026187,
 'Abdul Rahim Ayoubi': 1226326055,
 'Ahmad Wali Massoud': 1221720658,
 'Aimal Faizi': 1185105938,
 'Amir Muhammad Akhundzada': 1247931713,
 'Aziza Ahmadyar': 1195651393,
 'Azizullah Lodin': 1247762293,
 'Baran Khan Kudezai': 1176481824,
 'Bashir Ahmad Bezan': 1248505877,
 'Cheragh Ali Cheragh': 1193992206,
 'Fazel Ahmed Manawi': 1234514379,
 'Gajinder Singh Safri': 1212323536,
 'Ghulam Ghaus': 1158659195,
 'Ghulam Muhammad Ghobar': 1240993642,
 'Hafizullah Shabaz Khail': 1238402857,
 'Haroon al-Afghani': 1230459615,
 'Hashmat Ghani Ahmadzai': 1207743719,
 'Ismael Balkhi': 1244521219,
 'Jan Baz': 1227635806,
 'Khadija Zahra Ahmadi': 1234741562,
 'Mahboba Hoqomal': 1243745950,
 'Majah Ha Adrif': 1233202991,
 'Masoud Khalili': 1246566971,
 'Mirza Muhammad Ismail': 1235165845,
 'Moeen Marastial': 12465

In [None]:
# Add the revision IDs to the DataFrame
merged_df['revision_id'] = merged_df['name'].map(all_revision_ids)

In [None]:
merged_df[merged_df["revision_id"].notnull()]