In [18]:
#importing necessary libraries
import pandas as pd
import numpy as np
import json, time, urllib.parse
import requests

In [5]:
#reading csv file for politicians
pol = pd.read_csv('politicians_by_country_AUG.2024.csv')

In [6]:
#viweing the dataframe
pol.head()

Unnamed: 0,name,url,country
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan


Getting a preliminary view of the politicans in the dataset

In [19]:
politicians = [name for name in pol['name']]

In [20]:
politicians

['Majah Ha Adrif',
 'Haroon al-Afghani',
 'Tayyab Agha',
 'Khadija Zahra Ahmadi',
 'Aziza Ahmadyar',
 'Muqadasa Ahmadzai',
 'Mohammad Sarwar Ahmedzai',
 'Amir Muhammad Akhundzada',
 'Nasrullah Baryalai Arsalai',
 'Abdul Rahim Ayoubi',
 'Ismael Balkhi',
 'Abdul Baqi Turkistani',
 'Mohammad Ghous Bashiri',
 'Jan Baz',
 'Bashir Ahmad Bezan',
 'Rafiullah Bidar',
 'Mohammad Siddiq Chakari',
 'Cheragh Ali Cheragh',
 'Nasir Ahmad Durrani',
 'Muhammad Hashim Esmatullahi',
 'Ezatullah (Nangarhar)',
 'Aimal Faizi',
 'Gajinder Singh Safri',
 'Sharif Ghalib',
 'Hashmat Ghani Ahmadzai',
 'Abdul Ghani Ghani',
 'Ghulam Ghaus',
 'Ghulam Muhammad Ghobar',
 'Mohammad Gul (Helmand Council)',
 'Sayed Yousuf Halim',
 'Rangina Hamidi',
 'Sayed Zafar Hashemi',
 'Qutbuddin Hilal',
 'Mahboba Hoqomal',
 'Musa Hotak',
 'Mirza Muhammad Ismail',
 'Sayed Jalal',
 'Said Tayeb Jawad',
 'Sayed Jalal Karim',
 'Hafizullah Shabaz Khail',
 'Masoud Khalili',
 'Mohammad Khan (athlete)',
 'Samoud Khan',
 'Baran Khan Kudezai'

# Step 1
API call from the document provided in Homework link - Data Extraction and Transformation
For the below chunk of code i am using the API function provided for the homework and extracting the views data for desktop, mobile-app and mobile web. Once extraction is done I will be saving the output as JSON files.

In [21]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_HEADER_AGENT = 'User-Agent'

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<anurag96@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024'
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = politicians

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [22]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    
    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    if 'uwnetid@uw' in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [57]:
pol_info = {} #blank dictionary to store article and latest revision id

In [58]:
missing_rev_ids = []  # List to capture articles missing 'lastrevid'

In [59]:
from tqdm import tqdm
import time

for i in tqdm(politicians, desc="Processing articles"):
    time.sleep(0.01)
    info = request_pageinfo_per_article(i)
    for page_id in info['query']['pages']:
        page_data = info['query']['pages'][page_id]
        
        # Check if 'pagelanguage' is not English
        if page_data.get('pagelanguage') != 'en':
            print(f"Article {page_data['title']} is not in English")
        
        # Check if 'lastrevid' exists, otherwise add to missing list
        if 'lastrevid' in page_data:
            pol_info[i] = page_data['lastrevid']
        else:
            missing_rev_ids.append(page_data['title'])

Processing articles: 100%|██████████████████| 7155/7155 [34:04<00:00,  3.50it/s]


In [61]:
missing_rev_ids

['Barbara Eibinger-Miedl',
 'Mehrali Gasimov',
 'Kyaw Myint',
 'André Ngongang Ouandji',
 'Tomás Pimentel',
 'Richard Sumah',
 "Segun ''Aeroland'' Adewale",
 'Bashir Bililiqo']

Thus 8 articles are missing Rev IDs. 


**I also noticed that few of the articles do not correspond to politicans i.e they had other professions like 'David Henríquez (footballer, born 1977)'. I could not manually go through each of the articles to exclude them, but the articles which had a paranthesis citing their profession, I excluded the ones except the items in list "exclude_keywords".**

In [80]:
#professions to exclude from removal
exclude_keywords = [
    'politician', 'civil servant', 'minister', 'mayor', 'President', 
    'general', 'councilor', 'ambassador', 'government', 'diplomat', 'activist', 'political', 'Bhutan'
]

In [81]:
#using regex to also exclude professions having a year interval in their paranthesis
import re
year_pattern = re.compile(r'\(\d{4}–\d{4}\)')

In [82]:
#getting those articles which are potentially not politicians
special_cases_articles = [
    article for article in pol_info.keys()
    if '(' in article and ')' in article 
    and not any(keyword.lower() in article.lower() for keyword in exclude_keywords)
    and not year_pattern.search(article)
]

In [84]:
#removing those articles found in special cases from analysis
for article in special_cases_articles:
    pol_info.pop(article, None)

# Step 2
Requesting ORES scores through LiftWing ML Service API - Getting Article Quality Predictions

In [86]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED  # The key authorizes 5000 requests per hour

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "anurag96@uw.edu",         # your email address should go here
    'access_token'  : "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiJlMjM4NTk0MTMwOTU0NTFmZjFlMWE4YTU4NjNmYzRkNiIsImp0aSI6ImI5OGRhMzY3Y2Y2MjM1NTdjMjg3ZjFlNDI2NzZmYTkxZGQyNWNiYmQ0Y2M3NGJjNDBiY2Y4YzIxN2M5Y2I2ZDdiYzlmZWQ0MWJhYjU3MGI2IiwiaWF0IjoxNzI4ODM5ODQ1LjgwMjc2LCJuYmYiOjE3Mjg4Mzk4NDUuODAyNzYzLCJleHAiOjMzMjg1NzQ4NjQ1LjgwMDc5LCJzdWIiOiI3NjcwNzU4NyIsImlzcyI6Imh0dHBzOi8vbWV0YS53aWtpbWVkaWEub3JnIiwicmF0ZWxpbWl0Ijp7InJlcXVlc3RzX3Blcl91bml0Ijo1MDAwLCJ1bml0IjoiSE9VUiJ9LCJzY29wZXMiOlsiYmFzaWMiXX0.lzxrcKKUDdRPv7pF_EkJYONfHYXqYCuzdWO7_C3yNUuki7uNJDU2rfhVfTXsP7Doa0hLYaEW5n_mecVRzYYd3DBmYulvncBodVpMx86ynw6gmJDRkgT-1BlgFoOPgh0Nb0CHEJRdXZwvicuoLt6P036SRNUD9SLGrOzQx6u8TgsNIh9qbDK3q7fRMzZUQ7xO601UJ3ZHjFckhNL23zuMlyXU0EZDi8v5ecgFjRL2W6X_Eu5hqncAkxmv9z8f7euXzhSDw5rWY_eADZ8Vw0xEo8TgmYbT11QqEfLGo8wuRn1V5946wWoy-KcscAediermHLUcIHML-Wm8A6m6G87xDVXEm6DD3cJTeDGgpd60aByPxDlP5KSg86-l2gk4TiHYU-cPueHsLSGjnOo7OHQHiUECQWU0GU3Yo8lP_jXd0dezdiMbL4jMPHmOPx2rP1wQdu0lTeVhY4cpizfBK11lx78IFvV5vAN3epFdLdrTNdRjb-YILTjnzsRg7IHpzsYmRU66lMKs3Ue-h1rIeRSPg1dOoGgqIQsN2lxTWgapbt1MDP9wWQtPoTweJawvelXUBkVb8BGopUuW29K3Xfn2LVrMNEscTX2HpJ5V6XD5R_Fkrh06Pp4-2cJQRCVBUPi2WqOcM4RglO_4I5Qx__ogQviUvz6ZS-v3FaE3egR8iMY"          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = pol_info

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = ""
ACCESS_TOKEN = ""
#

In [96]:
#   Once you've done the right set up with your Wikimedia account, it should provide you with three different keys, a Client ID,
#   a Client secret, and a Access token.
#
#   In this case I don't want to distribute my keys with the source of the notebook, so I wrote a key manager object that helps
#   track all of my API keys - a username and domain name retrieves the key. The key manager hides the keys on disk separate
#   from the code. A common code idiom to hide API keys will use code to extract the key from an OS environment variable. 
#
#   In the Homework 2 folder you should be able to find a zip file containing the apikeys user module. Install this module
#   into the folder where you keep all of your user modules. This is also the folder that your PYTHONPATH variable points to.
#
#from apikeys.KeyManager import KeyManager
#keyman = KeyManager()

#
#   This is my Wikipedia/Wikimedia username. They suggest you request your keys using your Wikipedia username, so I
#   also stored the API key using my Wikipedia username.
#
#   You should probably use your own username here.
#USERNAME = "Anuragag96"
#key_info = keyman.findRecord(USERNAME,API_ORES_LIFTWING_ENDPOINT)
#ACCESS_TOKEN = key_info[0]['key']
#print(key_info[0]['description'])
#print(ACCESS_TOKEN)
#
#   Note: if you don't want to use the key manager to help manage your API keys, you can specify the values as constants
#   below. Just don't distribute the notebook without removing the constants or you'll be distributing your key too.
#
USERNAME = "Anuragag96"
ACCESS_TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiJlMjM4NTk0MTMwOTU0NTFmZjFlMWE4YTU4NjNmYzRkNiIsImp0aSI6ImI5OGRhMzY3Y2Y2MjM1NTdjMjg3ZjFlNDI2NzZmYTkxZGQyNWNiYmQ0Y2M3NGJjNDBiY2Y4YzIxN2M5Y2I2ZDdiYzlmZWQ0MWJhYjU3MGI2IiwiaWF0IjoxNzI4ODM5ODQ1LjgwMjc2LCJuYmYiOjE3Mjg4Mzk4NDUuODAyNzYzLCJleHAiOjMzMjg1NzQ4NjQ1LjgwMDc5LCJzdWIiOiI3NjcwNzU4NyIsImlzcyI6Imh0dHBzOi8vbWV0YS53aWtpbWVkaWEub3JnIiwicmF0ZWxpbWl0Ijp7InJlcXVlc3RzX3Blcl91bml0Ijo1MDAwLCJ1bml0IjoiSE9VUiJ9LCJzY29wZXMiOlsiYmFzaWMiXX0.lzxrcKKUDdRPv7pF_EkJYONfHYXqYCuzdWO7_C3yNUuki7uNJDU2rfhVfTXsP7Doa0hLYaEW5n_mecVRzYYd3DBmYulvncBodVpMx86ynw6gmJDRkgT-1BlgFoOPgh0Nb0CHEJRdXZwvicuoLt6P036SRNUD9SLGrOzQx6u8TgsNIh9qbDK3q7fRMzZUQ7xO601UJ3ZHjFckhNL23zuMlyXU0EZDi8v5ecgFjRL2W6X_Eu5hqncAkxmv9z8f7euXzhSDw5rWY_eADZ8Vw0xEo8TgmYbT11QqEfLGo8wuRn1V5946wWoy-KcscAediermHLUcIHML-Wm8A6m6G87xDVXEm6DD3cJTeDGgpd60aByPxDlP5KSg86-l2gk4TiHYU-cPueHsLSGjnOo7OHQHiUECQWU0GU3Yo8lP_jXd0dezdiMbL4jMPHmOPx2rP1wQdu0lTeVhY4cpizfBK11lx78IFvV5vAN3epFdLdrTNdRjb-YILTjnzsRg7IHpzsYmRU66lMKs3Ue-h1rIeRSPg1dOoGgqIQsN2lxTWgapbt1MDP9wWQtPoTweJawvelXUBkVb8BGopUuW29K3Xfn2LVrMNEscTX2HpJ5V6XD5R_Fkrh06Pp4-2cJQRCVBUPi2WqOcM4RglO_4I5Qx__ogQviUvz6ZS-v3FaE3egR8iMY"
#

In [97]:

#
#   Decode the Wikimedia JWT Access token
#
#   NOTE: This is not required to use LiftWing to request ORES scores. This is just being done to satisfy my curiosity.
#   You might be curious too!
#
import base64

print("Decoding the ACCESS_TOKEN:")
try:
    token_components = ACCESS_TOKEN.split(".")
    if len(token_components) == 3:
        header = json.loads(base64.b64decode(token_components[0]).decode())
        payload = json.loads(base64.b64decode(token_components[1]).decode())
        print("Token Header:",json.dumps(header,indent=4))
        print("Token Payload:",json.dumps(payload,indent=4))
        #print("Token Signature:",token_components[2])
        print("Token Signature: <value_suppressed>")
        #
        #  One should be able to use public/private keys to actually validate that signature - left as an exercise for later
        #
    else:
        print(f"The ACCESS_TOKEN appears to be improperly structured. It should have 3 components and it has {len(token_components)}")
except Exception as ex:
    print(f"Looks like the ACCESS_TOKEN is undefined or an empty value")
    raise(ex)


Decoding the ACCESS_TOKEN:
Looks like the ACCESS_TOKEN is undefined or an empty value


Error: Incorrect padding

In [98]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [138]:
ores_score = {} #blank dictionary to store scores

In [139]:
no_ores_score = [] #list to get those articles not giving an ores score

In [156]:
#in actual there are 7035 articles for which the scores were processed
for key, value in tqdm(pol_info.items(), desc="Processing articles"):
    article_title = key
    # Request the ORES score for the article
    score = request_ores_score_per_article(article_revid=ARTICLE_REVISIONS[article_title],
                                           email_address="anurag96@uw.edu",
                                           access_token=ACCESS_TOKEN)
    
    try:
        # Attempt to retrieve the prediction score in a single line
        ores_score[key] = score['enwiki']['scores'][f'{value}']['articlequality']['score']['prediction']
    except KeyError:
        # If any part of the path does not exist, the KeyError will be caught
        no_ores_score.append(key)

Processing articles: 100%|████████████████████| 755/755 [14:49<00:00,  1.18s/it]


In [159]:
#viewing the ores score
ores_score

{'Majah Ha Adrif': 'Start',
 'Haroon al-Afghani': 'B',
 'Tayyab Agha': 'Start',
 'Khadija Zahra Ahmadi': 'Stub',
 'Aziza Ahmadyar': 'Start',
 'Muqadasa Ahmadzai': 'Start',
 'Mohammad Sarwar Ahmedzai': 'Start',
 'Amir Muhammad Akhundzada': 'Start',
 'Nasrullah Baryalai Arsalai': 'Start',
 'Abdul Rahim Ayoubi': 'Start',
 'Ismael Balkhi': 'Start',
 'Abdul Baqi Turkistani': 'Stub',
 'Mohammad Ghous Bashiri': 'Start',
 'Jan Baz': 'Stub',
 'Bashir Ahmad Bezan': 'Start',
 'Rafiullah Bidar': 'Stub',
 'Mohammad Siddiq Chakari': 'Stub',
 'Cheragh Ali Cheragh': 'Start',
 'Nasir Ahmad Durrani': 'Stub',
 'Muhammad Hashim Esmatullahi': 'Stub',
 'Aimal Faizi': 'Stub',
 'Gajinder Singh Safri': 'Stub',
 'Sharif Ghalib': 'Start',
 'Hashmat Ghani Ahmadzai': 'C',
 'Abdul Ghani Ghani': 'Stub',
 'Ghulam Ghaus': 'Stub',
 'Ghulam Muhammad Ghobar': 'Stub',
 'Sayed Yousuf Halim': 'Stub',
 'Rangina Hamidi': 'C',
 'Sayed Zafar Hashemi': 'Start',
 'Qutbuddin Hilal': 'Start',
 'Mahboba Hoqomal': 'Stub',
 'Musa Hota

In [160]:
#checking if there are any None values coming in the output
unique_values = set(ores_score.values())

print(unique_values)

{'B', 'Start', 'C', 'Stub', 'GA', 'FA'}


In [236]:
#printing the error rate
print(f'The error rate is {len(no_ores_score)/len(pol_info)}%.')

The error rate is 0.0%.


# Step 3
Combining the Datasets

### Creating country mapping

In [161]:
pop = pd.read_csv('population_by_country_AUG.2024.csv') #uploading the population dataset

In [177]:
region = None
country_region_mapping = []

for index, row in pop.iterrows():
    geography = row['Geography']
    population = row['Population']

    if geography.isupper():#if all letters caps its a region else a country
        region = geography
    else:
        country_region_mapping.append({'country':geography,'region':region,'population':population})



In [178]:
country_region_pop_df = pd.DataFrame(country_region_mapping) #convert the dicionary to a dataframe

In [184]:
ores_score_df =  pd.DataFrame(list(ores_score.items()), columns=['Article', 'Score']) #converting ores score dictionary to dataframe

In [186]:
articles_rev_id_df = pd.DataFrame(list(pol_info.items()), columns=['Article', 'RevID']) #converting the dictionary fro articles and the rev id to a dataframe

Merging dataframes to get revID and Scores for article in a single dataframe

In [188]:
articles_info_df = pd.merge(ores_score_df, articles_rev_id_df, how='inner', on='Article')

In [237]:
articles_info_df.head()

Unnamed: 0,Article,Score,RevID
0,Majah Ha Adrif,Start,1233202991
1,Haroon al-Afghani,B,1230459615
2,Tayyab Agha,Start,1225661708
3,Khadija Zahra Ahmadi,Stub,1234741562
4,Aziza Ahmadyar,Start,1195651393


Merging the articles_info_df with initial dataset on poiltiicans to get country

In [194]:
articles_info_final_df = pd.merge(articles_info_df,pol, how = 'inner', left_on = 'Article', right_on = 'name')
articles_info_final_df = articles_info_final_df.drop(columns=['name']) #dropping name since its repetative

In [196]:
articles_info_final_df = articles_info_final_df.drop(columns=['url']) #dropping the url since its not required

In [238]:
articles_info_final_df.head()

Unnamed: 0,Article,Score,RevID,country
0,Majah Ha Adrif,Start,1233202991,Afghanistan
1,Haroon al-Afghani,B,1230459615,Afghanistan
2,Tayyab Agha,Start,1225661708,Afghanistan
3,Khadija Zahra Ahmadi,Stub,1234741562,Afghanistan
4,Aziza Ahmadyar,Start,1195651393,Afghanistan


### Checking which countries are not present in both the datasets

In [198]:
country_first_dataset = articles_info_final_df['country'].unique() #getting unique countries in first dataset and the second
country_second_dataset = country_region_pop_df['country'].unique()

In [200]:
country_first_set = set(country_first_dataset) #converting list to set for intersection operation
country_second_set = set(country_second_dataset)

# Find common countries in both datasets
common_countries = list(country_first_set.intersection(country_second_set))


In [202]:
unique_to_first = country_first_set - country_second_set #getting countires only present in first dataset
unique_to_second = country_second_set - country_first_set #getting countries only present in second dataset

# Write non-common countries to a text file
with open('wp_countries_no_match.txt', 'w') as file:
    file.write("Countries only in first dataset:\n")
    for country in unique_to_first:
        file.write(f"{country}\n")
    file.write("\nCountries only in second dataset:\n")
    for country in unique_to_second:
        file.write(f"{country}\n")

Making final dataframe

In [203]:
#filter datasets for common countries only
articles_info_final_df_filtered = articles_info_final_df[articles_info_final_df['country'].isin(common_countries)]
country_region_pop_df_filtered = country_region_pop_df[country_region_pop_df['country'].isin(common_countries)]

In [206]:
#merging the datasets to get final dataframe
merged_df = pd.merge(articles_info_final_df_filtered, country_region_pop_df_filtered, on='country', how='inner')

                   Article  Score       RevID      country          region  \
0           Majah Ha Adrif  Start  1233202991  Afghanistan      SOUTH ASIA   
1        Haroon al-Afghani      B  1230459615  Afghanistan      SOUTH ASIA   
2              Tayyab Agha  Start  1225661708  Afghanistan      SOUTH ASIA   
3     Khadija Zahra Ahmadi   Stub  1234741562  Afghanistan      SOUTH ASIA   
4           Aziza Ahmadyar  Start  1195651393  Afghanistan      SOUTH ASIA   
...                    ...    ...         ...          ...             ...   
6931      Josiah Tongogara      C  1203429435     Zimbabwe  EASTERN AFRICA   
6932     Langton Towungana   Stub  1246280093     Zimbabwe  EASTERN AFRICA   
6933     Sengezo Tshabangu  Start  1228478288     Zimbabwe  EASTERN AFRICA   
6934   Herbert Ushewokunze   Stub   959111842     Zimbabwe  EASTERN AFRICA   
6935          Denis Walker      C  1247902630     Zimbabwe  EASTERN AFRICA   

      population  
0           42.4  
1           42.4  
2     

In [207]:
merged_df.head()

Unnamed: 0,Article,Score,RevID,country,region,population
0,Majah Ha Adrif,Start,1233202991,Afghanistan,SOUTH ASIA,42.4
1,Haroon al-Afghani,B,1230459615,Afghanistan,SOUTH ASIA,42.4
2,Tayyab Agha,Start,1225661708,Afghanistan,SOUTH ASIA,42.4
3,Khadija Zahra Ahmadi,Stub,1234741562,Afghanistan,SOUTH ASIA,42.4
4,Aziza Ahmadyar,Start,1195651393,Afghanistan,SOUTH ASIA,42.4


In [208]:
#changing order of columns and rename it as per assignment
ordered_df = merged_df[['country', 'region', 'population', 'Article', 'RevID', 'Score']]

# Rename columns to match the desired names
ordered_df.columns = ['country', 'region', 'population', 'article_title', 'revision_id', 'article_quality']

# Save to CSV
ordered_df.to_csv('wp_politicians_by_country.csv', index=False)

# Step 4
Analysis

In [210]:
ordered_df.head()

Unnamed: 0,country,region,population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,42.4,Majah Ha Adrif,1233202991,Start
1,Afghanistan,SOUTH ASIA,42.4,Haroon al-Afghani,1230459615,B
2,Afghanistan,SOUTH ASIA,42.4,Tayyab Agha,1225661708,Start
3,Afghanistan,SOUTH ASIA,42.4,Khadija Zahra Ahmadi,1234741562,Stub
4,Afghanistan,SOUTH ASIA,42.4,Aziza Ahmadyar,1195651393,Start


In [211]:
#define high quality articles
ordered_df['is_high_quality'] = ordered_df['article_quality'].isin(['FA', 'GA'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ordered_df['is_high_quality'] = ordered_df['article_quality'].isin(['FA', 'GA'])


### Calculating on a country basis

In [214]:
country_stats = ordered_df.groupby('country').agg(
    total_articles=('article_title', 'count'),
    high_quality_articles=('is_high_quality', 'sum'),
    population=('population', 'first'),  # Use the first population entry, assuming it's consistent per country
    region=('region', 'first')  # Use the first region entry, assuming each country has a single region
).reset_index()

In [216]:
#per capita vavlues
country_stats['total_articles_per_capita'] = country_stats['total_articles'] / country_stats['population']
country_stats['high_quality_articles_per_capita'] = country_stats['high_quality_articles'] / country_stats['population']

### Calculating on a regiaonal basis

In [218]:
region_stats = country_stats.groupby('region').agg(
    total_articles=('total_articles', 'sum'),
    high_quality_articles=('high_quality_articles', 'sum'),
    population=('population', 'sum')  # Sum populations across countries in each region
).reset_index()

In [219]:
#per capita values
region_stats['total_articles_per_capita'] = region_stats['total_articles'] / region_stats['population']
region_stats['high_quality_articles_per_capita'] = region_stats['high_quality_articles'] / region_stats['population']

# Step 5
Results

In [239]:
# 1. Top 10 countries by coverage
top_10_countries_coverage = country_stats.nlargest(12, 'total_articles_per_capita') # i did 12 since monaco and tuvalu have 0 population in dataset

# 2. Bottom 10 countries by coverage
bottom_10_countries_coverage = country_stats.nsmallest(10, 'total_articles_per_capita')

# 3. Top 10 countries by high quality
top_10_countries_high_quality = country_stats.nlargest(10, 'high_quality_articles_per_capita')

# 4. Bottom 10 countries by high quality
bottom_10_countries_high_quality = country_stats.nsmallest(10, 'high_quality_articles_per_capita')

# 5. Geographic regions by total coverage (ranked in descending order)
regions_by_total_coverage = region_stats.sort_values(by='total_articles_per_capita', ascending=False)

# 6. Geographic regions by high quality coverage (ranked in descending order)
regions_by_high_quality_coverage = region_stats.sort_values(by='high_quality_articles_per_capita', ascending=False)

In [240]:
top_10_countries_coverage.style.set_caption("Top 10 Countries by Coverage")

Unnamed: 0,country,total_articles,high_quality_articles,population,region,total_articles_per_capita,high_quality_articles_per_capita
96,Monaco,10,0,0.0,WESTERN EUROPE,inf,
154,Tuvalu,1,0,0.0,OCEANIA,inf,
4,Antigua and Barbuda,32,0,0.1,CARIBBEAN,320.0,0.0
51,Federated States of Micronesia,14,0,0.1,OCEANIA,140.0,0.0
93,Marshall Islands,13,0,0.1,OCEANIA,130.0,0.0
149,Tonga,10,0,0.1,OCEANIA,100.0,0.0
12,Barbados,25,0,0.3,CARIBBEAN,83.333333,0.0
98,Montenegro,36,3,0.6,SOUTHERN EUROPE,60.0,5.0
125,Seychelles,6,0,0.1,EASTERN AFRICA,60.0,0.0
17,Bhutan,44,0,0.8,SOUTH ASIA,55.0,0.0


In [231]:
bottom_10_countries_coverage.style.set_caption("Bottom 10 Countries by Coverage")

Unnamed: 0,country,total_articles,high_quality_articles,population,region,total_articles_per_capita,high_quality_articles_per_capita
31,China,16,0,1411.3,EAST ASIA,0.011337,0.0
57,Ghana,3,1,34.1,WESTERN AFRICA,0.087977,0.029326
66,India,151,0,1428.6,SOUTH ASIA,0.105698,0.0
122,Saudi Arabia,5,2,36.9,WESTERN ASIA,0.135501,0.054201
164,Zambia,3,0,20.2,EASTERN AFRICA,0.148515,0.0
108,Norway,1,0,5.5,NORTHERN EUROPE,0.181818,0.0
70,Israel,2,0,9.8,WESTERN ASIA,0.204082,0.0
45,Egypt,32,1,105.2,NORTHERN AFRICA,0.304183,0.009506
37,Cote d'Ivoire,10,0,30.9,WESTERN AFRICA,0.323625,0.0
100,Mozambique,11,0,33.9,EASTERN AFRICA,0.324484,0.0


In [232]:
top_10_countries_high_quality.style.set_caption("Top 10 Countries by High Quality")

Unnamed: 0,country,total_articles,high_quality_articles,population,region,total_articles_per_capita,high_quality_articles_per_capita
98,Montenegro,36,3,0.6,SOUTHERN EUROPE,60.0,5.0
86,Luxembourg,27,2,0.7,WESTERN EUROPE,38.571429,2.857143
1,Albania,70,7,2.7,SOUTHERN EUROPE,25.925926,2.592593
76,Kosovo,26,4,1.7,SOUTHERN EUROPE,15.294118,2.352941
90,Maldives,33,1,0.6,SOUTH ASIA,55.0,1.666667
85,Lithuania,58,4,2.9,NORTHERN EUROPE,20.0,1.37931
38,Croatia,64,5,3.8,SOUTHERN EUROPE,16.842105,1.315789
62,Guyana,17,1,0.8,SOUTH AMERICA,21.25,1.25
111,Palestinian Territory,61,6,5.5,WESTERN ASIA,11.090909,1.090909
129,Slovenia,37,2,2.1,SOUTHERN EUROPE,17.619048,0.952381


In [233]:
bottom_10_countries_high_quality.style.set_caption("Bottom 10 Countries by High Quality")

Unnamed: 0,country,total_articles,high_quality_articles,population,region,total_articles_per_capita,high_quality_articles_per_capita
4,Antigua and Barbuda,32,0,0.1,CARIBBEAN,320.0,0.0
9,Bahamas,7,0,0.4,CARIBBEAN,17.5,0.0
12,Barbados,25,0,0.3,CARIBBEAN,83.333333,0.0
15,Belize,9,0,0.5,CENTRAL AMERICA,18.0,0.0
16,Benin,7,0,13.7,WESTERN AFRICA,0.510949,0.0
17,Bhutan,44,0,0.8,SOUTH ASIA,55.0,0.0
20,Botswana,3,0,2.7,SOUTHERN AFRICA,1.111111,0.0
27,Cape Verde,9,0,0.6,WESTERN AFRICA,15.0,0.0
29,Chad,21,0,18.3,MIDDLE AFRICA,1.147541,0.0
31,China,16,0,1411.3,EAST ASIA,0.011337,0.0


In [234]:
regions_by_total_coverage.style.set_caption("Geographic Regions by Total Coverage")

Unnamed: 0,region,total_articles,high_quality_articles,population,total_articles_per_capita,high_quality_articles_per_capita
8,NORTHERN EUROPE,189,9,27.8,6.798561,0.323741
9,OCEANIA,71,1,11.1,6.396396,0.09009
0,CARIBBEAN,213,9,36.6,5.819672,0.245902
14,SOUTHERN EUROPE,785,53,151.5,5.181518,0.349835
1,CENTRAL AMERICA,186,10,51.3,3.625731,0.194932
17,WESTERN EUROPE,490,21,181.3,2.702703,0.11583
5,EASTERN EUROPE,700,38,266.2,2.629602,0.14275
16,WESTERN ASIA,605,27,295.4,2.04807,0.091401
13,SOUTHERN AFRICA,122,8,68.3,1.786237,0.11713
4,EASTERN AFRICA,663,17,480.9,1.378665,0.03535


In [235]:
regions_by_high_quality_coverage.style.set_caption("Geographic Regions by High Quality Coverage")

Unnamed: 0,region,total_articles,high_quality_articles,population,total_articles_per_capita,high_quality_articles_per_capita
14,SOUTHERN EUROPE,785,53,151.5,5.181518,0.349835
8,NORTHERN EUROPE,189,9,27.8,6.798561,0.323741
0,CARIBBEAN,213,9,36.6,5.819672,0.245902
1,CENTRAL AMERICA,186,10,51.3,3.625731,0.194932
5,EASTERN EUROPE,700,38,266.2,2.629602,0.14275
13,SOUTHERN AFRICA,122,8,68.3,1.786237,0.11713
17,WESTERN EUROPE,490,21,181.3,2.702703,0.11583
16,WESTERN ASIA,605,27,295.4,2.04807,0.091401
9,OCEANIA,71,1,11.1,6.396396,0.09009
7,NORTHERN AFRICA,301,17,255.9,1.176241,0.066432
