In [14]:
#Imports

import pandas as pd
import numpy as np

# Dataset compilation

The function below, consolidate_datasets, takes in the filenames of the four datasets we're using and compiles them all together. This function does several things:
1. Adds the region column for each country
2. Merges all datasets together, drops unnecessary columns, and renames the columns
3. Documents which articles are missing revision IDs and therefore missing scores - omits them
4. Documents which countries have no population information - omits them

It then returns the finished dataset as a dataframe.

In [17]:
def consolidate_datasets(politicians_by_county_filename, population_by_country_filename, politician_revid_filename, score_predictions_filename):
    """
    Conslidates all datasets by giving their locations.
    Parameters:
        - politicians_by_country_filename: Takes in the location to the CSV 
        containing politicians and their country of origin
        - population_by_country_filename: Takes in the location to the CSV 
        containing countries/regions and their population
        - politician_revid_filename: Takes in the location to the CSV 
        containing politicians wikipedia titles and their last revision ID
        - score_predictions_filename: Takes in the location to the CSV 
        containing wikipedia articles and their predicted ORES score

    Returns a fully merged and cleaned dataset
    """
    
    # Loads in all datasets
    politicians_by_country = pd.read_csv(f"{politicians_by_county_filename}.csv")
    population_by_country = pd.read_csv(f"{population_by_country_filename}.csv")
    politician_revid = pd.read_csv(f"{politician_revid_filename}.csv")
    article_predictions = pd.read_csv(f"{score_predictions_filename}.csv")
    
    # adds region labels
    for index, row in population_by_country.iterrows():
        geography = row['Geography']
        if geography.isupper():  # Check if it is a region
            current_region = geography
        else:  # It's a country
            population_by_country.at[index, 'Region'] = current_region

    # Merges all data together and drops unnecessary columns
    pol_country_wiki_info = pd.merge(politicians_by_country, politician_revid, left_on='name', right_on='title', how='left')
    pol_country_wiki_info.drop(columns=['title'], inplace=True)

    wiki_pop = pd.merge(pol_country_wiki_info, population_by_country, left_on='country', right_on='Geography', how='left')
    wiki_pop.drop(columns=['Geography'], inplace=True)

    all_merged = pd.merge(wiki_pop, article_predictions, left_on='name', right_on='article_title', how='left')

    all_merged.drop(columns=['article_title'], inplace=True)
    all_merged.drop(columns=['url'], inplace=True)

    filtered_df = filter_missing_revid(all_merged)
    final_df = filter_missing_countries(filtered_df)
    complete_dataframe = final_df.copy()
    
    # renames columns
    complete_dataframe.rename(columns={'best_guess_rating': 'article_quality'}, inplace=True)
    complete_dataframe.rename(columns={'lastrevid': 'revision_id'}, inplace=True)
    complete_dataframe.rename(columns={'Population': 'population'}, inplace=True)
    complete_dataframe.rename(columns={'Region': 'region'}, inplace=True)
    complete_dataframe.rename(columns={'name': 'article_title'}, inplace=True)

    #converts revid back to integer

    complete_dataframe["revision_id"] = complete_dataframe["revision_id"].astype(int)
    
    return complete_dataframe

The function below, filter_missing_revid, is a helper function that takes in a dataframe, prints which politicians are missing revision IDs for their wikipedia page and then filters them out of the dataframe. Finally it returns the filtered dataframe.

In [15]:
def filter_missing_revid(df):
    """
    Prints articles/politicians that have no last revision ID then removes them from the dataset
        - df: The input is a dataframe containing 
        information about politicians, their wikipedia pages, and country information
    Returns a dataframe with missing revision ID rows removed
    """
        
    # Identify rows where 'lastrevid' is NA
    na_rows = df[df['lastrevid'].isna()]
    article_titles_with_na = na_rows['name'].tolist()

    # Print the number of such rows
    num_na_rows = len(na_rows)
    print("Number of articles with missing revisionIDs:", num_na_rows)
    # Print the 'article_title' values from the rows with NA in 'lastrevid'
    print("Politicians with missing revisionID:", article_titles_with_na)

    # Exclude those rows from the DataFrame
    return df.dropna(subset=['lastrevid'], inplace=True)
   

The function below, filter_missing_countries, is a helper function that takes in a dataframe, prints and writes which countries have no population information, and then filters out articles from those countries. 

In [16]:
def filter_missing_countries(input_df):
    """
    Prints countries that have population information then removes them from the dataset
        - input_df: The input is a dataframe containing 
        information about politicians, their wikipedia pages, and country information
    Returns a dataframe with population information removed
    """
    # Identify rows where 'Population' is NA
    na_rows = input_df[input_df['Population'].isna()]

    # Get unique values in the 'Country' column for those rows
    unique_countries_with_na = na_rows['country'].unique().tolist()
    print("Countries with missing population data:", unique_countries_with_na)

    # Save unique countries to a text file
    with open('results/wp_countries-no_match.txt', 'w') as f:
        for country in unique_countries_with_na:
            f.write(f"{country}\n")

    # Print the number of affected rows
    num_na_rows = len(na_rows)
    print("Number of rows filtered due to missing population data:", num_na_rows)

    # Filter out rows with NA in 'Population' in place
    return input_df.dropna(subset=['Population'], inplace=True)

The cell below runs consolidate_datasets and saves the dataframe to wp_politicians_by_country. It also prints the articles with missing revision ID, the countries with no population information, and the number of politicians/rows affected.

In [18]:
wp_politicians_by_country = consolidate_datasets(politicians_by_county_filename= "data/politicians_by_country_AUG.2024",
                     population_by_country_filename= "data/population_by_country_AUG.2024",
                     politician_revid_filename= "results/politician_revid",
                     score_predictions_filename= "results/prediction_df_final")

Number of articles with missing revisionIDs: 8
Politicians with missing revisionID: ['Barbara Eibinger-Miedl', 'Mehrali Gasimov', 'Kyaw Myint', 'André Ngongang Ouandji', 'Tomás Pimentel', 'Richard Sumah', "Segun ''Aeroland'' Adewale", 'Bashir Bililiqo']
Countries with missing population data: ['Guinea-Bissau', 'Korean', 'Korea, South']
Number of rows filtered due to missing population data: 142


The cell below saves the wp_politicians_by_country dataframe to a csv. This can be easily modified to save to a desired location.

In [19]:
wp_politicians_by_country.to_csv("results/wp_politicians_by_country.csv", index=False)

# Analysis

The function below, total_articles_per_capita is used to construct all 6 data tables. It uses the completed dataframe generated above and has several parameters which can be adjusted to get the desired tables. 
1. grouping_level can either be "region" or "country". "country" is used for the first four plots which focus on statistics by country, while the last two plots use "region" which focus on statistics by region.
2. top_or_bottom can either be "top" or "bottom" and determines whether to output the 10 highest or 10 lowest countries.
3. high_quality can either be True or False and indicates whether to just use high quality articles when computing the statistics.

One the parameters have been set, the function calculates the per capita statistics and returns the information as a dataframe/table. Because the population is given in millions, it multiplies the population by a million in order to get accurate per capita statistics.

In [20]:
def total_articles_per_capita(politician_information_df, grouping_level, top_or_bottom, high_quality=False):
    """
    Finds per capita statistics depending on the parameters selected
        - politician_information_df: The input is a dataframe containing 
        information about politicians, their wikipedia pages, and country information
        - grouping_level: String that's either 'region' or 'country'. 
        Specified to choose which column to groupby
        - top_or_bottom: String that's either 'top' or 'bottom' to choose 
        whether to return the top results or bottom results
        - high_quality: Boolean that's either True or False to determine 
        whether to filter to include only FA and GA articles.
    Returns a table with the ordered per capita results.
    """
    if grouping_level not in ['region', 'country']:
        raise ValueError("Parameter 'grouping_level' must be either 'region' or 'country'.")
    
    if top_or_bottom not in ['top', 'bottom']:
        raise ValueError("Parameter 'top_or_bottom' must be either 'top' or 'bottom'.")
    

    if high_quality:
        use_df = politician_information_df[politician_information_df['article_quality'].isin(['FA', 'GA'])].copy()
    else:
        use_df = politician_information_df.copy()
    
    # 0 population means <1000000 - but is misinterpreted to mean literally 0, therefore we replace them with Nan
    use_df['population'] = use_df['population'].replace(0, np.nan)

    # dummy variable to determine whether to sum the population (region) or just use one of the values (country)
    pop_manager = 'sum'
    # Group by 'region' and aggregate counts and sums
    if grouping_level == "country":
        pop_manager = "median"

    grouped_result = use_df.groupby(f'{grouping_level}').agg(
        count=(f'{grouping_level}', 'size'),        # Count of rows
        total_population=('population', f'{pop_manager}')  # Sum of Population
    ).reset_index()

    # Calculate rows per capita
    grouped_result['politicians_per_capita'] = grouped_result['count'] / (grouped_result['total_population'] * 1000000)
    grouped_result.drop(columns=["total_population", "count"], inplace=True)
    # Sort by rows_per_capita in descending order
    if top_or_bottom == 'top':
        grouped_result.sort_values(by='politicians_per_capita', ascending=False, inplace=True)
    else:
        grouped_result.sort_values(by='politicians_per_capita', ascending=True, inplace=True)
    
    if grouping_level == "region":
        return grouped_result
    else:
        return grouped_result.head(10)
    


All cells below are very similar, they just adjust the different parameters to get the tables for all six questions in order.

In [21]:
top_countries_by_coverage = total_articles_per_capita(wp_politicians_by_country, "country", "top", high_quality=False)
top_countries_by_coverage

Unnamed: 0,country,politicians_per_capita
4,Antigua and Barbuda,0.00033
51,Federated States of Micronesia,0.00014
93,Marshall Islands,0.00013
149,Tonga,0.0001
12,Barbados,8.3e-05
125,Seychelles,6e-05
98,Montenegro,6e-05
17,Bhutan,5.5e-05
90,Maldives,5.5e-05
121,Samoa,4e-05


In [22]:
bottom_countries_by_coverage = total_articles_per_capita(wp_politicians_by_country, "country", "bottom", high_quality=False)
bottom_countries_by_coverage

Unnamed: 0,country,politicians_per_capita
31,China,1.133707e-08
57,Ghana,8.797654e-08
66,India,1.056979e-07
122,Saudi Arabia,1.355014e-07
164,Zambia,1.485149e-07
108,Norway,1.818182e-07
70,Israel,2.040816e-07
45,Egypt,3.041825e-07
37,Cote d'Ivoire,3.236246e-07
50,Ethiopia,3.478261e-07


In [23]:
top_countries_by_high_quality = total_articles_per_capita(wp_politicians_by_country, "country", "top", high_quality=True)
top_countries_by_high_quality

Unnamed: 0,country,politicians_per_capita
63,Montenegro,5e-06
56,Luxembourg,2.857143e-06
1,Albania,2.592593e-06
50,Kosovo,2.352941e-06
58,Maldives,1.666667e-06
55,Lithuania,1.37931e-06
25,Croatia,1.315789e-06
40,Guyana,1.25e-06
70,Palestinian Territory,1.090909e-06
81,Slovenia,9.52381e-07


In [24]:
bottom_countries_by_high_quality = total_articles_per_capita(wp_politicians_by_country, "country", "bottom", high_quality=True)
bottom_countries_by_high_quality

Unnamed: 0,country,politicians_per_capita
9,Bangladesh,5.763689e-09
29,Egypt,9.505703e-09
31,Ethiopia,1.581028e-08
46,Japan,1.606426e-08
69,Pakistan,1.663202e-08
22,Colombia,1.915709e-08
23,Congo DR,1.955034e-08
100,Vietnam,2.022245e-08
95,Uganda,2.057613e-08
2,Algeria,2.136752e-08


In [25]:
regions_by_total_coverage = total_articles_per_capita(wp_politicians_by_country, "region", "top", high_quality=False)
regions_by_total_coverage

Unnamed: 0,region,politicians_per_capita
9,OCEANIA,6.480648e-07
8,NORTHERN EUROPE,1.643576e-07
0,CARIBBEAN,1.553149e-07
1,CENTRAL AMERICA,1.325063e-07
2,CENTRAL ASIA,5.343819e-08
16,WESTERN ASIA,4.558622e-08
14,SOUTHERN EUROPE,4.438479e-08
4,EASTERN AFRICA,2.77556e-08
17,WESTERN EUROPE,2.621211e-08
7,NORTHERN AFRICA,2.480717e-08


In [26]:
regions_by_high_quality = total_articles_per_capita(wp_politicians_by_country, "region", "top", high_quality=True)
regions_by_high_quality

Unnamed: 0,region,politicians_per_capita
8,NORTHERN EUROPE,1.969365e-07
1,CENTRAL AMERICA,1.083424e-07
9,OCEANIA,1.052632e-07
0,CARIBBEAN,1e-07
2,CENTRAL ASIA,6.518905e-08
16,WESTERN ASIA,4.872767e-08
14,SOUTHERN EUROPE,4.742729e-08
7,NORTHERN AFRICA,2.973068e-08
4,EASTERN AFRICA,2.678853e-08
17,WESTERN EUROPE,2.505967e-08
