# Stage 0: SETUP
The following libraries are used directly. For the full list of isntalled ppackages and versions, please see requuirements.txt

In [1]:
# For accessing ORES API
import requests

# For processing
import pandas as pd
import numpy as np


# Stage 1: Data Acquisition

Data is downloaded as csv files, and is already available in this repository in the data folder. See the readme for details on the source of the data.

## Page Data
Page data is downloaded from [this](https://figshare.com/articles/dataset/Untitled_Item/5513449) repository.

In [2]:
page_data = pd.read_csv('data/raw/page_data.csv')
page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


## Population Data
Population Data is downloaded from [this](https://docs.google.com/spreadsheets/d/1CFJO2zna2No5KqNm9rPK5PCACoXKzb-nycJFhV689Iw/edit#gid=283125346) google doc.

In [91]:
pop_data = pd.read_csv('data/raw/WPDS_2020_data.csv')
pop_data.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


# Stage 2: Data Processing
In this stage we combine and clean the data, and use the [ORES](https://github.com/wikimedia/ores) client to get predicted article quality.

# Clean and combine
Remove the templates from page data and regions from pop data

In [94]:
# Add Region as column
regions = pop_data["Name"][pop_data["Type"] == "Sub-Region"]
regions.name = "region"
pop_data_with_region = pd.merge_asof(pop_data, regions, left_index=True, right_index=True)

# Filter out values
page_data_clean = page_data.loc[~page_data["page"].str.contains("^Template"), :]
pop_data_clean = pop_data_with_region.loc[pop_data_with_region["Type"] == "Country", :]

# Left join to keep countries without articles. Keep revid as int
combined_data = pop_data_clean.merge(page_data_clean, how="outer", left_on="Name", right_on="country")

# Record unmatched countries
unmatched_pop_data = combined_data[combined_data["page"].isna()].drop(columns=page_data_clean.columns)
unmatched_pop_data.to_csv('data/unmatched/wp_wpds_countries-no_match.csv')
print("{} Countries could not be matched".format(len(unmatched_pop_data)))

# Record unmatched pages
unmatched_page_data = combined_data[combined_data["Name"].isna()].drop(columns=pop_data_clean.columns)
unmatched_page_data.to_csv('data/unmatched/page_data-no_match.csv')
print("{} Pages could not be matched".format(len(unmatched_page_data)))

# Clean
combined_data_complete = combined_data.dropna() \
    .drop(columns=["FIPS", "Name", "Type","TimeFrame", "Data (M)"]) \
    .rename(columns={"country":"country", "name": "article_name", "rev_id":"revision_id", "Population": "population"})
combined_data_complete["revision_id"] = combined_data_complete["revision_id"].astype(int)

26 Countries could not be matched
1859 Pages could not be matched


## Get ORES Data
Data is acquired from the [ORES API](https://ores.wikimedia.org/v3/#). We request the "articlequality" model from the "enwiki" context for batches of revids at a time. Max 50 per request.

In [85]:
# Takes a batch of revids
def api_call(revids, context='enwiki', model='articlequality'):
    endpoint = "https://ores.wikimedia.org/v3/scores/{context}".format(context=context)
    headers = {
        'User-Agent': 'https://github.com/TheCaseca',
        'From': 'ccase20@uw.edu'
    }
    call = requests.get(endpoint, headers=headers, params={"models":model, "revids": "|".join(revids)})
    response = call.json()
    
    return response

In [86]:
# Clean revid format to str
revids = combined_data_complete["revision_id"]
revids = revids.astype(int)
revids = revids.astype(str)

n = 50
preds = {}
for i in range(len(revids)//n):
    if i % 100 == 0:
        print("Collecting rows {} to {} of {}".format(i*n, (i+1)*100*n, len(revids)))
    api_data = api_call(list(revids[i*n: (i+1)*n]))
    new_preds = {revid: score['articlequality'].get('score', {}).get('prediction') for revid, score in api_data['enwiki']['scores'].items()}
    preds.update(new_preds)

# Update dataframe
pred_df = pd.DataFrame.from_dict(preds, orient='index', columns=['article_quality_est'])
pred_df.index = pred_df.index.astype(int)
combined_data_complete_preds = combined_data_complete.merge(pred_df, left_on='revision_id', right_index=True)
combined_data_complete_preds.head()

Collecting rows 0 to 5000 of 44680
Collecting rows 5000 to 505000 of 44680
Collecting rows 10000 to 1005000 of 44680
Collecting rows 15000 to 1505000 of 44680
Collecting rows 20000 to 2005000 of 44680
Collecting rows 25000 to 2505000 of 44680
Collecting rows 30000 to 3005000 of 44680
Collecting rows 35000 to 3505000 of 44680
Collecting rows 40000 to 4005000 of 44680


Unnamed: 0,population,Region_x,Region_y,page,country,revision_id,article_quality_est
0,44357000.0,NORTHERN AFRICA,NORTHERN AFRICA,Ali Fawzi Rebaine,Algeria,686269631,Stub
1,44357000.0,NORTHERN AFRICA,NORTHERN AFRICA,Ahmed Attaf,Algeria,705910185,Stub
2,44357000.0,NORTHERN AFRICA,NORTHERN AFRICA,Ahmed Djoghlaf,Algeria,707427823,Stub
3,44357000.0,NORTHERN AFRICA,NORTHERN AFRICA,Hammi Larouissi,Algeria,708060571,Stub
4,44357000.0,NORTHERN AFRICA,NORTHERN AFRICA,Salah Goudjil,Algeria,708980561,Stub


We filter out any missing predictions and record them. 274 articles did not have a prediction from ORES.

In [101]:
# Recordd missing
missing_preds = combined_data_complete_preds[combined_data_complete_preds['article_quality_est'].isna()]
missing_preds.to_csv('data/unmatched/wp_wpds_politicians-no_prediction.csv')
print("{} Pages could not be predicted".format(len(missing_preds)))

# Remove from data, format and save
final_data = combined_data_complete_preds.dropna(subset=['article_quality_est'])
final_data.to_csv('wp_wpds_politicians_by_country.csv')


274 Pages could not be predicted


## Stage 5: Analysis
We analyze by comparing high-quality articles per population and per total articles. We define "high-quality" to be Good Article or Featured Article class.

In [113]:
data = pd.read_csv('wp_wpds_politicians_by_country.csv')

# High Quality column
data["High Quality"] = data["article_quality_est"].isin(["FA", "GA"]).astype(int)

# Group by country and take mean
data_by_country = data[["country", "population", "page", "High Quality"]] \
    .groupby("country") \
    .agg({"population": "mean", "page":"size", "High Quality": "sum"}) \
    .rename(columns={"population": "Pop", "page": "Article Count"})

# Add calcualted fields
data_by_country["Proportion HQ"] = data_by_country["High Quality"] / data_by_country["Article Count"]
data_by_country["Article Per Mil. People"] = data_by_country["Article Count"] / data_by_country["Pop"] * 1000000
print(data_by_country.head())

# Group by region and take mean
data_by_region = data[["region", "population", "page", "High Quality"]] \
    .groupby("region") \
    .agg({"population": "mean", "page":"size", "High Quality": "sum"}) \
    .rename(columns={"population": "Pop", "page": "Article Count"})

# Add calcualted fields
data_by_region["Proportion HQ"] = data_by_region["High Quality"] / data_by_region["Article Count"]
data_by_region["Article Per Mil. People"] = data_by_region["Article Count"] / data_by_region["Pop"] * 1000000

                    Pop  Article Count  High Quality  Proportion HQ  \
country                                                               
Afghanistan  38928000.0            319            13       0.040752   
Albania       2838000.0            456             3       0.006579   
Algeria      44357000.0            116             2       0.017241   
Andorra         82000.0             34             0       0.000000   
Angola       32522000.0            106             0       0.000000   

             Article Per Mil. People  
country                               
Afghanistan                 8.194616  
Albania                   160.676533  
Algeria                     2.615145  
Andorra                   414.634146  
Angola                      3.259332  


In [106]:
# Styler for tables below
style_args = {
    "precision":0,
    "na_rep":'MISSING',
    "thousands":",",
    "formatter":{
        "Proportion HQ": "{:.1%}",
        "Article Per Mil. People": "{0:,.2f}",
    }
}

### Top 10 Countries by Coverage
10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [107]:
data_by_country.sort_values(by="Article Per Mil. People", ascending=False).iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tuvalu,10000,54,4,7.4%,5400.0
Nauru,11000,52,0,0.0%,4727.27
San Marino,34000,81,0,0.0%,2382.35
Monaco,38000,40,0,0.0%,1052.63
Liechtenstein,39000,28,0,0.0%,717.95
Marshall Islands,57000,37,0,0.0%,649.12
Tonga,99000,63,0,0.0%,636.36
Iceland,368000,201,2,1.0%,546.2
Andorra,82000,34,0,0.0%,414.63
Federated States of Micronesia,106000,36,0,0.0%,339.62


### Bottom 10 Countries by Coverage
10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [108]:
data_by_country.sort_values(by="Article Per Mil. People").iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
India,1400100000,968,13,1.3%,0.69
Indonesia,271739000,209,9,4.3%,0.77
China,1402385000,1129,40,3.5%,0.81
Uzbekistan,34174000,28,3,10.7%,0.82
Ethiopia,114916000,101,2,2.0%,0.88
Zambia,18384000,25,0,0.0%,1.36
"Korea, North",25779000,36,8,22.2%,1.4
Thailand,66534000,112,3,2.7%,1.68
Mozambique,31166000,58,0,0.0%,1.86
Bangladesh,169809000,317,3,0.9%,1.87


### Top 10 Countries by Relative Quality
10 highest-ranked countries  in terms of the relative proportion of politician articles that are of GA and FA-quality

In [109]:
data_by_country.sort_values(by="Proportion HQ", ascending=False).iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Korea, North",25779000,36,8,22.2%,1.4
Saudi Arabia,35041000,117,15,12.8%,3.34
Romania,19241000,343,42,12.2%,17.83
Central African Republic,4830000,66,8,12.1%,13.66
Uzbekistan,34174000,28,3,10.7%,0.82
Mauritania,4650000,48,5,10.4%,10.32
Guatemala,18066000,83,7,8.4%,4.59
Dominica,72000,12,1,8.3%,166.67
Syria,19398000,128,10,7.8%,6.6
Benin,12209000,91,7,7.7%,7.45


### Bottom 10 Countries by Relative Quality
10 lowest-ranked countries  in terms of the relative proportion of politician articles that are of GA and FA-quality. 

Note that 37 countries had no High Quality articles, so they are subsequently sorted by Article Count in descending order to produce the bottom 10.

In [110]:
data_by_country.sort_values(by=["Proportion HQ", "Article Count"], ascending=[True, False]).iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Finland,5529000,569,0,0.0%,102.91
Moldova,3535000,421,0,0.0%,119.09
Estonia,1331000,148,0,0.0%,111.19
Costa Rica,5111000,147,0,0.0%,28.76
Tunisia,11896000,138,0,0.0%,11.6
Angola,32522000,106,0,0.0%,3.26
Solomon Islands,715000,97,0,0.0%,135.66
San Marino,34000,81,0,0.0%,2382.35
Kazakhstan,18732000,78,0,0.0%,4.16
Tonga,99000,63,0,0.0%,636.36


### Regions by Coverage
Regions terms of number of politician articles as a proportion of region population

In [114]:
data_by_region.sort_values(by="Article Per Mil. People", ascending=False).style.format(**style_args)

Unnamed: 0_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OCEANIA,14780099,3096,60,1.9%,209.47
Channel Islands,19288046,3763,102,2.7%,195.09
SOUTHERN EUROPE,27437086,3710,74,2.0%,135.22
WESTERN EUROPE,41808685,4560,56,1.2%,109.07
WESTERN ASIA,24003602,2563,89,3.5%,106.78
CARIBBEAN,6842079,695,13,1.9%,101.58
EASTERN EUROPE,50202939,3732,118,3.2%,74.34
EASTERN AFRICA,34800877,2502,35,1.4%,71.89
SOUTH AMERICA,59771201,3032,40,1.3%,50.73
MIDDLE AFRICA,13794516,665,16,2.4%,48.21


### Regions by Relative Quality
Regions  in terms of the relative proportion of politician articles that are of GA and FA-quality. 

In [115]:
data_by_region.sort_values(by="Proportion HQ", ascending=False).style.format(**style_args)

Unnamed: 0_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NORTHERN AMERICA,201142476,1901,104,5.5%,9.45
SOUTHEAST ASIA,82734684,2020,73,3.6%,24.42
WESTERN ASIA,24003602,2563,89,3.5%,106.78
EASTERN EUROPE,50202939,3732,118,3.2%,74.34
EAST ASIA,673606712,2473,76,3.1%,3.67
CENTRAL ASIA,13964951,245,7,2.9%,17.54
Channel Islands,19288046,3763,102,2.7%,195.09
MIDDLE AFRICA,13794516,665,16,2.4%,48.21
NORTHERN AFRICA,47502491,899,19,2.1%,18.93
SOUTHERN EUROPE,27437086,3710,74,2.0%,135.22
