# Stage 0: SETUP
The following libraries are used directly. For the full list of isntalled ppackages and versions, please see requuirements.txt

In [1]:
# For accessing ORES API
import requests

# For processing
import pandas as pd
import numpy as np


# Stage 1: Data Acquisition

Data is downloaded as csv files, and is already available in this repository in the data folder. See the readme for details on the source of the data.

## Page Data
Page data is downloaded from [this](https://figshare.com/articles/dataset/Untitled_Item/5513449) repository.

In [2]:
page_data = pd.read_csv('data/raw/page_data.csv')
page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


## Population Data
Population Data is downloaded from [this](https://docs.google.com/spreadsheets/d/1CFJO2zna2No5KqNm9rPK5PCACoXKzb-nycJFhV689Iw/edit#gid=283125346) google doc.

In [91]:
pop_data = pd.read_csv('data/raw/WPDS_2020_data.csv')
pop_data.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


# Stage 2: Data Processing
In this stage we combine and clean the data, and use the [ORES](https://github.com/wikimedia/ores) client to get predicted article quality.

# Clean and combine
Remove the templates from page data and regions from pop data

In [144]:
# Fix Channel Islands, which is incorrectly recorded as Sub-Region
pop_data.loc[168,'Type'] = 'Country'

# Add Region as column
regions = pop_data["Name"][pop_data["Type"] == "Sub-Region"]
regions.name = "region"
pop_data_with_region = pd.merge_asof(pop_data, regions, left_index=True, right_index=True)

# Filter out values
page_data_clean = page_data.loc[~page_data["page"].str.contains("^Template"), :]
pop_data_clean = pop_data_with_region.loc[pop_data_with_region["Type"] == "Country", :]

# Left join to keep countries without articles. Keep revid as int
combined_data = pop_data_clean.merge(page_data_clean, how="outer", left_on="Name", right_on="country")

# Record unmatched countries
unmatched_pop_data = combined_data[combined_data["page"].isna()].drop(columns=page_data_clean.columns)
unmatched_pop_data.to_csv('data/unmatched/wp_wpds_countries-no_match.csv')
print("{} Countries could not be matched".format(len(unmatched_pop_data)))

# Record unmatched pages
unmatched_page_data = combined_data[combined_data["Name"].isna()].drop(columns=pop_data_clean.columns)
unmatched_page_data.to_csv('data/unmatched/page_data-no_match.csv')
print("{} Pages could not be matched".format(len(unmatched_page_data)))

# Clean
combined_data_complete = combined_data.dropna() \
    .drop(columns=["FIPS", "Name", "Type","TimeFrame", "Data (M)"]) \
    .rename(columns={"country":"country", "name": "article_name", "rev_id":"revision_id", "Population": "population"})
combined_data_complete["revision_id"] = combined_data_complete["revision_id"].astype(int)

27 Countries could not be matched
1859 Pages could not be matched


## Get ORES Data
Data is acquired from the [ORES API](https://ores.wikimedia.org/v3/#). We request the "articlequality" model from the "enwiki" context for batches of revids at a time. Max 50 per request.

In [145]:
# Takes a batch of revids
def api_call(revids, context='enwiki', model='articlequality'):
    endpoint = "https://ores.wikimedia.org/v3/scores/{context}".format(context=context)
    headers = {
        'User-Agent': 'https://github.com/TheCaseca',
        'From': 'ccase20@uw.edu'
    }
    call = requests.get(endpoint, headers=headers, params={"models":model, "revids": "|".join(revids)})
    response = call.json()
    
    return response

In [146]:
# Clean revid format to str
revids = combined_data_complete["revision_id"]
revids = revids.astype(int)
revids = revids.astype(str)

n = 50
preds = {}
for i in range(len(revids)//n):
    if i % 100 == 0:
        print("Collecting rows {} to {} of {}".format(i*n, (i+1)*100*n, len(revids)))
    api_data = api_call(list(revids[i*n: (i+1)*n]))
    new_preds = {revid: score['articlequality'].get('score', {}).get('prediction') for revid, score in api_data['enwiki']['scores'].items()}
    preds.update(new_preds)

# Update dataframe
pred_df = pd.DataFrame.from_dict(preds, orient='index', columns=['article_quality_est'])
pred_df.index = pred_df.index.astype(int)
combined_data_complete_preds = combined_data_complete.merge(pred_df, left_on='revision_id', right_index=True)
combined_data_complete_preds.head()

Collecting rows 0 to 5000 of 44680
Collecting rows 5000 to 505000 of 44680
Collecting rows 10000 to 1005000 of 44680
Collecting rows 15000 to 1505000 of 44680
Collecting rows 20000 to 2005000 of 44680
Collecting rows 25000 to 2505000 of 44680
Collecting rows 30000 to 3005000 of 44680
Collecting rows 35000 to 3505000 of 44680
Collecting rows 40000 to 4005000 of 44680


Unnamed: 0,population,region,page,country,revision_id,article_quality_est
0,44357000.0,NORTHERN AFRICA,Ali Fawzi Rebaine,Algeria,686269631,Stub
1,44357000.0,NORTHERN AFRICA,Ahmed Attaf,Algeria,705910185,Stub
2,44357000.0,NORTHERN AFRICA,Ahmed Djoghlaf,Algeria,707427823,Stub
3,44357000.0,NORTHERN AFRICA,Hammi Larouissi,Algeria,708060571,Stub
4,44357000.0,NORTHERN AFRICA,Salah Goudjil,Algeria,708980561,Stub


We filter out any missing predictions and record them. 274 articles did not have a prediction from ORES.

In [147]:
# Recordd missing
missing_preds = combined_data_complete_preds[combined_data_complete_preds['article_quality_est'].isna()]
missing_preds.to_csv('data/unmatched/wp_wpds_politicians-no_prediction.csv')
print("{} Pages could not be predicted".format(len(missing_preds)))

# Remove from data, format and save
final_data = combined_data_complete_preds.dropna(subset=['article_quality_est'])
final_data.to_csv('wp_wpds_politicians_by_country.csv')


274 Pages could not be predicted


## Stage 3: Analysis
We analyze by comparing high-quality articles per population and per total articles. We define "high-quality" to be Good Article or Featured Article class.

In [158]:
data = pd.read_csv('wp_wpds_politicians_by_country.csv')

# High Quality column
data["High Quality"] = data["article_quality_est"].isin(["FA", "GA"]).astype(int)

# Group by country and take mean
data_by_country = data[["country", "region", "population", "page", "High Quality"]] \
    .groupby(["country", "region"]) \
    .agg({"population": "mean", "page":"size", "High Quality": "sum"}) \
    .rename(columns={"population": "Pop", "page": "Article Count"})

# Add calcualted fields
data_by_country["Proportion HQ"] = data_by_country["High Quality"] / data_by_country["Article Count"]
data_by_country["Article Per Mil. People"] = data_by_country["Article Count"] / data_by_country["Pop"] * 1000000
data_by_country.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,SOUTH ASIA,38928000.0,319,13,0.040752,8.194616
Albania,SOUTHERN EUROPE,2838000.0,456,3,0.006579,160.676533
Algeria,NORTHERN AFRICA,44357000.0,116,2,0.017241,2.615145
Andorra,SOUTHERN EUROPE,82000.0,34,0,0.0,414.634146
Angola,MIDDLE AFRICA,32522000.0,106,0,0.0,3.259332


In [159]:
# Styler for tables below
style_args = {
    "precision":0,
    "na_rep":'MISSING',
    "thousands":",",
    "formatter":{
        "Proportion HQ": "{:.1%}",
        "Article Per Mil. People": "{0:,.2f}",
    }
}

### Top 10 Countries by Coverage
10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [160]:
data_by_country.sort_values(by="Article Per Mil. People", ascending=False).iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tuvalu,OCEANIA,10000,54,4,7.4%,5400.0
Nauru,OCEANIA,11000,52,0,0.0%,4727.27
San Marino,SOUTHERN EUROPE,34000,81,0,0.0%,2382.35
Monaco,WESTERN EUROPE,38000,40,0,0.0%,1052.63
Liechtenstein,WESTERN EUROPE,39000,28,0,0.0%,717.95
Marshall Islands,OCEANIA,57000,37,0,0.0%,649.12
Tonga,OCEANIA,99000,63,0,0.0%,636.36
Iceland,NORTHERN EUROPE,368000,201,2,1.0%,546.2
Andorra,SOUTHERN EUROPE,82000,34,0,0.0%,414.63
Federated States of Micronesia,OCEANIA,106000,36,0,0.0%,339.62


### Bottom 10 Countries by Coverage
10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [161]:
data_by_country.sort_values(by="Article Per Mil. People").iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
India,SOUTH ASIA,1400100000,968,13,1.3%,0.69
Indonesia,SOUTHEAST ASIA,271739000,209,9,4.3%,0.77
China,EAST ASIA,1402385000,1129,40,3.5%,0.81
Uzbekistan,CENTRAL ASIA,34174000,28,3,10.7%,0.82
Ethiopia,EASTERN AFRICA,114916000,101,2,2.0%,0.88
Zambia,EASTERN AFRICA,18384000,25,0,0.0%,1.36
"Korea, North",EAST ASIA,25779000,36,8,22.2%,1.4
Thailand,SOUTHEAST ASIA,66534000,112,3,2.7%,1.68
Mozambique,EASTERN AFRICA,31166000,58,0,0.0%,1.86
Bangladesh,SOUTH ASIA,169809000,317,3,0.9%,1.87


### Top 10 Countries by Relative Quality
10 highest-ranked countries  in terms of the relative proportion of politician articles that are of GA and FA-quality

In [162]:
data_by_country.sort_values(by="Proportion HQ", ascending=False).iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Korea, North",EAST ASIA,25779000,36,8,22.2%,1.4
Saudi Arabia,WESTERN ASIA,35041000,117,15,12.8%,3.34
Romania,EASTERN EUROPE,19241000,343,42,12.2%,17.83
Central African Republic,MIDDLE AFRICA,4830000,66,8,12.1%,13.66
Uzbekistan,CENTRAL ASIA,34174000,28,3,10.7%,0.82
Mauritania,WESTERN AFRICA,4650000,48,5,10.4%,10.32
Guatemala,CENTRAL AMERICA,18066000,83,7,8.4%,4.59
Dominica,CARIBBEAN,72000,12,1,8.3%,166.67
Syria,WESTERN ASIA,19398000,128,10,7.8%,6.6
Benin,WESTERN AFRICA,12209000,91,7,7.7%,7.45


### Bottom 10 Countries by Relative Quality
10 lowest-ranked countries  in terms of the relative proportion of politician articles that are of GA and FA-quality. 

Note that 37 countries had no High Quality articles, so they are subsequently sorted by Article Count in descending order to produce the bottom 10.

In [163]:
data_by_country.sort_values(by=["Proportion HQ", "Article Count"], ascending=[True, False]).iloc[0:10,].style.format(**style_args)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pop,Article Count,High Quality,Proportion HQ,Article Per Mil. People
country,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Finland,NORTHERN EUROPE,5529000,569,0,0.0%,102.91
Moldova,EASTERN EUROPE,3535000,421,0,0.0%,119.09
Estonia,NORTHERN EUROPE,1331000,148,0,0.0%,111.19
Costa Rica,CENTRAL AMERICA,5111000,147,0,0.0%,28.76
Tunisia,NORTHERN AFRICA,11896000,138,0,0.0%,11.6
Angola,MIDDLE AFRICA,32522000,106,0,0.0%,3.26
Solomon Islands,OCEANIA,715000,97,0,0.0%,135.66
San Marino,SOUTHERN EUROPE,34000,81,0,0.0%,2382.35
Kazakhstan,CENTRAL ASIA,18732000,78,0,0.0%,4.16
Tonga,OCEANIA,99000,63,0,0.0%,636.36


In [172]:
# Group by region and take mean
data_by_region = data[["region", "page", "High Quality"]] \
    .groupby("region") \
    .agg({"page":"size", "High Quality": "sum"}) \
    .rename(columns={"page": "Article Count"})

# Use region populations to account for missing countries
pop_data = pd.read_csv('data/raw/WPDS_2020_data.csv')
data_by_region_with_pop = data_by_region \
    .reset_index() \
    .merge(pop_data[["Name", "Population"]], left_on="region", right_on="Name") \
    .rename(columns={'Population': "Pop"}) \
    .drop(columns=["Name"]) \
    .set_index("region")

# Add calcualted fields
data_by_region_with_pop["Proportion HQ"] = data_by_region_with_pop["High Quality"] / data_by_region_with_pop["Article Count"]
data_by_region_with_pop["Article Per Mil. People"] = data_by_region_with_pop["Article Count"] / data_by_region_with_pop["Pop"] * 1000000
data_by_region_with_pop.head()

Unnamed: 0_level_0,Article Count,High Quality,Pop,Proportion HQ,Article Per Mil. People
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CARIBBEAN,695,13,43233000,0.018705,16.075683
CENTRAL AMERICA,1543,23,178611000,0.014906,8.638886
CENTRAL ASIA,245,7,74961000,0.028571,3.268366
EAST ASIA,2473,76,1641063000,0.030732,1.50695
EASTERN AFRICA,2502,35,444970000,0.013989,5.622851


### Regions by Coverage
Regions terms of number of politician articles as a proportion of region population

In [173]:
data_by_region_with_pop.sort_values(by="Article Per Mil. People", ascending=False).style.format(**style_args)

Unnamed: 0_level_0,Article Count,High Quality,Pop,Proportion HQ,Article Per Mil. People
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OCEANIA,3096,60,43155000,1.9%,71.74
NORTHERN EUROPE,3763,102,105990000,2.7%,35.5
SOUTHERN EUROPE,3710,74,153251000,2.0%,24.21
WESTERN EUROPE,4560,56,195479000,1.2%,23.33
CARIBBEAN,695,13,43233000,1.9%,16.08
EASTERN EUROPE,3732,118,291902000,3.2%,12.79
WESTERN ASIA,2563,89,280927000,3.5%,9.12
CENTRAL AMERICA,1543,23,178611000,1.5%,8.64
SOUTH AMERICA,3032,40,429191000,1.3%,7.06
SOUTHERN AFRICA,472,9,67732000,1.9%,6.97


### Regions by Relative Quality
Regions  in terms of the relative proportion of politician articles that are of GA and FA-quality. 

In [174]:
data_by_region_with_pop.sort_values(by="Proportion HQ", ascending=False).style.format(**style_args)

Unnamed: 0_level_0,Article Count,High Quality,Pop,Proportion HQ,Article Per Mil. People
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NORTHERN AMERICA,1901,104,368193000,5.5%,5.16
SOUTHEAST ASIA,2020,73,661845000,3.6%,3.05
WESTERN ASIA,2563,89,280927000,3.5%,9.12
EASTERN EUROPE,3732,118,291902000,3.2%,12.79
EAST ASIA,2473,76,1641063000,3.1%,1.51
CENTRAL ASIA,245,7,74961000,2.9%,3.27
NORTHERN EUROPE,3763,102,105990000,2.7%,35.5
MIDDLE AFRICA,665,16,179757000,2.4%,3.7
NORTHERN AFRICA,899,19,244344000,2.1%,3.68
SOUTHERN EUROPE,3710,74,153251000,2.0%,24.21
