# Stage 0: SETUP
The following libraries are used directly. For the full list of isntalled ppackages and versions, please see requuirements.txt

In [1]:
# For accessing ORES API
import requests

# For processing
import pandas as pd
import numpy as np


# Stage 1: Data Acquisition

Data is downloaded as csv files, and is already available in this repository in the data folder. See the readme for details on the source of the data.

## Page Data
Page data is downloaded from [this](https://figshare.com/articles/dataset/Untitled_Item/5513449) repository.

In [19]:
page_data = pd.read_csv('data/raw/page_data.csv')
page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


## Population Data
Population Data is downloaded from [this](https://docs.google.com/spreadsheets/d/1CFJO2zna2No5KqNm9rPK5PCACoXKzb-nycJFhV689Iw/edit#gid=283125346) google doc.

In [3]:
pop_data = pd.read_csv('data/raw/WPDS_2020_data.csv')
pop_data.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


# Stage 2: Data Processing
In this stage we combine and clean the data, and use the [ORES](https://github.com/wikimedia/ores) client to get predicted article quality.

# Clean and combine
Remove the templates from page data and regions from pop data

In [22]:
page_data_clean = page_data.loc[~page_data["page"].str.contains("^Template"), :]
pop_data_clean = pop_data.loc[pop_data["Type"] == "Country", :]

# Left join to keep countries without articles. Keep revid as int
combined_data = pop_data_clean.merge(page_data_clean, how="outer", left_on="Name", right_on="country")

# Record unmatched countries
unmatched_pop_data = combined_data[combined_data["page"].isna()].drop(columns=page_data_clean.columns)
unmatched_pop_data.to_csv('data/unmatched/wp_wpds_countries-no_match.csv')
print("{} Countries could not be matched".format(len(unmatched_pop_data)))

# Record unmatched pages
unmatched_page_data = combined_data[combined_data["Name"].isna()].drop(columns=pop_data_clean.columns)
unmatched_page_data.to_csv('data/unmatched/page_data-no_match.csv')
print("{} Pages could not be matched".format(len(unmatched_page_data)))

# Clean
combined_data_complete = combined_data.dropna()
combined_data_complete["rev_id"] = combined_data_complete["rev_id"].astype(int)

26 Countries could not be matched
1859 Pages could not be matched


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Get ORES Data
Data is acquired from the [ORES API](https://ores.wikimedia.org/v3/#). We request the "articlequality" model from the "enwiki" context for batches of revids at a time. Max 50 per request.

In [23]:
# Takes a batch of revids
def api_call(revids, context='enwiki', model='articlequality'):
    endpoint = "https://ores.wikimedia.org/v3/scores/{context}".format(context=context)
    headers = {
        'User-Agent': 'https://github.com/TheCaseca',
        'From': 'ccase20@uw.edu'
    }
    call = requests.get(endpoint, headers=headers, params={"models":model, "revids": "|".join(revids)})
    response = call.json()
    
    return response

In [36]:
# Index by revids for easier updating
revids = combined_data_complete["rev_id"]
combined_data_complete.set_index('rev_id')

revids = revids.astype(int)
revids = revids.astype(str)

n = 50
preds = {}
for i in range(len(revids)//n):
    if i % 100 == 0:
        print("Collecting rows {} to {} of {}".format(i*n, (i*100+1)*n, len(revids)))
    api_data = api_call(list(revids[i*n: (i+1)*n]))
    new_preds = {revid: score['articlequality'].get('score', {}).get('prediction') for revid, score in api_data['enwiki']['scores'].items()}
    preds.update(new_preds)

# Update dataframe
pred_df = pd.DataFrame.from_dict(preds, orient='index', columns=['prediction'])
pred_df.index = pred_df.index.astype(int)
combined_data_complete_preds = combined_data_complete.merge(pred_df, left_on='rev_id', right_index=True)
combined_data_complete_preds.head()

Collecting rows 0 to 50 of 44680
Collecting rows 50 to 100 of 44680
Collecting rows 100 to 150 of 44680
Collecting rows 150 to 200 of 44680
Collecting rows 200 to 250 of 44680
Collecting rows 250 to 300 of 44680
Collecting rows 300 to 350 of 44680
Collecting rows 350 to 400 of 44680
Collecting rows 400 to 450 of 44680
Collecting rows 450 to 500 of 44680
Collecting rows 500 to 550 of 44680
Collecting rows 550 to 600 of 44680
Collecting rows 600 to 650 of 44680
Collecting rows 650 to 700 of 44680
Collecting rows 700 to 750 of 44680
Collecting rows 750 to 800 of 44680
Collecting rows 800 to 850 of 44680
Collecting rows 850 to 900 of 44680
Collecting rows 900 to 950 of 44680
Collecting rows 950 to 1000 of 44680
Collecting rows 1000 to 1050 of 44680
Collecting rows 1050 to 1100 of 44680
Collecting rows 1100 to 1150 of 44680
Collecting rows 1150 to 1200 of 44680
Collecting rows 1200 to 1250 of 44680
Collecting rows 1250 to 1300 of 44680
Collecting rows 1300 to 1350 of 44680
Collecting rows 1

Collecting rows 10800 to 10850 of 44680
Collecting rows 10850 to 10900 of 44680
Collecting rows 10900 to 10950 of 44680
Collecting rows 10950 to 11000 of 44680
Collecting rows 11000 to 11050 of 44680
Collecting rows 11050 to 11100 of 44680
Collecting rows 11100 to 11150 of 44680
Collecting rows 11150 to 11200 of 44680
Collecting rows 11200 to 11250 of 44680
Collecting rows 11250 to 11300 of 44680
Collecting rows 11300 to 11350 of 44680
Collecting rows 11350 to 11400 of 44680
Collecting rows 11400 to 11450 of 44680
Collecting rows 11450 to 11500 of 44680
Collecting rows 11500 to 11550 of 44680
Collecting rows 11550 to 11600 of 44680
Collecting rows 11600 to 11650 of 44680
Collecting rows 11650 to 11700 of 44680
Collecting rows 11700 to 11750 of 44680
Collecting rows 11750 to 11800 of 44680
Collecting rows 11800 to 11850 of 44680
Collecting rows 11850 to 11900 of 44680
Collecting rows 11900 to 11950 of 44680
Collecting rows 11950 to 12000 of 44680
Collecting rows 12000 to 12050 of 44680


Collecting rows 21050 to 21100 of 44680
Collecting rows 21100 to 21150 of 44680
Collecting rows 21150 to 21200 of 44680
Collecting rows 21200 to 21250 of 44680
Collecting rows 21250 to 21300 of 44680
Collecting rows 21300 to 21350 of 44680
Collecting rows 21350 to 21400 of 44680
Collecting rows 21400 to 21450 of 44680
Collecting rows 21450 to 21500 of 44680
Collecting rows 21500 to 21550 of 44680
Collecting rows 21550 to 21600 of 44680
Collecting rows 21600 to 21650 of 44680
Collecting rows 21650 to 21700 of 44680
Collecting rows 21700 to 21750 of 44680
Collecting rows 21750 to 21800 of 44680
Collecting rows 21800 to 21850 of 44680
Collecting rows 21850 to 21900 of 44680
Collecting rows 21900 to 21950 of 44680
Collecting rows 21950 to 22000 of 44680
Collecting rows 22000 to 22050 of 44680
Collecting rows 22050 to 22100 of 44680
Collecting rows 22100 to 22150 of 44680
Collecting rows 22150 to 22200 of 44680
Collecting rows 22200 to 22250 of 44680
Collecting rows 22250 to 22300 of 44680


Collecting rows 31300 to 31350 of 44680
Collecting rows 31350 to 31400 of 44680
Collecting rows 31400 to 31450 of 44680
Collecting rows 31450 to 31500 of 44680
Collecting rows 31500 to 31550 of 44680
Collecting rows 31550 to 31600 of 44680
Collecting rows 31600 to 31650 of 44680
Collecting rows 31650 to 31700 of 44680
Collecting rows 31700 to 31750 of 44680
Collecting rows 31750 to 31800 of 44680
Collecting rows 31800 to 31850 of 44680
Collecting rows 31850 to 31900 of 44680
Collecting rows 31900 to 31950 of 44680
Collecting rows 31950 to 32000 of 44680
Collecting rows 32000 to 32050 of 44680
Collecting rows 32050 to 32100 of 44680
Collecting rows 32100 to 32150 of 44680
Collecting rows 32150 to 32200 of 44680
Collecting rows 32200 to 32250 of 44680
Collecting rows 32250 to 32300 of 44680
Collecting rows 32300 to 32350 of 44680
Collecting rows 32350 to 32400 of 44680
Collecting rows 32400 to 32450 of 44680
Collecting rows 32450 to 32500 of 44680
Collecting rows 32500 to 32550 of 44680


Collecting rows 41550 to 41600 of 44680
Collecting rows 41600 to 41650 of 44680
Collecting rows 41650 to 41700 of 44680
Collecting rows 41700 to 41750 of 44680
Collecting rows 41750 to 41800 of 44680
Collecting rows 41800 to 41850 of 44680
Collecting rows 41850 to 41900 of 44680
Collecting rows 41900 to 41950 of 44680
Collecting rows 41950 to 42000 of 44680
Collecting rows 42000 to 42050 of 44680
Collecting rows 42050 to 42100 of 44680
Collecting rows 42100 to 42150 of 44680
Collecting rows 42150 to 42200 of 44680
Collecting rows 42200 to 42250 of 44680
Collecting rows 42250 to 42300 of 44680
Collecting rows 42300 to 42350 of 44680
Collecting rows 42350 to 42400 of 44680
Collecting rows 42400 to 42450 of 44680
Collecting rows 42450 to 42500 of 44680
Collecting rows 42500 to 42550 of 44680
Collecting rows 42550 to 42600 of 44680
Collecting rows 42600 to 42650 of 44680
Collecting rows 42650 to 42700 of 44680
Collecting rows 42700 to 42750 of 44680
Collecting rows 42750 to 42800 of 44680


Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population,page,country,rev_id
0,DZ,Algeria,Country,2019.0,44.357,44357000.0,Ali Fawzi Rebaine,Algeria,686269631
1,DZ,Algeria,Country,2019.0,44.357,44357000.0,Ahmed Attaf,Algeria,705910185
2,DZ,Algeria,Country,2019.0,44.357,44357000.0,Ahmed Djoghlaf,Algeria,707427823
3,DZ,Algeria,Country,2019.0,44.357,44357000.0,Hammi Larouissi,Algeria,708060571
4,DZ,Algeria,Country,2019.0,44.357,44357000.0,Salah Goudjil,Algeria,708980561


We filter out any missing predictions and record them. 274 articles did not have a prediction from ORES.

In [51]:
# Recordd missing
missing_preds = combined_data_complete_preds[combined_data_complete_preds['prediction'].isna()]
missing_preds.to_csv('data/unmatched/wp_wpds_politicians-no_prediction.csv')
print("{} Pages could not be predicted".format(len(missing_preds)))

# Remove from data
final_data = combined_data_complete_preds.dropna(subset=['prediction'])

274 Pages could not be predicted


In [50]:
final_data

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population,page,country,rev_id,prediction
0,DZ,Algeria,Country,2019.0,44.357,44357000.0,Ali Fawzi Rebaine,Algeria,686269631,Stub
1,DZ,Algeria,Country,2019.0,44.357,44357000.0,Ahmed Attaf,Algeria,705910185,Stub
2,DZ,Algeria,Country,2019.0,44.357,44357000.0,Ahmed Djoghlaf,Algeria,707427823,Stub
3,DZ,Algeria,Country,2019.0,44.357,44357000.0,Hammi Larouissi,Algeria,708060571,Stub
4,DZ,Algeria,Country,2019.0,44.357,44357000.0,Salah Goudjil,Algeria,708980561,Stub
...,...,...,...,...,...,...,...,...,...,...
44833,VU,Vanuatu,Country,2019.0,0.321,321000.0,Ralph Regenvanu,Vanuatu,787719834,B
44834,VU,Vanuatu,Country,2019.0,0.321,321000.0,Sethy Regenvanu,Vanuatu,788200153,Stub
44835,VU,Vanuatu,Country,2019.0,0.321,321000.0,Ati George Sokomanu,Vanuatu,788303161,Stub
44836,VU,Vanuatu,Country,2019.0,0.321,321000.0,Antoine Wright (politician),Vanuatu,789446881,Start
