### Obtaining data

In [1]:
import pandas as pd

In [2]:
page_data = pd.read_csv('./data/page_data.csv')
WPDS_2020_data = pd.read_csv('./data/WPDS_2020_data.csv')
page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


### Cleaning our Data

In [3]:
# Removing page names that start with "Template:"
page_data = page_data.loc[~page_data['page'].str.startswith('Template:'), ]
page_data

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


In [4]:
population_dataset = WPDS_2020_data.copy()
population_dataset

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.850,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.010,10000


In [5]:
# Selecting only country level counts. 
# Country and Region level counts are distinguished by string case.

countries = population_dataset.loc[~population_dataset['Name'].str.isupper(),]
countries.to_csv('./data/WPDS_2020_countries_data.csv', index = False)
countries

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000
5,LY,Libya,Country,2019,6.891,6891000
6,MA,Morocco,Country,2019,35.952,35952000
7,SD,Sudan,Country,2019,43.849,43849000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.010,10000


### Getting Article Quality Predictions

Now you need to get the predicted quality scores for each article in the Wikipedia dataset. We're using a machine learning system called ORES. This was originally an acronym for "Objective Revision Evaluation Service" but was simply renamed “ORES”. ORES is a machine learning tool that can provide estimates of Wikipedia article quality. The article quality estimates are, from best to worst:

1. FA - Featured article
2. GA - Good article
3. B - B-class article
4. C - C-class article
5. Start - Start-class article
6. Stub - Stub-class article

These were learned based on articles in Wikipedia that were peer-reviewed using the [Wikipedia content assessment procedures](https://en.wikipedia.org/wiki/Wikipedia:Content_assessment).These quality classes are a sub-set of quality assessment categories developed by Wikipedia editors. For this assignment, you only need to know that these categories exist, and that ORES will assign one of these 6 categories to any rev_id you send it.
In order to get article predictions for each article in the Wikipedia dataset, you will first need to read page_data.csv into Python (or R), and then read through the dataset line by line, using the value of the rev_id column to make an API query.

##### Using the REST API endpoint (Python)

In [6]:
import math
import numpy as np
import json
import requests
from tqdm import tqdm
from urllib.parse import urlencode

In [7]:
def api_call(endpoint,parameters):
    call = requests.get(endpoint.format(**parameters))
    response = call.json()
    return response

This ORES REST API has a restriction of 50 rev_ids per batch, beyond which we get API call failures. Hence we will maintain this limit.

In [8]:
num_revids_per_batch = 50
endpoint = 'https://ores.wikimedia.org/v3/scores/enwiki?models=articlequality&revids={revid}'

page_data_copy = page_data.copy()
page_data_copy['articlequality'] = np.NaN
page_data_copy.set_index('rev_id', inplace=True)
revids = page_data_copy.index.to_list()
num_lists = round(len(revids) / num_revids_per_batch)
revids = list(map(list, np.array_split(revids, num_lists)))
iterable_revids = tqdm(revids)
page_data_copy.head()

  0%|                                                                                          | 0/934 [00:00<?, ?it/s]

Unnamed: 0_level_0,page,country,articlequality
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
355319463,Bir I of Kanem,Chad,
393276188,Information Minister of the Palestinian Nation...,Palestinian Territory,
393822005,Yos Por,Cambodia,
395521877,Julius Gregr,Czech Republic,
395526568,Edvard Gregr,Czech Republic,


In [9]:
for revid_batch in iterable_revids:
    query_parms = {
        'revids': '|'.join(str(x) for x in revid_batch),
        'models': 'articlequality'
    }
    params = {
        'context': 'enwiki',
    }
    # API Call for prediction
    response = api_call('https://ores.wikimedia.org/v3/scores/{context}?'+urlencode(query_parms), params)
    # Getting correct prediction for each rev_id
    try:
        scores = response[CONTEXT]['scores']
    except:
        continue
    
    for each_rev_id in scores.keys():
        try:
            prediction = scores[each_rev_id][MODEL]['score']['prediction']
        except:
            continue
        
        each_rev_id = int(each_rev_id)
        page_data_copy.loc[each_rev_id, MODEL] = prediction

 87%|█████████████████████████████████████████████████████████████████████▏          | 808/934 [05:07<00:47,  2.63it/s]


ConnectionError: HTTPSConnectionPool(host='ores.wikimedia.org', port=443): Max retries exceeded with url: /v3/scores/enwiki?revids=803288472%7C803289828%7C803290431%7C803291528%7C803291629%7C803291657%7C803292166%7C803294020%7C803294527%7C803295897%7C803298752%7C803300602%7C803300682%7C803301403%7C803302503%7C803302597%7C803303116%7C803304465%7C803306591%7C803306594%7C803306876%7C803307184%7C803307197%7C803308332%7C803309174%7C803314691%7C803315343%7C803315818%7C803317036%7C803319558%7C803324351%7C803325525%7C803326270%7C803328007%7C803328250%7C803328528%7C803328707%7C803328817%7C803328841%7C803328854%7C803329075%7C803330007%7C803331098%7C803332354%7C803333173%7C803333498%7C803335436%7C803336359%7C803336424%7C803336732&models=articlequality (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002976C80E2B0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

### Combining Datasets

In [127]:
page_data_copy

NameError: name 'df_pcd' is not defined

In [None]:
full_results = df.merge(population, how = 'outer', left_on = 'country', right_on = 'Name')

# Find data that is missing in either table and save assuming we are running from src folder
missing_results = full_results.loc[(full_results['country'].isnull() | full_results['Name'].isnull())]
missing_results.to_csv('../data_clean/wp_wpds_countries-no_match.csv', index = False)