In [1]:
import pandas as pd
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
%matplotlib inline

In [2]:
# Load the WPDS 2018 population table (path is relative to the working directory).
wpds_csv_path = "WPDS_2018_data.csv"
wpds = pd.read_csv(wpds_csv_path)
# Preview the first rows as the cell output.
wpds.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


Download the article data from https://figshare.com/articles/Untitled_Item/5513449 and extract it into the main directory of the project.

In [3]:
# Location of the article CSV inside the extracted data folder;
# edit file_path if the archive was extracted somewhere else.
file_path = 'country/data/page_data.csv'
# Load the article table and preview its first rows.
page_data = pd.read_csv(filepath_or_buffer=file_path)
page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [4]:
# ORES scoring endpoint, as described on https://www.mediawiki.org/wiki/ORES
# Only the wp10 model for English Wikipedia (enwiki) is queried; the
# {rev_ids} placeholder is filled in per request.
api_endpoint = (
    'https://ores.wikimedia.org/v3/scores/enwiki/'
    '?models=wp10&revids={rev_ids}'
)
# Identification values intended to accompany every request.
headers = {
    'User-Agent': 'https://github.com/CoderHam',
    'From': 'hemantj@uw.edu',
}

We split the rev_ids into chunks of $50$ and make one request to the API per chunk ($50$ rev_ids per request).

In [5]:
# Every rev_id from the article table, in table order.
all_revids_list = list(page_data['rev_id'])
# Cut the list into consecutive slices of at most 50 ids (the last may be shorter).
revids_chunks = []
for start in range(0, len(all_revids_list), 50):
    revids_chunks.append(all_revids_list[start:start + 50])

In [6]:
def get_ores_data_parallel(revids_chunk):
    """Fetch the ORES scores for one chunk of revision ids.

    Parameters
    ----------
    revids_chunk : iterable
        Revision ids to score in a single request; each is converted with
        str() and the results are joined with '|'.

    Returns
    -------
    The value stored under ['enwiki']['scores'] in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    params = {'rev_ids': '|'.join(str(rid) for rid in revids_chunk)}
    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, so passing it positionally would send
    # User-Agent/From as query-string parameters instead of HTTP headers.
    resp = requests.get(api_endpoint.format(**params), headers=headers)
    # Fail with a clear HTTP error rather than a confusing KeyError or JSON
    # decoding error further down.
    resp.raise_for_status()
    return resp.json()['enwiki']['scores']

Making all the requests serially (the cell below reports $944$ of them) is very slow, as the commented-out timing shows:

In [7]:
# Timing recorded for a serial run over every rev_id (kept for reference, not executed):
# %time get_ores_data(list(page_data['rev_id']))
# CPU times: user 33.8 s, sys: 1.02 s, total: 34.9 s
# Wall time: 4min 32s
request_count = len(revids_chunks)
print("Note: We are making a total of", request_count, "requests")

Note: We are making a total of 944 requests


In [8]:
pool = multiprocessing.Pool(processes=4)
%time json_output = pool.map(get_ores_data_parallel,revids_chunks)
pool.close()
pool.join()

CPU times: user 297 ms, sys: 105 ms, total: 403 ms
Wall time: 1min 20s


After parallelizing over $4$ worker processes the wall time drops from 4min 32s to 1min 20s, a speedup of approximately $3.4\times$.

In [9]:
def process_json(json_output):
    """Flatten per-chunk ORES results into a rev_id / article_quality table.

    Parameters
    ----------
    json_output : iterable of dict
        One mapping per request, keyed by rev_id; the prediction is read
        from entry['wp10']['score']['prediction'].

    Returns
    -------
    pandas.DataFrame
        Columns 'article_quality' and 'rev_id' (rev_id keeps the type of the
        mapping key). Rev_ids without a readable prediction are left out.
        Empty input yields an empty frame that still has both columns.
    """
    records = []
    for chunk_scores in json_output:
        for rid in chunk_scores:
            try:
                rating = chunk_scores[rid]['wp10']['score']['prediction']
            except (KeyError, TypeError):
                # No readable score for this rev_id: skip it rather than
                # recording a missing value.
                continue
            records.append({'article_quality': rating, 'rev_id': rid})
    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['article_quality', 'rev_id'])

In [10]:
# Flatten the per-chunk ORES responses into a single rev_id / article_quality table.
revid_rating = process_json(json_output)

In [11]:
# process_json takes rev_id from the keys of the parsed JSON response, so the
# values are converted to integers here, matching the integer rev_id shown in
# page_data.
revid_rating['rev_id'] = revid_rating['rev_id'].map(int)
# Preview the rating table.
revid_rating.head()

Unnamed: 0,article_quality,rev_id
0,Stub,355319463
1,Stub,391862046
2,Stub,391862070
3,Stub,391862409
4,Stub,391862819


In [12]:
# Persist the rating table; neither the index nor an index label is written.
revid_rating.to_csv(
    'revid_rating.csv',
    index=False,
    index_label=False,
)

We now want to join/merge the **revid_rating** data with the article data in **page_data**. 

In [13]:
# Inner join on rev_id: only articles that have a rating row are kept.
master_pagedata_rating = pd.merge(page_data, revid_rating, on='rev_id', how='inner')
master_pagedata_rating.head()

Unnamed: 0,page,country,rev_id,article_quality
0,Bir I of Kanem,Chad,355319463,Stub
1,Template:Zimbabwe-politician-stub,Zimbabwe,391862046,Stub
2,Template:Uganda-politician-stub,Uganda,391862070,Stub
3,Template:Namibia-politician-stub,Namibia,391862409,Stub
4,Template:Nigeria-politician-stub,Nigeria,391862819,Stub


We now want to join/merge the **wpds** (population) data with the merged rating+article data in **master_pagedata_rating**. 

In [14]:
# Give the population table the same key column name as the article table,
# then inner-join the two on it.
country_column = {'Geography': 'country'}
wpds.rename(columns=country_column, inplace=True)
master = pd.merge(master_pagedata_rating, wpds, on='country')

In [15]:
# Final column names for the combined table.
final_column_names = {
    'Population mid-2018 (millions)': 'population',
    'page': 'article_name',
    'rev_id': 'revision_id',
}
master.rename(columns=final_column_names, inplace=True)
master.head()

Unnamed: 0,article_name,country,revision_id,article_quality,population
0,Bir I of Kanem,Chad,355319463,Stub,15.4
1,Abdullah II of Kanem,Chad,498683267,Stub,15.4
2,Salmama II of Kanem,Chad,565745353,Stub,15.4
3,Kuri I of Kanem,Chad,565745365,Stub,15.4
4,Mohammed I of Kanem,Chad,565745375,Stub,15.4


In [16]:
# Persist the combined table; neither the index nor an index label is written.
master.to_csv(
    'master_data.csv',
    index=False,
    index_label=False,
)

# Analysis and Results

In [24]:
from collections import Counter
# Tally how many rows of master (one per article) belong to each country.
country_count = Counter(master['country'])

In [27]:
# Column of the WPDS table that holds the population, in millions.
population_column = 'Population mid-2018 (millions)'
country_list = list(wpds['country'])
# Strip thousands separators and parse as float. Converting to str first keeps
# this working whether pandas read the column as text or already as numbers
# (calling .replace on a float would raise AttributeError).
pop_list = wpds[population_column].astype(str).str.replace(',', '').astype(float)
# Countries that never occur in country_count get an article count of 0.
count_list = [country_count.get(c, 0) for c in country_list]
# Align the counts on wpds' own index so the division below is row-by-row
# regardless of how wpds is indexed.
article_counts = pd.Series(count_list, index=wpds.index)
# Population is in millions, so count / population * 10**-6 is articles per
# person; rows with no articles are reported as 0.
ratio_list = (article_counts / pop_list * (10**-6)).where(article_counts != 0, 0).tolist()
wpds['count_articles'] = count_list
wpds['per_person_articles'] = ratio_list

## 1. Top 10 countries in terms of number of articles about politicians as a proportion of the country population

In [28]:
# Rank by articles per person, largest first, and show the first ten rows.
wpds.sort_values(by='per_person_articles', ascending=False).head(n=10)

Unnamed: 0,country,Population mid-2018 (millions),count_articles,per_person_articles
205,Tuvalu,0.01,55,0.0055
197,Nauru,0.01,53,0.0053
185,San Marino,0.03,82,0.002733
161,Monaco,0.04,40,0.001
159,Liechtenstein,0.04,29,0.000725
204,Tonga,0.1,63,0.00063
196,Marshall Islands,0.06,37,0.000617
148,Iceland,0.4,206,0.000515
175,Andorra,0.08,34,0.000425
191,Federated States of Micronesia,0.1,38,0.00038


## 2. Lowest 10 countries in terms of number of articles about politicians as a proportion of the country population

In [33]:
# Rows without any article are excluded first so they do not fill the bottom
# of the ranking; the remaining rows are then ranked by articles per person,
# smallest first.
has_articles = wpds['count_articles'] != 0
wpds_tmp = wpds[has_articles]
wpds_tmp.sort_values(by='per_person_articles', ascending=True).head(n=10)

Unnamed: 0,country,Population mid-2018 (millions),count_articles,per_person_articles
121,India,1371.3,986,7.190257e-07
129,Indonesia,265.2,214,8.069382e-07
138,China,1393.8,1135,8.143206e-07
117,Uzbekistan,32.9,29,8.81459e-07
28,Ethiopia,107.5,105,9.767442e-07
40,Zambia,17.7,25,1.412429e-06
140,"Korea, North",25.6,39,1.523437e-06
135,Thailand,66.2,112,1.691843e-06
119,Bangladesh,166.4,323,1.941106e-06
33,Mozambique,30.5,60,1.967213e-06


In [34]:
# High quality means an article_quality of GA or FA. Keep only those rows and
# count them per country (groupby reference:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html).
quality_is_high = master_pagedata_rating['article_quality'].isin(['GA', 'FA'])
hq_master = (
    master_pagedata_rating[quality_is_high]
    .groupby('country')
    .size()
    .reset_index(name='hq_count_articles')
)

In [35]:
# Look up each country's total article count; countries missing from
# country_count get 0.
article_totals = []
for country_name in hq_master['country']:
    if country_name in country_count:
        article_totals.append(country_count[country_name])
    else:
        article_totals.append(0)
hq_master['count_articles'] = article_totals
# Discard any row that contains a missing value.
hq_master = hq_master.dropna()

In [36]:
# Share of each country's articles that are high quality, kept as a fraction
# (not multiplied by 100).
hq_master['hq_prop'] = hq_master['hq_count_articles'].div(hq_master['count_articles'])

## 3. Top 10 countries in terms of the proportion of articles about politicians that are high quality (GA or FA)

In [40]:
# Rows with a zero article total are dropped before ranking by the
# high-quality share, largest first.
has_any_article = hq_master['count_articles'] != 0
hq_master_tmp = hq_master[has_any_article]
hq_master_tmp.sort_values(by='hq_prop', ascending=False).head(n=10)

Unnamed: 0,country,hq_count_articles,count_articles,hq_prop
72,"Korea, North",7,39,0.179487
122,Saudi Arabia,16,119,0.134454
24,Central African Republic,8,68,0.117647
116,Romania,40,348,0.114943
90,Mauritania,5,52,0.096154
13,Bhutan,3,33,0.090909
146,Tuvalu,5,55,0.090909
37,Dominica,1,12,0.083333
150,United States,82,1092,0.075092
12,Benin,7,94,0.074468


## 4. Lowest 10 countries in terms of the proportion of articles about politicians that are high quality (GA or FA)

In [41]:
# Rank by the high-quality share, smallest first, and show the first ten rows.
hq_master.sort_values(by='hq_prop', ascending=True).head(n=10)

Unnamed: 0,country,hq_count_articles,count_articles,hq_prop
140,Tanzania,1,408,0.002451
110,Peru,1,354,0.002825
82,Lithuania,1,248,0.004032
102,Nigeria,3,682,0.004399
95,Morocco,1,208,0.004808
45,Fiji,1,199,0.005025
14,Bolivia,1,187,0.005348
17,Brazil,3,551,0.005445
83,Luxembourg,1,180,0.005556
125,Sierra Leone,1,166,0.006024
