# Homework 2 - Considering Bias in Data

## Data Acquisition

Sources of Data :
1. List of Wikipedia articles of politicians by country
2. Population data for the countries



In [18]:
import pandas as pd
import json, time, urllib.parse
import requests

In [19]:
df_politicians = pd.read_csv('../data/politicians_by_country_2022.csv')
df_population = pd.read_csv('../data/population_by_country_2022.csv')

In [20]:
len(df_politicians)

7584

In [21]:
duplicate_indices = df_politicians[df_politicians.duplicated()].index.tolist()
df_politicians.drop(index=duplicate_indices,inplace=True)

In [22]:
print(df_politicians.name.value_counts().loc[lambda x : x.values!=1].count())
duplicates = df_politicians.name.value_counts().loc[lambda x : x.values!=1].keys().tolist()
df_politicians.loc[df_politicians.name.isin(duplicates),['name','country']].sort_values('name',ascending=True).groupby('name').first()['country']

46


name
Alexandra Benado                                  Sweden
Ali al-Qaradaghi                                    Iraq
Antonio Gutiérrez y Ulloa                    El Salvador
Antonín Janoušek                                Slovakia
Ashab Uddin Ahmad                             Bangladesh
Bak Jungyang                                       Japan
Count Wenzel Chotek of Chotkow and Wognin        Austria
Djama Ali Moussa                                 Somalia
Eduard Hedvicek                                  Czechia
German Kuznetsov                                  Russia
Goran Rakić                                       Kosovo
Grace Schneiders-Howard                      Netherlands
Heinrich von Brühl                                Poland
Hrant Maloyan                                      Syria
Ibrahim Harun                                    Eritrea
Jacob Magnus Sprengtporten                        Sweden
Josip Ferfolja                                  Slovenia
José Alejandro de Aycinena

In [23]:
# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
# ARTICLE_TITLES = [ 'Bison', 'Northern flicker', 'Red squirrel', 'Chinook salmon', 'Horseshoe bat' ]
ARTICLE_TITLES = df_politicians.name.tolist()

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": ""
}

In [24]:
def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [25]:
titleRevIdDict = {key: None for key in ARTICLE_TITLES}
pages_no_revId = []
for article in ARTICLE_TITLES:
    info = request_pageinfo_per_article(article)
    for attribute,value in info['query'].items():
        for a,v in value.items():
            if 'lastrevid' not in list(v.keys()) :
                print(v['title'])
                pages_no_revId.append(v['title'])
            else:
                titleRevIdDict[v['title']] = v['lastrevid']

Prince Ofosu Sefah
Harjit Kaur Talwandi
Abd al-Razzaq al-Hasani
Abiodun Abimbola Orekoya
Segun “Aeroland” Adewale
Roman Konoplev
Nhlanhla “Lux” Dlamini


In [26]:
with open("../data/articles_no_revId.txt", "w") as fp:
    for item in pages_no_revId:
        fp.write("%s\n" % item)
no_revId_indices = df_politicians[df_politicians.name.isin(pages_no_revId)].index.tolist()
df_politicians.drop(index=no_revId_indices,inplace=True)

In [27]:
with open('dict.json', 'w') as fp:
    json.dump(titleRevIdDict, fp)

In [28]:
len(df_politicians)

7575

In [29]:
# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"

# A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
ARTICLE_REVISIONS = titleRevIdDict

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}

In [30]:
def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
#     print(endpoint_url)
#     print(endpoint_params.format(**request_template))

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [31]:
df_scores = pd.DataFrame(columns = ['name','prediction'])
myList = []
for article in ARTICLE_TITLES:
    score = request_ores_score_per_article(ARTICLE_REVISIONS[article])
    if score is None:
        myList.append(article)
    else:
        for a,v in score['enwiki']['scores'].items():
            df_scores.loc[len(df_scores.index)] = [article,v['articlequality']['score']['prediction']]

In [32]:
regions = [i for i in df_population.Geography if i.isupper()]
regions_index = df_population.index[df_population.Geography.isin(regions)].tolist()

In [33]:
df_population['region']=None
for i in range(0,len(df_population)):
    if i in regions_index:
        region_value = df_population.iloc[i,0]
    df_population.iloc[i,2]=region_value
for i in regions_index:
    df_population.drop(i,axis=0,inplace=True)

In [34]:
no_population = df_population.loc[df_population['Population (millions)']==0.0,'Geography']

with open("../data/wp_countries-no_match.txt", "w") as fp:
    for item in df_population.loc[df_population.Geography.isin(no_population.tolist()),'Geography']:
        fp.write("%s\n" % item)
df_population.drop(index=no_population.index,inplace=True)

In [35]:
df_merge = df_politicians.merge(df_population,how='outer',left_on='country',right_on='Geography')
indices_no_match = df_merge.loc[(df_merge.country.isnull())|(df_merge.Geography.isnull()),:].index.tolist()
no_match = pd.unique(df_merge.iloc[indices_no_match,[2,3]].values.ravel('K')).tolist()
no_match = [x for x in no_match if str(x) != 'nan']
with open("../data/wp_countries-no_match.txt", "a") as fp:
    for item in no_match:
        fp.write("%s\n" % item)
df_merge.drop(index=indices_no_match,inplace=True)
df_merge.drop(['Geography'],axis=1,inplace=True)

In [36]:
pd.set_option('display.float_format', '{}'.format)
df_merge = df_merge.merge(df_scores)
df_revision = pd.DataFrame(titleRevIdDict.items())
df_revision = df_revision.rename(columns={0: 'name',1:'revisionId'})
df_merge = df_merge.merge(df_revision)
df_merge.drop('url',axis=1,inplace=True)
df_merge = df_merge.rename(columns={'Population (millions)': 'population','revisionId':'revision_id','prediction':'article_quality','name':'article_title'})
df_merge.head(10)

Unnamed: 0,article_title,country,population,region,article_quality,revision_id
0,Shahjahan Noori,Afghanistan,41.1,SOUTH ASIA,GA,1099689043.0
1,Abdul Ghafar Lakanwal,Afghanistan,41.1,SOUTH ASIA,Start,943562276.0
2,Majah Ha Adrif,Afghanistan,41.1,SOUTH ASIA,Start,852404094.0
3,Haroon al-Afghani,Afghanistan,41.1,SOUTH ASIA,B,1095102390.0
4,Tayyab Agha,Afghanistan,41.1,SOUTH ASIA,Start,1104998382.0
5,Ahmadullah Wasiq,Afghanistan,41.1,SOUTH ASIA,Start,1109361754.0
6,Aziza Ahmadyar,Afghanistan,41.1,SOUTH ASIA,Start,1087211008.0
7,Muqadasa Ahmadzai,Afghanistan,41.1,SOUTH ASIA,Start,1082489593.0
8,Mohammad Sarwar Ahmedzai,Afghanistan,41.1,SOUTH ASIA,Start,1038918070.0
9,Amir Muhammad Akhundzada,Afghanistan,41.1,SOUTH ASIA,Start,1069322182.0


In [37]:
df_merge.to_csv('../data/wp_politicians_by_country.csv')

In [38]:
df_country_articles = pd.DataFrame()
df_country_articles = df_merge.groupby(['country','population'],as_index=False)['article_title'].size()
df_country_articles.rename(columns={'size':'article_count'},inplace=True)
df_in = df_merge.loc[df_merge.article_quality.isin(['GA','FA']),:].groupby(['country','population'],as_index=False)['article_title'].size()
df_in.rename(columns={'size':'high_quality_count'},inplace=True)
df_country_articles = df_country_articles.merge(df_in,how='outer')
df_country_articles.head()

Unnamed: 0,country,population,article_count,high_quality_count
0,Afghanistan,41.1,118,6.0
1,Albania,2.8,84,6.0
2,Algeria,44.9,34,
3,Andorra,0.1,10,2.0
4,Angola,35.6,42,


In [39]:
df_country_articles['coverage_per_capita']=0
df_country_articles['hq_per_capita']=0

In [40]:
for i, row in df_country_articles.iterrows():
    df_country_articles.iloc[i,4] = df_country_articles.iloc[i,2]/(df_country_articles.iloc[i,1])
    df_country_articles.iloc[i,5] = df_country_articles.iloc[i,3]/(df_country_articles.iloc[i,1])    

## Top 10 countries by coverage 

In [41]:
df_country_articles.loc[:,['country','population','article_count','coverage_per_capita']].sort_values('coverage_per_capita',ascending=False).head(10)

Unnamed: 0,country,population,article_count,coverage_per_capita
5,Antigua and Barbuda,0.1,17,170.0
54,Federated States of Micronesia,0.1,13,130.0
3,Andorra,0.1,10,100.0
13,Barbados,0.3,28,93.33333333333334
103,Marshall Islands,0.1,9,90.0
108,Montenegro,0.6,39,65.0
138,Seychelles,0.1,6,60.0
96,Luxembourg,0.7,37,52.85714285714286
18,Bhutan,0.8,41,51.25
64,Grenada,0.1,5,50.0


## Bottom 10 countries by coverage

In [42]:
df_country_articles.loc[:,['country','population','article_count','coverage_per_capita']].sort_values('coverage_per_capita',ascending=True).head(10)

Unnamed: 0,country,population,article_count,coverage_per_capita
32,China,1436.6,2,0.0013921759710427
105,Mexico,127.5,1,0.0078431372549019
135,Saudi Arabia,36.7,3,0.0817438692098092
130,Romania,19.0,2,0.1052631578947368
73,India,1417.2,179,0.1263053909116568
148,Sri Lanka,22.4,3,0.1339285714285714
48,Egypt,103.5,14,0.1352657004830917
53,Ethiopia,123.4,26,0.2106969205834684
156,Taiwan,23.2,5,0.2155172413793103
174,Vietnam,99.4,27,0.2716297786720322


## Top 10 countries by high quality

In [43]:
df_country_articles.loc[:,['country','population','article_count','hq_per_capita']].sort_values('hq_per_capita',ascending=False).head(10)

Unnamed: 0,country,population,article_count,hq_per_capita
3,Andorra,0.1,10,20.0
108,Montenegro,0.6,39,5.0
152,Suriname,0.6,24,3.333333333333333
1,Albania,2.8,84,2.142857142857143
20,Bosnia-Herzegovina,3.4,53,1.4705882352941178
95,Lithuania,2.8,75,1.0714285714285714
39,Croatia,3.8,57,1.0526315789473684
142,Slovenia,2.1,45,0.9523809523809524
122,Palestinian Territory,5.4,71,0.9259259259259258
58,Gabon,2.4,6,0.8333333333333334


## Bottom 10 countries by high quality

In [44]:
df_country_articles.loc[:,['country','population','article_count','hq_per_capita']].sort_values('hq_per_capita',ascending=True).head(10)

Unnamed: 0,country,population,article_count,hq_per_capita
73,India,1417.2,179,0.004233700254022
159,Thailand,66.8,29,0.0149700598802395
80,Japan,124.9,110,0.0160128102481985
117,Nigeria,218.5,220,0.0183066361556064
174,Vietnam,99.4,27,0.0201207243460764
33,Colombia,49.1,58,0.020366598778004
167,Uganda,47.2,44,0.0211864406779661
121,Pakistan,235.8,124,0.0212044105173876
151,Sudan,46.9,33,0.021321961620469
75,Iran,88.6,60,0.0225733634311512


In [45]:
df_populations = pd.read_csv('../data/population_by_country_2022.csv')

In [46]:
df_region_articles = pd.DataFrame()
df_region_articles = df_merge.groupby(['region'],as_index=False)['article_title'].size()
df_rn = df_merge.loc[df_merge.article_quality.isin(['GA','FA']),:].groupby(['region'],as_index=False)['article_title'].size()
df_rn.rename(columns={'size':'high_quality_count'},inplace=True)
df_region_articles = df_region_articles.merge(df_rn,how='outer')
df_region_articles=df_region_articles.merge(df_populations,left_on='region',right_on='Geography',how='left')
df_region_articles.drop(['Geography'],axis=1,inplace=True)
df_region_articles['coverage_per_capita']=0
df_region_articles['hq_per_capita']=0
for i, row in df_region_articles.iterrows():
    df_region_articles.iloc[i,4] = df_region_articles.iloc[i,1]/(df_region_articles.iloc[i,3])
    df_region_articles.iloc[i,5] = df_region_articles.iloc[i,2]/(df_region_articles.iloc[i,3])    
df_region_articles.rename(columns={'size':'article_count','Population (millions)':'population'},inplace=True)

## Geographic regions by total coverage

In [47]:
df_region_articles.loc[:,['region','population','article_count','coverage_per_capita']].sort_values('coverage_per_capita',ascending=False).head(19)

Unnamed: 0,region,population,article_count,coverage_per_capita
14,SOUTHERN EUROPE,151.0,915,6.059602649006623
0,CARIBBEAN,44.0,202,4.590909090909091
17,WESTERN EUROPE,197.0,695,3.527918781725888
5,EASTERN EUROPE,287.0,749,2.609756097560976
8,NORTHERN EUROPE,107.0,265,2.4766355140186915
16,WESTERN ASIA,294.0,690,2.346938775510204
13,SOUTHERN AFRICA,69.0,117,1.6956521739130437
9,OCEANIA,44.0,72,1.6363636363636365
2,CENTRAL ASIA,78.0,119,1.5256410256410255
4,EASTERN AFRICA,473.0,652,1.3784355179704018


## Geographic regions by high quality coverage

In [48]:
df_region_articles.loc[:,['region','population','article_count','hq_per_capita']].sort_values('hq_per_capita',ascending=False).head(19)

Unnamed: 0,region,population,article_count,hq_per_capita
14,SOUTHERN EUROPE,151.0,915,0.304635761589404
0,CARIBBEAN,44.0,202,0.1818181818181818
5,EASTERN EUROPE,287.0,749,0.1324041811846689
17,WESTERN EUROPE,197.0,695,0.116751269035533
16,WESTERN ASIA,294.0,690,0.0952380952380952
8,NORTHERN EUROPE,107.0,265,0.0747663551401869
13,SOUTHERN AFRICA,69.0,117,0.0579710144927536
1,CENTRAL AMERICA,178.0,200,0.0561797752808988
2,CENTRAL ASIA,78.0,119,0.0384615384615384
12,SOUTHEAST ASIA,676.0,417,0.0355029585798816


In [49]:
df_population.describe()

Unnamed: 0,Population (millions)
count,203.0
mean,39.22068965517242
std,146.61818186359883
min,0.1
25%,1.75
50%,8.7
75%,28.25
max,1436.6


In [50]:
df_population.sort_values('Population (millions)').head(20)

Unnamed: 0,Geography,Population (millions),region
82,Curacao,0.1,CARIBBEAN
41,Seychelles,0.1,EASTERN AFRICA
230,Tonga,0.1,OCEANIA
217,Federated States of Micronesia,0.1,OCEANIA
93,St. Vincent and the Grenadines,0.1,CARIBBEAN
85,Grenada,0.1,CARIBBEAN
91,St. Kitts-Nevis,0.1,CARIBBEAN
222,Marshall Islands,0.1,OCEANIA
83,Dominica,0.1,CARIBBEAN
78,Antigua and Barbuda,0.1,CARIBBEAN
