* Purpose: Link each redirect Wikipedia page to the Wikidata ID of its target page (or, where only the redirect page has a Wikidata ID, give that ID to the target page).

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Wikidata item table: id, English label, English description and the title of
# the linked enwiki page (null when the item has none).
entities = pd.read_csv('../data/raw/wikidata_20190805.item.csv')

In [3]:
# Preview the first five Wikidata items.
entities.head(5)

Unnamed: 0,id,en_label,en_description,enwiki_title
0,51475818,YouTube as a source of information on kidney s...,scientific article published on 4 December 2010,
1,51475821,The sinus lift with phycogenic bone substitute...,scientific article published in June 2005,
2,51475829,Economic aspects of single-tooth replacement.,scientific article published in June 2005,
3,51475835,"Template:Peace, Unity, and Development Party/m...",,"Template:Peace, Unity, and Development Party/m..."
4,51475865,Long-term results and survival rate of implant...,scientific article published in June 2005,


In [4]:
# (rows, columns) of the Wikidata item table.
entities.shape

(58708336, 4)

In [2]:
# Wikipedia page table: page id, title, redirect flag, length, Wikidata numeric
# id (null when the page is not linked to an item) and view count.
wikipages = pd.read_csv('../data/raw/enwiki_20190801.page.csv')

In [3]:
# Preview the first five pages.
wikipages.head(5)

Unnamed: 0,page_id,page_title,page_is_redirect,page_len,wikidata_numeric_id,views
0,31880,Universe,0,125156,1.0,37605
1,24437894,Boston,0,188674,100.0,60038
2,12027,Gabon,0,60678,1000.0,24767
3,1313683,Dutch_Wikipedia,0,8325,10000.0,1203
4,4037258,Cadier_en_Keer,0,2584,100000.0,51


In [4]:
# (rows, columns) of the page table.
wikipages.shape

(14730178, 6)

In [5]:
# Redirect table: one row per redirect, giving the id and title of the source
# (redirecting) page and of the target page it points to.
redirect = pd.read_csv('../data/raw/enwiki_20190801.redirect.csv')

In [6]:
# Preview the first five redirects.
redirect.head(5)

Unnamed: 0,source_page_id,target_page_id,source_page_title,target_page_title
0,39378878,38421275,Infocom_Network,TradeIndia
1,25917412,2238902,Canyonero_(car),The_Last_Temptation_of_Krust
2,3245614,1941596,Chief_Dull_Knife,Morning_Star_(chief)
3,43321056,32960669,National_Register_of_Historic_Places_in_Early_...,National_Register_of_Historic_Places_listings_...
4,13307706,369596,The_New_York_Botanical_Garden,New_York_Botanical_Garden


In [10]:
# (rows, columns) of the redirect table.
redirect.shape

(8798299, 4)

In [11]:
# Redirect pages that already carry a Wikidata id of their own.
wikipages[(wikipages['page_is_redirect'] == 1) & wikipages['wikidata_numeric_id'].notnull()]

Unnamed: 0,page_id,page_title,page_is_redirect,page_len,wikidata_numeric_id,views
168,11547342,Pentagastrin-stimulated_calcitonin_test,1,76,1000221.0,21
385,39526278,All_in_the_Family_(season_1),1,64,10006388.0,83
447,39526279,All_in_the_Family_(season_2),1,69,10007629.0,121
504,39526280,All_in_the_Family_(season_3),1,69,10008470.0,119
565,39526282,All_in_the_Family_(season_4),1,69,10009361.0,121
604,39526284,All_in_the_Family_(season_5),1,69,10010115.0,86
693,39526287,All_in_the_Family_(season_6),1,69,10011833.0,86
744,39526288,All_in_the_Family_(season_7),1,69,10012953.0,60
790,2801550,BudgeCo,1,24,1001380.0,25
803,39526289,All_in_the_Family_(season_8),1,69,10014029.0,62


In [13]:
# Goal: give every page a Wikidata id wherever one can be recovered through the
# redirect table, in three steps:
#   1. look up each page's redirect target,
#   2. let a page without an id inherit the id of its target,
#   3. let a target without an id inherit an id from one of its source pages.

# Step 1 - redirect target of every page. `merge` returns a new frame and leaves
# `wikipages` untouched, so no defensive copy of the full table is needed.
target_page_ids = wikipages.merge(
    redirect[['source_page_id', 'target_page_id']],
    how='left', left_on='page_id', right_on='source_page_id',
)
# Pages that do not appear in `redirect` have null source/target ids after the
# join; such a page is treated as its own source and its own target.
# NOTE: the nulls introduced by the left join turn both id columns into floats.
target_page_ids = target_page_ids.assign(
    target_page_id=target_page_ids['target_page_id'].fillna(target_page_ids['page_id']),
    source_page_id=target_page_ids['source_page_id'].fillna(target_page_ids['page_id']),
).drop(columns=['page_id'])

# Step 2 - attach the target page's title and Wikidata id. The empty left suffix
# keeps the page's own `page_title` / `wikidata_numeric_id` under their original
# names; the target's copies get a `_target` suffix.
wikipages_cleaned = target_page_ids.merge(
    wikipages[['page_id', 'page_title', 'wikidata_numeric_id']],
    how='left', left_on='target_page_id', right_on='page_id',
    suffixes=('', '_target'),
)
# A page's own id wins; the target's id is only used to fill a gap.
wikipages_cleaned['wikidata_numeric_id'] = wikipages_cleaned['wikidata_numeric_id'].fillna(
    wikipages_cleaned['wikidata_numeric_id_target'])
wikipages_cleaned = (
    wikipages_cleaned
    # `page_id` at this point is the right-hand join key (the target's id), so it
    # is dropped before `source_page_id` takes over the `page_id` name.
    .drop(columns=['wikidata_numeric_id_target', 'page_id'])
    .rename(columns={'page_title_target': 'target_page_title',
                     'source_page_id': 'page_id'})
)

# Step 3 - there are some instances (see end of notebook) of a source page having
# a Wikidata id while its target has none. A non-redirect page is its own target
# (step 1), so it sits in the same `target_page_id` group as the pages that
# redirect to it. Take the smallest id known within each group (NaN if none) ...
wikidata_ids = wikipages_cleaned.groupby(['target_page_id'])[['wikidata_numeric_id']].min().reset_index()
# ... and use it for every row of the group that is still missing an id.
wikipages_cleaned = wikipages_cleaned.merge(
    wikidata_ids.rename(columns={'wikidata_numeric_id': 'group_wikidata_numeric_id'}),
    how='left', on='target_page_id',
)
wikipages_cleaned['wikidata_numeric_id'] = wikipages_cleaned['wikidata_numeric_id'].fillna(
    wikipages_cleaned['group_wikidata_numeric_id'])
wikipages_cleaned = wikipages_cleaned.drop(columns=['group_wikidata_numeric_id'])

# Preview the first rows rather than rendering the whole frame.
wikipages_cleaned.head(10)


Unnamed: 0,page_title,page_is_redirect,page_len,wikidata_numeric_id,views,page_id,target_page_id,target_page_title
0,Universe,0,125156,1.0,37605,31880.0,31880.0,Universe
1,Boston,0,188674,100.0,60038,24437894.0,24437894.0,Boston
2,Gabon,0,60678,1000.0,24767,12027.0,12027.0,Gabon
3,Dutch_Wikipedia,0,8325,10000.0,1203,1313683.0,1313683.0,Dutch_Wikipedia
4,Cadier_en_Keer,0,2584,100000.0,51,4037258.0,4037258.0,Cadier_en_Keer
5,Water_crisis_in_Iran,0,11924,1000000.0,351,43228764.0,43228764.0,Water_crisis_in_Iran
6,Gold_Cobra,0,25138,1000001.0,3304,25266597.0,25266597.0,Gold_Cobra
7,Nielles-lès-Bléquin,0,2776,1000003.0,23,13127566.0,13127566.0,Nielles-lès-Bléquin
8,New_Hampshire_Route_16,0,21068,1000004.0,332,1977519.0,1977519.0,New_Hampshire_Route_16
9,Karel_Matěj_Čapek-Chod,0,1974,1000005.0,45,4217319.0,4217319.0,Karel_Matěj_Čapek-Chod


In [33]:
# Persist the linked page table; index = False keeps the row index out of the file.
wikipages_cleaned.to_csv('../data/wikipages_cleaned.csv', index = False)

In [14]:
# Spot check: page 325726 as it appears in the cleaned table.
wikipages_cleaned.loc[wikipages_cleaned['page_id'] == 325726.0]

Unnamed: 0,page_title,page_is_redirect,page_len,wikidata_numeric_id,views,page_id,target_page_id,target_page_title
6040736,Social_network_analysis,0,47818,7551269.0,10825,325726.0,325726.0,Social_network_analysis


In [25]:
# Number of pages that still have no Wikidata id after linking.
int(wikipages_cleaned['wikidata_numeric_id'].isnull().sum())

40693

In [26]:
# Pages still without a Wikidata id, most viewed first.
wikipages_cleaned.loc[wikipages_cleaned['wikidata_numeric_id'].isnull()].sort_values(by='views', ascending=False)

Unnamed: 0,page_title,page_is_redirect,page_len,wikidata_numeric_id,views,page_id,target_page_id,target_page_title
6013857,King_of_the_Monsters,0,718,,50810,61352819.0,61352819.0,King_of_the_Monsters
6040729,"2019_Tour_de_France,_Stage_12_to_Stage_21",0,43438,,25187,61241389.0,61241389.0,"2019_Tour_de_France,_Stage_12_to_Stage_21"
6040731,List_of_countries_by_population_(United_Nations),0,39128,,24281,39707994.0,39707994.0,List_of_countries_by_population_(United_Nations)
6040732,G1_Climax_(2019),0,52071,,17689,61355160.0,61355160.0,G1_Climax_(2019)
6040733,Fear_Inoculum,0,16805,,15794,59592046.0,59592046.0,Fear_Inoculum
6040734,71st_Primetime_Creative_Arts_Emmy_Awards,0,65200,,14241,61294403.0,61294403.0,71st_Primetime_Creative_Arts_Emmy_Awards
6040735,List_of_A24_films,0,12959,,13583,61287896.0,61287896.0,List_of_A24_films
6049574,Rich_Energy_Ltd.,1,42,,13316,61296123.0,61118727.0,Rich_Energy
6040737,The_Search_(NF_album),0,6309,,10626,60932174.0,60932174.0,The_Search_(NF_album)
6040738,Premiership_of_Boris_Johnson,0,7191,,10621,61343841.0,61343841.0,Premiership_of_Boris_Johnson


75% of the pages still without a Wikidata item have 10 or fewer views, and at least half have none at all. As such, most of them are insignificant.

In [32]:
# Distribution of view counts among pages still without a Wikidata id.
wikipages_cleaned.loc[wikipages_cleaned['wikidata_numeric_id'].isnull(), 'views'].describe()

count    40693.000000
mean        34.924090
std        407.577772
min          0.000000
25%          0.000000
50%          0.000000
75%         10.000000
max      50810.000000
Name: views, dtype: float64

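Some of these pages do have a Wikidata item, but the wikipages dataset does not record the link. For example, item 66084740 ("List of countries by population (United Nations)") exists in the entities dataset, yet no row of wikipages refers to it.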

In [27]:
# Wikidata item 66084740, looked up directly in the item table.
entities.loc[entities['id'] == 66084740]

Unnamed: 0,id,en_label,en_description,enwiki_title
23613348,66084740,List of countries by population (United Nations),Wikimedia list article,List of countries by population (United Nations)


In [28]:
# Rows of the cleaned table linked to item 66084740.
wikipages_cleaned.loc[wikipages_cleaned['wikidata_numeric_id'] == 66084740]

Unnamed: 0,page_title,page_is_redirect,page_len,wikidata_numeric_id,views,page_id,target_page_id,target_page_title


In [29]:
# Rows of the raw page table linked to item 66084740.
wikipages.loc[wikipages['wikidata_numeric_id'] == 66084740]

Unnamed: 0,page_id,page_title,page_is_redirect,page_len,wikidata_numeric_id,views


Example of a target page that has no Wikidata numeric ID even though one of its source (redirect) pages does.

In [64]:
# Raw record of page 325726.
wikipages.loc[wikipages['page_id'] == 325726.0]

Unnamed: 0,page_id,page_title,page_is_redirect,page_len,wikidata_numeric_id,views
6040736,325726,Social_network_analysis,0,47818,,10825


In [69]:
# Every redirect that points at page 325726.
redirect.loc[redirect['target_page_id'] == 325726]

Unnamed: 0,source_page_id,target_page_id,source_page_title,target_page_title
1717720,1559348,325726,Social_Network_Analysis,Social_network_analysis
1721329,12567496,325726,Social_networking_potential,Social_network_analysis
1722555,20645616,325726,Cascade_(Social_Network_Theory),Social_network_analysis
1724358,17661070,325726,Social_Network_Change_Detection,Social_network_analysis
1727934,17576720,325726,Social_network_change_detection,Social_network_analysis
1730560,39596224,325726,Networks_in_Political_Science,Social_network_analysis
1732941,14825621,325726,Social_Networking_Potential,Social_network_analysis


In [70]:
# Raw record of page 17661070.
wikipages.loc[wikipages['page_id'] == 17661070]

Unnamed: 0,page_id,page_title,page_is_redirect,page_len,wikidata_numeric_id,views
13526126,17661070,Social_Network_Change_Detection,1,69,,0


In [71]:
# Raw record of page 17576720.
wikipages.loc[wikipages['page_id'] == 17576720]

Unnamed: 0,page_id,page_title,page_is_redirect,page_len,wikidata_numeric_id,views
5421981,17576720,Social_network_change_detection,1,37,7551269.0,318


In [65]:
# Wikidata item 7551269, looked up directly in the item table.
entities.loc[entities['id'] == 7551269]

Unnamed: 0,id,en_label,en_description,enwiki_title
17128640,7551269,Social network change detection,,Social network change detection
