In [1]:
import pandas as pd

In [2]:
# Read the raw anchor table and discard every row that has a missing value.
anchors = pd.read_csv('../data/raw/enwiki_20190801.k_raw_anchors.csv').dropna()
anchors.head()

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac
0,Adaptive technology,653.0,688498.0,4,1.0,0.013605
1,assistive technology,653.0,688498.0,133,0.985185,0.452381
2,Adaptive Design,653.0,688498.0,2,1.0,0.006803
3,assistive device,653.0,688498.0,14,1.0,0.047619
4,assistance,653.0,688498.0,2,0.1,0.006803


In [9]:
# Rows whose anchor text matches the regex '^castle' (i.e. begins with "castle").
anchors.loc[anchors['anchor_text'].str.contains('^castle')]

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac
15780,castle Balmoral,4644.0,42049.0,2,1.000000,0.004866
52742,castle gate,12821.0,53060.0,2,1.000000,0.007968
183673,castle,47615.0,137.0,2,0.000429,0.007353
183794,castled,47642.0,102877.0,13,0.812500,0.068421
183795,castle,47642.0,102877.0,37,0.007945,0.194737
...,...,...,...,...,...,...
6673686,castles of Manderscheid,51511504.0,881754.0,2,1.000000,0.333333
6751500,castle of Ibelin,53273329.0,3877418.0,5,1.000000,0.263158
6763632,castle of Medinaceli,53527356.0,22809054.0,2,1.000000,0.666667
6834146,castle,55198442.0,9386654.0,2,0.000429,0.500000


In [5]:
def clean_anchor_text(text, regex_ls, unicode_dict):
    """Normalise one anchor string.

    Steps, in order:
      1. ``text`` is passed with ``unicode_dict`` to the project helper
         ``replace_accents`` (defined in ``text_cleaning_functions``, not shown
         here; presumably it maps accented characters to plain ones).
      2. Each ``(pattern, replacement)`` pair in ``regex_ls`` is applied with
         ``re.sub``, in list order, to the running result.
      3. The result is lower-cased.

    Parameters
    ----------
    text : str
        Anchor text to clean.
    regex_ls : iterable of (pattern, replacement)
        Substitutions applied sequentially.
    unicode_dict : object
        Forwarded unchanged to ``replace_accents``.

    Returns
    -------
    str
        The lower-cased, substituted text.
    """
    from text_cleaning_functions import replace_accents
    import re

    result = replace_accents(text, unicode_dict)
    for pattern, replacement in regex_ls:
        result = re.sub(pattern, replacement, result)
    return result.lower()

from text_cleaning_functions import get_unicode_dict
# replace unicode manually
unicode_dict = get_unicode_dict()

# (pattern, replacement) pairs, applied in this order by clean_anchor_text.
# Raw strings are used so that regex escapes such as \w and \s are not parsed
# as (invalid) Python string escapes; the pattern values themselves are unchanged.
regex_ls = [(r'&\w+;|&#[0-9]+;|&#[xX][a-fA-F0-9]+;', ''), # html encoded strings
            (r'[^a-zA-Z0-9\s]', ''), # weird characters
            (r'\s{2,}', ' '), # whitespaces
            (r'^ | $', ''), # strip whitespaces
            (r'[0-9]', '#')] # replace numbers with hash # not sure whether this is best option to deal with numbers...

# clean anchor text
# Only the two columns needed downstream are kept. The text column is cleaned
# element-wise on the Series (no row-wise DataFrame.apply), anchors that end up
# empty are dropped, and (anchor_text, id) pairs that became identical after
# cleaning are de-duplicated. Everything is chained so no slice is mutated in place.
cleaned_anchors = (
    anchors[['anchor_text', 'target_wikidata_numeric_id']]
    .assign(anchor_text=lambda df: df['anchor_text'].map(
        lambda text: clean_anchor_text(text, regex_ls, unicode_dict)))
    .loc[lambda df: df['anchor_text'] != '']
    .drop_duplicates()
)
cleaned_anchors.head()

Unnamed: 0,anchor_text,target_wikidata_numeric_id
0,adaptive technology,688498.0
1,assistive technology,688498.0
2,adaptive design,688498.0
3,assistive device,688498.0
4,assistance,688498.0


In [6]:
# get list of candidates
# One row per cleaned anchor text; 'candidates' holds the list of
# target_wikidata_numeric_id values seen for that anchor. The id column is
# selected before aggregating so each group is reduced directly to a list.
anchor_candidates = (cleaned_anchors
                     .groupby('anchor_text')['target_wikidata_numeric_id']
                     .apply(list)
                     .to_frame(name='candidates')
                     .reset_index())

# convert to dictionary
# Pair the two columns in a single pass instead of doing a .loc scalar lookup
# per row, which is very slow on a table with one row per distinct anchor.
anchor_candidates_dict = dict(zip(anchor_candidates['anchor_text'],
                                  anchor_candidates['candidates']))

In [15]:
# Many anchors are just a single number/character -- should we drop them?
# Example: the candidate ids collected for the one-letter anchor 'f'.
anchor_candidates_dict['f']

[9765.0,
 494083.0,
 42289.0,
 162378.0,
 42189.0,
 170545.0,
 203607.0,
 1668896.0,
 193540.0,
 131255.0,
 407350.0,
 380770.0,
 862169.0,
 1931209.0,
 11793044.0,
 648619.0,
 866081.0,
 394641.0,
 177144.0,
 280658.0,
 349707.0,
 184172.0,
 654705.0,
 1095693.0,
 279749.0,
 543457.0,
 17525400.0,
 15304494.0,
 5177487.0,
 493386.0,
 3362265.0,
 1428028.0,
 1401388.0,
 5427464.0,
 630075.0,
 4592.0,
 126404.0,
 1353952.0,
 1148974.0,
 2485547.0,
 913790.0,
 4226143.0,
 283741.0,
 1149640.0,
 6636744.0,
 788472.0,
 522701.0,
 5427508.0,
 6956208.0,
 5427493.0,
 5469884.0,
 1526359.0,
 1034276.0,
 2031969.0,
 4738059.0,
 2133631.0,
 5440359.0,
 142.0,
 1395133.0,
 2979873.0,
 6565943.0,
 749679.0,
 7830785.0,
 6606128.0,
 6621625.0,
 6622230.0,
 1622022.0,
 775617.0,
 2652945.0,
 17117940.0,
 4436514.0,
 18034373.0,
 1105889.0,
 517301.0,
 1105931.0,
 698244.0,
 650.0,
 1142061.0,
 7391030.0,
 4000022.0,
 964155.0,
 4671270.0,
 5427465.0,
 17094372.0,
 5427460.0,
 4996303.0,
 6620799.0,

In [18]:
# Persist the anchor -> candidate-id-list mapping as a pickle file.
import pickle

with open('../data/anchor_candidates.pkl', mode='wb') as out_file:
    pickle.dump(anchor_candidates_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# modified candidates dictionary such that each entry has at least 2 candidates
# as of now, just borrow a random candidate from the next entry
import random


def pad_single_candidates(candidates_dict, seed=0):
    """Return a copy of ``candidates_dict`` with short entries padded by one id.

    Every entry whose list has fewer than two candidates receives one extra
    candidate drawn at random from the *next* entry (in dictionary order). The
    last entry borrows from the first one, so it is no longer left unpadded.
    Donor lists are always read from the unmodified input.

    Ids that are not already in the entry's own list are preferred, so the
    borrowed candidate differs from the existing one whenever the donor offers
    an alternative; otherwise any donor id is used.

    Parameters
    ----------
    candidates_dict : dict
        Mapping of anchor text to a list of candidate ids. Not mutated.
    seed : int, optional
        Seed for the private random generator, making the result reproducible.

    Returns
    -------
    dict
        New dictionary; untouched entries share their list objects with the input.
    """
    rng = random.Random(seed)
    keys = list(candidates_dict)
    padded = dict(candidates_dict)
    if len(keys) < 2:
        # No other entry to borrow from.
        return padded

    for pos, key in enumerate(keys):
        own = candidates_dict[key]
        if len(own) >= 2:
            continue
        # Wrap around so the final entry also has a donor.
        donor = candidates_dict[keys[(pos + 1) % len(keys)]]
        unseen = [cand for cand in donor if cand not in own]
        pool = unseen or donor
        if pool:
            padded[key] = own + [rng.choice(pool)]
    return padded


anchor_candidates_dict = pad_single_candidates(anchor_candidates_dict)

In [28]:
# Persist the padded candidate mapping as a separate pickle file.
with open('../data/mod_anchor_candidates.pkl', mode='wb') as out_file:
    pickle.dump(anchor_candidates_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

***

**TODO (maybe later):** extend the candidate lists using redirects.

In [None]:
# NOTE(review): this cell has no execution count and looks unfinished.
# `wikidata` is only loaded in the next cell, and the indexer here is the raw
# `wikidata_numeric_id` column rather than a boolean condition -- presumably a
# filter on that column was intended. Confirm the intent or delete this cell.
wikidata[wikidata['wikidata_numeric_id']]

In [20]:
# Load the cleaned wiki page table and discard rows with any missing value.
wikidata = pd.read_csv('../data/wikipages_cleaned.csv').dropna()
wikidata.head()

Unnamed: 0,page_title,page_is_redirect,page_len,wikidata_numeric_id,views,page_id,target_page_id,target_page_title
0,Universe,0,125156,1.0,37605,31880.0,31880.0,Universe
1,Boston,0,188674,100.0,60038,24437894.0,24437894.0,Boston
2,Gabon,0,60678,1000.0,24767,12027.0,12027.0,Gabon
3,Dutch_Wikipedia,0,8325,10000.0,1203,1313683.0,1313683.0,Dutch_Wikipedia
4,Cadier_en_Keer,0,2584,100000.0,51,4037258.0,4037258.0,Cadier_en_Keer


In [8]:
# Left-join the anchors with the page table on the Wikidata numeric id so each
# anchor row carries the matching page_title / target_page_title.
# NOTE(review): the displayed result repeats the same anchor once per
# page_title sharing that id, so the join fans rows out -- confirm this is wanted.
merged_anchors = pd.merge(
    anchors,
    wikidata[['wikidata_numeric_id', 'page_title', 'target_page_title']],
    how='left',
    left_on='target_wikidata_numeric_id',
    right_on='wikidata_numeric_id',
)
merged_anchors.head()

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac,wikidata_numeric_id,page_title,target_page_title
0,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_technology,Assistive_technology
1,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Adaptive_technology,Assistive_technology
2,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_device,Assistive_technology
3,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Adaptive_Design,Assistive_technology
4,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_Technology,Assistive_technology


In [9]:
# Preview the first 20 rows of the merged table (same anchor repeated with
# different page_title values in the displayed output).
merged_anchors.head(20)

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac,wikidata_numeric_id,page_title,target_page_title
0,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_technology,Assistive_technology
1,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Adaptive_technology,Assistive_technology
2,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_device,Assistive_technology
3,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Adaptive_Design,Assistive_technology
4,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_Technology,Assistive_technology
5,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Assistive_technologies,Assistive_technology
6,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Access_technology,Assistive_technology
7,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,AssistiveTechnology,Assistive_technology
8,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Reading_technology,Assistive_technology
9,Adaptive technology,653.0,688498.0,4,1.0,0.013605,688498.0,Adaptive_designs,Assistive_technology


In [5]:
from text_cleaning_functions import replace_accents

