In [1]:
import pandas as pd

In [2]:
# Read the raw anchor table and discard every row that has a missing value.
anchors = pd.read_csv('../data/raw/enwiki_20190801.k_raw_anchors.csv').dropna()
anchors.head()

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac
0,Adaptive technology,653.0,688498.0,4,1.0,0.013605
1,assistive technology,653.0,688498.0,133,0.985185,0.452381
2,Adaptive Design,653.0,688498.0,2,1.0,0.006803
3,assistive device,653.0,688498.0,14,1.0,0.047619
4,assistance,653.0,688498.0,2,0.1,0.006803


In [5]:
def clean_anchor_text(text, regex_ls, unicode_dict):
    """Normalise a single anchor string.

    The text is first passed through ``replace_accents`` together with
    ``unicode_dict`` (helper from ``text_cleaning_functions``; presumably it
    substitutes accented characters -- confirm against that module). Each
    ``(pattern, replacement)`` pair of ``regex_ls`` is then applied with
    ``re.sub`` in list order, and the result is lower-cased.

    Args:
        text: anchor string to clean.
        regex_ls: ordered iterable of ``(pattern, replacement)`` pairs.
        unicode_dict: mapping forwarded unchanged to ``replace_accents``.

    Returns:
        The cleaned, lower-cased string.
    """
    import re
    from text_cleaning_functions import replace_accents

    result = replace_accents(text, unicode_dict)
    # Order matters: every substitution sees the output of the previous one.
    for pattern, replacement in regex_ls:
        result = re.sub(pattern, replacement, result)
    return result.lower()

from text_cleaning_functions import get_unicode_dict
# replace unicode manually
unicode_dict = get_unicode_dict()

# Ordered (pattern, replacement) pairs; each step works on the output of the previous one.
# Raw strings keep the backslashes literal, so Python no longer warns about
# invalid escape sequences such as '\s' in a normal string literal.
# NOTE(review): only runs of 2+ whitespace characters are collapsed and only a
# literal space is stripped from the ends, so a single tab/newline survives -- confirm intended.
regex_ls = [(r'&\w+;|&#[0-9]+;|&#[xX][a-fA-F0-9]+;', ''),  # html encoded strings
            (r'[^a-zA-Z0-9\s]', ''),  # weird characters
            (r'\s{2,}', ' '),  # whitespaces
            (r'^ | $', ''),  # strip whitespaces
            (r'[0-9]', '#')]  # replace numbers with hash # not sure whether this is best option to deal with numbers...

# clean anchor text
# Apply the cleaner to the one column it needs instead of materialising every row
# as a Series (axis=1); `assign` returns a new frame, so `anchors` stays untouched.
cleaned_anchors = anchors.assign(
    anchor_text=anchors['anchor_text'].apply(clean_anchor_text, args=(regex_ls, unicode_dict))
)
# drop empty anchors after cleaning and keep only the columns needed downstream;
# after cleaning, some anchor text entries are now the same/link to the same wikidata numeric id,
# so duplicates are removed (result is assigned rather than modified in place, which
# avoids mutating a selection of another frame).
cleaned_anchors = (
    cleaned_anchors
    .loc[cleaned_anchors['anchor_text'] != '', ['anchor_text', 'target_wikidata_numeric_id']]
    .drop_duplicates()
)
cleaned_anchors.head()

Unnamed: 0,anchor_text,target_wikidata_numeric_id
0,adaptive technology,688498.0
1,assistive technology,688498.0
2,adaptive design,688498.0
3,assistive device,688498.0
4,assistance,688498.0


In [6]:
# get list of candidates
# Select the target column before applying so the grouping column is not passed
# to the function (DataFrameGroupBy.apply on grouping columns is deprecated in
# recent pandas); each group collapses to a plain list of target ids.
anchor_candidates = (cleaned_anchors
                     .groupby('anchor_text')['target_wikidata_numeric_id']
                     .apply(list)
                     .reset_index(name='candidates'))

# convert to dictionary: anchor_text -> list of candidate ids
# (zip over the two columns instead of one `.loc` scalar lookup per row)
anchor_candidates_dict = dict(zip(anchor_candidates['anchor_text'],
                                  anchor_candidates['candidates']))

In [18]:
# Serialise the candidate dictionary to disk with the highest pickle protocol.
import pickle
with open('../data/anchor_candidates.pkl', 'wb') as pkl_file:
    pickle.dump(
        anchor_candidates_dict,
        pkl_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [26]:
# modified candidates dictionary such that each entry has at least 2 candidates
# as of now, just borrow one random candidate from the next entry
import random

# Dedicated, seeded generator: the padded dictionary is identical on every run
# and the global `random` state is left untouched.
rng = random.Random(0)

# Snapshot the entries first: donors are always read from the un-padded lists, and
# the dictionary only has existing keys reassigned (never resized) during the walk.
entries = list(anchor_candidates_dict.items())
n_entries = len(entries)
for pos, (key, val) in enumerate(entries):
    # Nothing to do for entries that already have 2+ candidates; with a single
    # entry in total there is no other entry to borrow from.
    if len(val) >= 2 or n_entries < 2:
        continue
    # Donor is the following entry; the last entry wraps around to the first one
    # so that it is padded too instead of being silently skipped.
    donor = entries[(pos + 1) % n_entries][1]
    # Prefer a candidate the entry does not already contain; fall back to any
    # donor candidate when the donor offers nothing new.
    pool = [cand for cand in donor if cand not in val] or donor
    # Build a new list so the list objects held elsewhere are not mutated.
    anchor_candidates_dict[key] = val + [rng.choice(pool)]

In [28]:
# Serialise the current state of the candidate dictionary to a separate pickle file.
with open('../data/mod_anchor_candidates.pkl', 'wb') as pkl_file:
    pickle.dump(
        anchor_candidates_dict,
        pkl_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

***