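This notebook builds the files needed for candidate selection: a CSV of the most-viewed Wikipedia pages, a MinHash LSH forest over their titles, and a dictionary mapping anchor text to candidate entities.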

Final candidate selection combines LSH candidates with anchor-text candidates. The final candidate selection function is stored in candidate_selection.py.

In [3]:
import pickle
import re
import unidecode
from collections import defaultdict
from nltk import ngrams
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSHForest


In [8]:
#Wikidata 
wikidata = pd.read_csv('../data/wikipages_cleaned.csv')
wikidata = wikidata.dropna()
#Unfortunately can't put all pages in LSH due to memory constraints
top_wikidata = wikidata.sort_values(by = 'views', ascending = False).iloc[:4000000]
top_wikidata.to_csv('../data/candidate_selection/top_wikipages.csv', index = False)

In [4]:

def preprocess(text):
    #Substitute non-alphanumeric characters with _
    text = re.sub(r'[^\w]','_',text)
    #Lower case everything character
    text = text.lower()
    return text


In [None]:

#https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/
#Make LSH
#Number of Permutations
permutations = 128
def get_forest(data, perms):    
    minhash = []
    for text in tqdm_notebook(data['page_title'], total = len(data)):
        text_preprocessed = preprocess(text)
        m = MinHash(num_perm=perms)
        for d in ngrams(text_preprocessed, 3):
            m.update("".join(d).encode('utf-8'))
        minhash.append(m)    
    forest = MinHashLSHForest(num_perm=perms)
    for i,m in enumerate(tqdm_notebook(minhash)):
        forest.add(i,m)
        
    forest.index()
    return forest
forest = get_forest(top_wikidata, permutations)
with open('../data/candidate_selection/lsh_forest.pkl', 'wb') as f:
    pickle.dump(forest, f)



def predict(text, database, perms, num_results, forest):    
    text_preprocessed = preprocess(text)
    m = MinHash(num_perm=perms)
    for d in ngrams(text_preprocessed, 3):
        m.update("".join(d).encode('utf-8'))
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['wikidata_numeric_id'].astype(int)    
    return result


In [5]:
anchors = pd.read_csv('../data/raw/enwiki_20190801.k_raw_anchors.csv')
anchors.dropna(inplace=True)
anchors.head()

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac
0,Adaptive technology,653.0,688498.0,4,1.0,0.013605
1,assistive technology,653.0,688498.0,133,0.985185,0.452381
2,Adaptive Design,653.0,688498.0,2,1.0,0.006803
3,assistive device,653.0,688498.0,14,1.0,0.047619
4,assistance,653.0,688498.0,2,0.1,0.006803


In [6]:
anchors['anchor_text_processed'] = anchors['anchor_text'].apply(lambda text: preprocess(text))
#Drop any duplicates after processing anchor text
anchors = anchors.drop_duplicates(subset = ['target_wikidata_numeric_id', 'anchor_text_processed'])

In [13]:
anchors[anchors.anchor_text_processed == 'trump'].sort_values('anchor_frac', ascending = False).iloc[:10]

Unnamed: 0,anchor_text,target_page_id,target_wikidata_numeric_id,anchor_target_count,anchor_frac,target_frac,anchor_text_processed
2875654,trump,6509278.0,727407.0,108,0.955752,0.54,trump
2559162,Trump,4848272.0,22686.0,249,0.803226,0.023828,trump
871741,Trump,489947.0,7847760.0,24,0.077419,0.888889,trump
6902014,Trump,56815814.0,35995833.0,10,0.032258,1.0,trump
6703512,Trump,52231341.0,27809653.0,8,0.025806,0.009009,trump
1008974,Trump,640713.0,5634699.0,4,0.012903,0.666667,trump
2924453,Trump,6808851.0,3999864.0,2,0.006452,0.095238,trump
5919841,Trump,38316801.0,16944413.0,2,0.006452,0.333333,trump
6703562,Trump,52231773.0,27811470.0,2,0.006452,0.035088,trump


In [85]:
#Make dictionary of anchor text mapping to list of candidates, in descending order of anchor frac
def get_anchor_candidates(x):
    wikidata_id, anchor_frac = list(x['target_wikidata_numeric_id']), list(x['anchor_frac'])
    candidates_list = [(wikidata_id[i], anchor_frac[i]) for i in range(len(x))]
    candidates_list = sorted(candidates_list, key = lambda x: x[1], reverse = True)
    return candidates_list

anchors_grouped = anchors.groupby(['anchor_text_processed']).apply(get_anchor_candidates)
anchors_grouped_dict = anchors_grouped.to_dict()
with open('../data/candidate_selection/anchors_dict.pkl', 'wb') as f:
    pickle.dump(anchors_grouped_dict, f)

In [87]:
# anchors_grouped = anchors.groupby(['anchor_text_processed']).apply(get_anchor_candidates)

In [100]:
# anchors_grouped_dict = anchors_grouped.to_dict()

In [105]:
# with open('../data/candidate_selection/anchors_dict.pkl', 'wb') as f:
#     pickle.dump(anchors_grouped_dict, f)

In [132]:
def get_candidates(entity, lsh_k, anchor_k):
    #Get lsh_k candidates for entity from LSH and anchor_k candidates from anchor
    entity = preprocess(entity)
    anchors_candidates = anchors_grouped_dict[entity][:anchor_k]
    anchors_candidates = [int(candidate[0]) for candidate in anchors_candidates]
    lsh_candidates = predict(entity, top_wikidata, 128, lsh_k, forest).tolist()
    return set(anchors_candidates + lsh_candidates)
    

In [137]:
get_candidates('china', 10, 10)

{148,
 8733,
 43467,
 82714,
 82972,
 130582,
 130693,
 473473,
 619865,
 713170,
 756037,
 770553,
 851782,
 1447741,
 2451592,
 5100121,
 13426199}