In [25]:
import pandas as pd
import pickle
import re
import unidecode
import numpy as np


from sklearn.linear_model import LogisticRegression

In [26]:
with open('../data/knowledge_graph_data/id2text_entity.pickle', 'rb') as f:
    id2text = pickle.load(f)

In [27]:
with open('../data/sample_doc2vec_data.pkl', 'rb') as f:
    docs = pickle.load(f)

In [28]:
with open('../data/name_to_wiki_id.pkl', 'rb') as f:
    name_to_wiki_id = pickle.load(f)

In [29]:
with open('../data/knowledge_graph_data/idx2id_entity.pickle', 'rb') as f:
    idx2id = pickle.load(f)

In [30]:
id2idx = {value: key for key, value in idx2id.items()}

In [7]:
graph_embedding = np.load('../data/knowledge_graph_data/wiki_DistMult_entity.npy') 

In [8]:
len(graph_embedding)

5850119

In [31]:
# replace words within paranthesis and remove non alphanumeric characters
# get_rid of `the` which can cause difference
pattern = re.compile(r'\([^)]*\)|[^a-zA-Z0-9 -]|\s|(^|[^a-zA-Z0-9])the[^a-zA-Z0-9]')
# unidecode to remove accents
normalize_text = lambda text: pattern.sub("", unidecode.unidecode(text).lower())


def preprocess(text):
    return [word.lower() for word in word_tokenize(text) if word.lower() not in stoplist and not word.isdigit()]

In [32]:
lst = []
not_in_graph = []
for doc in docs:
    doc_embedding = doc[0]
    for name, entity_id in doc[1].items():
        #get a candidate from candidate_dict to get a false, irrelevant entity for each entity
        candidates_list = name_to_wiki_id[normalize_text(name[0])]
        #can't choose actual entity as false 
        if entity_id in candidates_list:
            candidates_list.remove(entity_id)
        if len(candidates_list) > 0:
            for false_entity_id in candidates_list:
                if false_entity_id in id2idx:
                    false_graph_embed = graph_embedding[id2idx[false_entity_id]]
                    lst.append(np.concatenate([doc_embedding, false_graph_embed, np.zeros(1)]))
                else:
                    not_in_graph.append(false_entity_id)
        if entity_id in id2idx:
            true_graph_embed = graph_embedding[id2idx[entity_id]]
            lst.append(np.concatenate([doc_embedding, true_graph_embed, np.ones(1)]))
        else:
            not_in_graph.append(entity_id)
lst = np.array(lst)
X = lst[:, :-1]
y = lst[:, -1:]

In [33]:
not_in_graph

[853581.0,
 853581.0,
 65091914.0,
 17088505.0,
 4810561.0,
 56651006.0,
 4813456.0,
 12875810.0,
 853581.0,
 65082770.0,
 28453463.0,
 4654676.0,
 65081718.0,
 6654586.0,
 3758329.0,
 55635595.0,
 16254770.0,
 780933,
 6961536,
 7837279.0,
 55641576.0,
 7072443.0,
 3293310,
 1531726,
 2006334,
 17642222,
 2544084,
 65079741.0,
 65079741.0,
 6041248.0,
 1017464.0,
 2054962.0,
 5586089.0,
 25352014.0,
 3966691.0,
 6041248.0,
 65079741.0,
 463257.0,
 16953533.0,
 4742853.0,
 1003198,
 65057523.0,
 13406760.0,
 60756105.0,
 64825508.0,
 4705174.0,
 684402.0,
 16197373.0,
 30601025.0,
 336308.0,
 7669747.0,
 853581.0,
 65090331.0,
 65090331.0,
 5170692,
 30632542.0,
 64825508.0,
 65060938.0,
 7085454.0,
 65060938.0,
 6979964.0,
 5472990.0,
 65049114.0,
 6979964.0,
 5472990.0,
 50962189.0,
 5273606.0,
 24039482.0,
 64825508.0,
 6365185.0,
 6365185.0,
 64825508.0,
 989673.0,
 5152209.0,
 28135704.0,
 65064412.0,
 6007037.0,
 7933447.0,
 7933450.0,
 1528875.0,
 2527557.0,
 17058771.0,
 111143

In [34]:
len(not_in_graph)

4876

In [35]:
y.mean()

0.18892508143322476

In [36]:
X.shape

(114204, 1250)

In [37]:
np.save('../data/sample_embedding_dataset_imbalanced.npy', lst)

In [24]:
wikipages_cleaned = pd.read_csv('../data/wikipages_cleaned.csv')

In [30]:
wikipages_cleaned[wikipages_cleaned.wikidata_numeric_id.isin(not_in_graph)]

Unnamed: 0,page_title,page_is_redirect,page_len,wikidata_numeric_id,views,page_id,target_page_id,target_page_title
1869,Her_Majesty's_Government_(term),0,11950,1003198.0,798,25318121.0,25318121.0,Her_Majesty's_Government_(term)
9246,Bush_(automobile),0,812,1017464.0,42,12455948.0,12455948.0,Bush_(automobile)
13181,Connected_Limited_Device_Configuration,0,8197,1023837.0,201,417988.0,417988.0,Connected_Limited_Device_Configuration
14003,Isha_Upanishad,0,25556,1025128.0,1225,2408537.0,2408537.0,Isha_Upanishad
32590,Home_Nations,0,10145,1040745.0,1376,971501.0,971501.0,Home_Nations
...,...,...,...,...,...,...,...,...
14695478,Seventeenth_Naval_District,1,63,7892607.0,0,61098242.0,11559695.0,United_States_naval_districts
14702846,Orémus_grape,1,19,8076053.0,0,10195148.0,12930665.0,Zéta
14721383,WQC,1,41,128067.0,0,47726527.0,2170444.0,World_Quizzing_Championship
14722996,Ninth_Avenue_Line_(Brooklyn_surface),1,37,7914432.0,0,9734770.0,8840562.0,B69_(New_York_City_bus)


In [31]:
triplets = pd.read_csv('../data/raw/wikidata_20190805.qpq_item_statements.csv')

In [37]:
triplets[triplets.source_item_id == 2408537.0]

Unnamed: 0,source_item_id,edge_property_id,target_item_id,el_rank
289440533,2408537,641,7291,1
289440534,2408537,156,2408540,1
289440535,2408537,155,2408527,1
289440536,2408537,31,696677,1
289440537,2408537,17,869,1
289440538,2408537,279,13357858,1
289440539,2408537,2094,63869675,1
289440540,2408537,276,1861,1


In [38]:
len(triplets['source_item_id'].unique())

56411129

In [45]:
wikidata = pd.read_csv('../data/raw/wikidata_20190805.item.csv')

In [46]:
wikidata.head()

Unnamed: 0,id,en_label,en_description,enwiki_title
0,51475818,YouTube as a source of information on kidney s...,scientific article published on 4 December 2010,
1,51475821,The sinus lift with phycogenic bone substitute...,scientific article published in June 2005,
2,51475829,Economic aspects of single-tooth replacement.,scientific article published in June 2005,
3,51475835,"Template:Peace, Unity, and Development Party/m...",,"Template:Peace, Unity, and Development Party/m..."
4,51475865,Long-term results and survival rate of implant...,scientific article published in June 2005,


In [56]:
wikidata[wikidata.id == 1017464]

Unnamed: 0,id,en_label,en_description,enwiki_title
38421949,1017464,Bush,automobile manufacturer,Bush (automobile)


In [58]:
wikidata[wikidata.enwiki_title == 'Bush (automobile)']

Unnamed: 0,id,en_label,en_description,enwiki_title
38421949,1017464,Bush,automobile manufacturer,Bush (automobile)


In [55]:
triplets[triplets.source_item_id == 1017464]

Unnamed: 0,source_item_id,edge_property_id,target_item_id,el_rank
240591450,1017464,31,786820,1


In [57]:
id2idx[1017464]

KeyError: 1017464