In [1]:
import pandas as pd
import numpy as np
import csv

import spacy
spacy.prefer_gpu()

import pickle
import sqlite3
import re

Do data manipulation in SQL instead.

In [2]:
try:
    # data of both wikipedia pages 
    sample = pd.read_csv('../data/plaintext_link_sample.csv')
    
except FileNotFoundError:
    
    # connect to database
    db = sqlite3.connect('../data/kensho.db')
    c = db.cursor()

    # visualize query into pandas dataframe
    def viz_tables(cols, query):
        q = c.execute(query).fetchall()
        framelist = dict()
        for i, col_name in enumerate(cols):
            framelist[col_name] = [col[i] for col in q]
        return pd.DataFrame.from_dict(framelist)
    
    # column names
    plaintext_link_cols = [col[1] for col in c.execute("PRAGMA table_info(plaintext_link)")]
    
    # get a sample of the data to work on pandas
    query2 = """
    SELECT *
    FROM plaintext_link 
    WHERE source_section_id IN (SELECT DISTINCT(source_section_id) 
                                FROM plaintext_link 
                                ORDER BY random() 
                                LIMIT 5000)
    """
    sample = viz_tables(plaintext_link_cols, query2)
    sample.to_csv('../data/plaintext_link_sample.csv', index=False)
    
    c.close()
    db.close()

In [3]:
# # update database
# # note we only save the introduction of all the wikipedia articles for now due to memory limitations
# query = """
# CREATE TABLE plaintext_link AS
# WITH link_intro AS (
#     SELECT 
#         l.link_id, l.source_section_id, l.source_wikidata_numeric_id, l.link_anchor, 
#         l.link_offset_start, l.link_offset_end, l.target_wikidata_numeric_id
#     FROM link l
#     WHERE l.source_section_num = 0
# )  
# SELECT 
#     l.link_id, l.source_section_id, l.source_wikidata_numeric_id, l.link_anchor, 
#     l.link_offset_start, l.link_offset_end, l.target_wikidata_numeric_id, 
#     p.section_text, p.section_len, 
#     r.anchor_target_count, r.anchor_frac, r.target_frac
# FROM link_intro l LEFT JOIN plaintext p
# ON p.section_id = l.source_section_id
# LEFT JOIN raw_anchor r
# ON l.link_anchor = r.anchor_text
# AND l.target_wikidata_numeric_id = r.target_wikidata_numeric_id
# """
# c.execute(query)
# db.commit()

In [4]:
def get_unicode_dict():
    import sys
    import unicodedata
    
    # get all unicode accented characters for alphabets
    unicode_dict = {}
    for letter in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        letter_str = '['
        if letter.isupper():
            caps = 'CAPITAL '
        else:
            caps = 'SMALL '
        for i in range(sys.maxunicode):
            try:
                if caps+'LETTER '+letter.upper()+' ' in unicodedata.name(chr(i)):
                    letter_str+=chr(i)
            except ValueError:
                continue
        unicode_dict[letter] = letter_str+']'
    return unicode_dict

unicode_dict = get_unicode_dict()

In [5]:
def correct_whitespace_offset(data):
    data = data.copy()
    data['check'] = data.apply(lambda i: i.section_text[int(i.link_offset_start):int(i.link_offset_end)], axis=1)
    # drop empty anchors
    data = data[data['check']!='']
    # check for hanging whitespace at start
    wrong_link_offsets = data[data['check'].str.contains(r'^ ')] 
    data.loc[data['check'].str.contains(r'^ '), 'link_offset_start'] = [i+1 for i in wrong_link_offsets['link_offset_start'].values]
    # check for hanging whitespace at end
    wrong_link_offsets = data[data['check'].str.contains(r' $')] 
    data.loc[data['check'].str.contains(r' $'), 'link_offset_end'] = [i-1 for i in wrong_link_offsets['link_offset_end'].values]        
    return data

def replace_accents(text, unicode_dict):
    import re
    for rep, mat in unicode_dict.items():
        text = re.sub(mat, rep, text)
        
    return text

def adjust_entity_idx(text, replace_start, replace_end, replace_word):
    """
    Given a text, and a given start and end position of the original text to replace, the text to replace it by, 
    return a list consisting of the new text, an offset to shift the characters of the text, and the character beyond 
    which an offset is required
    """

    offset = replace_end - replace_start - len(replace_word)
    new_text = text[:replace_start] + replace_word + text[replace_end:]
    # original index > replace_end needs to be offset
    return [new_text, offset]

def text_preprocessing(data, regex_ls, unicode_dict):
    """Given the wikipedia text dataset, preprocess the text data in the column 'section_text'
    given a regex list to remove from the text data, and then adjust the corresponding link offsets
    under the columns 'link_offset_start', and 'link_offset_end'.
    """
    import re
    data = data.copy()
    
    # existing data has some wrong link offsets initially, these have additional spaces in front of the link anchors
    data = correct_whitespace_offset(data)
    # sort data first by section id, then by link offset
    data.sort_values(['source_section_id', 'link_offset_start'], inplace=True)
    
    # unidecode to replace accents
    data.loc[:,'section_text'] = data.apply(lambda i: replace_accents(i.section_text, unicode_dict), axis=1)
    data.loc[:,'link_anchor'] = data.apply(lambda i: replace_accents(i.link_anchor, unicode_dict), axis=1)
    
    # data cleaning while keeping track of link offsets
    for sid in data['source_section_id'].unique():
        replace_section = data[data['source_section_id'] == sid]
        text = replace_section['section_text'].iloc[0]
        # list of original link offsets
        link_offset_idx = list(replace_section.apply(lambda i: (i.link_offset_start, i.link_offset_end), axis=1))
    
        # remove text based on regex
        for regex in regex_ls:
            match_idx = [(m.start(0), m.end(0)) for m in re.finditer(regex[0], text)]
            if not match_idx:
                continue
    
            offset = 0
            for i in match_idx:
                replace_ls = adjust_entity_idx(text, 
                                               i[0]-offset,
                                               i[1]-offset,
                                               regex[1])
        
                
                # adjust link offsets
                for j, idx in enumerate(link_offset_idx):
                    # account for matched item within entity
                    if (i[0]-offset>=idx[0]) & (i[1]-offset<=idx[1]):
                        link_offset_idx[j] = (idx[0], idx[1]-replace_ls[1])
                    # original index > replace_end needs to be offset
                    elif idx[0] >= i[1]-offset:
                        link_offset_idx[j] = (idx[0]-replace_ls[1], idx[1]-replace_ls[1])
                    else:
                        link_offset_idx[j] = (idx[0], idx[1])
                        
                # text preprocessing
                text = replace_ls[0]        
                offset += replace_ls[1]
   
        # edit original 
        # replace numbers with hash #
        # https://mlwhiz.com/blog/2019/01/17/deeplearning_nlp_preprocess/
#         data.loc[data['source_section_id'] == sid, 'section_text'] = re.sub('[0-9]', '#', text)
        data.loc[data['source_section_id'] == sid, 'section_text'] = text
        data.loc[data['source_section_id'] == sid, 'link_offset_start'] = [i[0] for i in link_offset_idx]
        data.loc[data['source_section_id'] == sid, 'link_offset_end'] = [i[1] for i in link_offset_idx]
        
    # a removed matched entity may leave whitespaces when we calculate offsets, this corrects for it
    # existing data has some wrong link offsets initially, these have additional spaces in front/behind the link anchors
    data = correct_whitespace_offset(data)
    
    data['link_offset_end'] = data['link_offset_end'].astype(int)
    data['link_offset_start'] = data['link_offset_start'].astype(int)
    
    return data
        
# replace html encoded strings, weird characters, and additional whitespaces
cleaned_sample = text_preprocessing(sample, [('&\w+;|&#[0-9]+;|&#[xX][a-fA-F0-9]+;', ''), 
                                             ('[^a-zA-Z0-9\s]', ''), 
                                             ('\s{2,}', ' '),
                                             ('^ | $', '')], unicode_dict)
cleaned_sample.head()

Unnamed: 0,link_id,source_section_id,source_wikidata_numeric_id,link_anchor,link_offset_start,link_offset_end,target_wikidata_numeric_id,section_text,section_len,anchor_target_count,anchor_frac,target_frac,check
29759,37496,1663,335806.0,Ge'ez,16,20,35667,An abugida from Geez abugida or alphasyllabary...,2308,312.0,0.753623,0.601156,Geez
1590,37497,1663,335806.0,consonant,142,151,38035,An abugida from Geez abugida or alphasyllabary...,2308,943.0,0.946787,0.827193,consonant
1909,37498,1663,335806.0,vowel,163,168,36244,An abugida from Geez abugida or alphasyllabary...,2308,998.0,0.993035,0.740356,vowel
26768,37499,1663,335806.0,alphabet,218,226,9779,An abugida from Geez abugida or alphasyllabary...,2308,625.0,0.795165,0.823452,alphabet
15274,37500,1663,335806.0,abjad,287,292,185087,An abugida from Geez abugida or alphasyllabary...,2308,132.0,0.985075,0.66,abjad


In [6]:
cleaned_sample.to_csv('../data/test.csv', index=False)

In [7]:
# # data of wikidata entries
# wikidata = pd.read_csv('../data/wikipages_cleaned.csv')
# wikidata.dropna(inplace=True)

# # replace words within paranthesis and remove underscores
# # get_rid of `the` which can cause difference
# pattern = re.compile(r'\([^)]*\)|_')
# clean_text = lambda text: re.sub(r'\([^)]*\)', "", text).replace('_', ' ').strip()
# wikidata.loc[:,'target_page_title'] = wikidata.apply(lambda i: clean_text(i['target_page_title']), axis=1)

# # we need to replace link anchors with actual wikidata page titles
# def replace_anchor_text(text, replace_text, replace_start, replace_end):
#     """
#     Given a text, a given phrase/word to replace into the text, and a given start and end position
#     of the original text to replace, return a list consisting of the new text, and the new start and end
#     indexes corresponding to the replaced phrase/word, and an offset to shift the characters of the text
#     """
#     import string
#     new_replace_end = replace_start+len(replace_text)
#     # replace full words
#     if len(text) > replace_end:
#         while text[replace_end] in string.ascii_letters:
#             replace_end+=1
#     offset = new_replace_end - replace_end
#     new_text = text[:replace_start] + replace_text + text[replace_end:]
#     return [new_text, replace_start, new_replace_end, offset]

# # get corresponding wikidata page titles

# merged_sample = sample.merge(wikidata[['wikidata_numeric_id', 'target_page_title']], 
#                                'left', left_on='target_wikidata_numeric_id', right_on='wikidata_numeric_id')
# # SOMEHOW WHEN IT MERGES, IT DUPLICATES ALOT OF ROWS
# merged_sample.drop_duplicates(inplace=True)

# replaced_sample = []
# # replace link anchors with actual wikidata page titles
# # extract replaced text and indexes
# for sid in merged_sample['source_section_id'].unique():
#     replace_section = merged_sample[merged_sample['source_section_id'] == sid].sort_values('link_offset_start').reset_index(drop=True)
#     text = replace_section['section_text'].loc[0]
#     offset = 0
#     for i in np.arange(replace_section.shape[0]):
#         replace_ls = replace_anchor_text(text, replace_section.loc[i,'target_page_title'], 
#                                          replace_section.loc[i,'link_offset_start']+offset,
#                                          replace_section.loc[i,'link_offset_end']+offset)
#         text = replace_ls[0]
#         replace_section.loc[i, 'link_offset_start'] = replace_ls[1]
#         replace_section.loc[i, 'link_offset_end'] = replace_ls[2]
# #     replace_section.loc[i, 'link_anchor'] = replace_section.loc[i,'target_page_title']
#         offset += replace_ls[3]
#     replace_section['section_text'] = text
#     replaced_sample.append(replace_section)
    
# # get back dataframe
# replaced_sample = pd.concat(replaced_sample)
# # edit section length and link anchors
# replaced_sample.loc[:,'link_anchor'] = replaced_sample.loc[:,'target_page_title']
# replaced_sample.loc[:,'section_len'] = replaced_sample.apply(lambda i: len(i['section_text']), axis=1)

# # drop redundant columns
# replaced_sample.drop(['wikidata_numeric_id', 'target_page_title'], axis=1, inplace=True)

# # check whether link anchor corresponds to position in text
# check_anchor = lambda row_data: row_data['section_text'][row_data['link_offset_start']:row_data['link_offset_end']] == row_data['link_anchor']
# assert all(replaced_sample.apply(check_anchor, axis=1))
# assert len(replaced_sample['section_text'].unique())==5000

In [2]:
# https://spacy.io/usage/linguistic-features#native-tokenizers
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

In [3]:
def extract_entities_and_labels_helper(section, tokenized_text):
    """
    Helper function for extract_entities_and_labels
    """
    import numpy as np
    tokenized_vector = np.zeros(len(tokenized_text))
    # get span object to match characters with tokens
    char_to_token = tokenized_text.char_span(section.link_offset_start, section.link_offset_end)
    
    # label corresponding tokens if there is an anchor link
    # to match anchor text rather than anchor link
    if not char_to_token:
        char_tokens = np.array([token.idx for token in tokenized_text])
        # to account for tokens at end of text
        if np.where(char_tokens >= section.link_offset_end)[0].size > 0:
            closest_start_token = char_tokens[np.where(char_tokens <= section.link_offset_start)[0][-1]]
            closest_end_token = char_tokens[np.where(char_tokens >= section.link_offset_end)[0][0]]-1
            char_to_token = tokenized_text.char_span(closest_start_token, closest_end_token)
            tokenized_vector[char_to_token.start:char_to_token.end] = 1
        else:
            tokenized_vector[np.where(char_tokens <= section.link_offset_start)[0][-1]:-1] = 1
    else:
        tokenized_vector[char_to_token.start:char_to_token.end] = 1 
        
    # extract other relevant information
    data = {(section.link_anchor, section.link_offset_start, 
             section.link_offset_end, 
             section.target_wikidata_numeric_id): tokenized_vector}
    return data

def drop_no_entities(ner_data, label_data):
    """
    Given NER data and true labels data, both in the format
    i.e. [['Apple is a good company', {('Apple', 0, 6, 0101102086): [1,0,0,0,0]}],...]
    check if there are text documents with either no entities identified by NER, or no
    entities with Wikidata links. Drop all of these text documents, and returns the NER 
    and true labels data.
    """
    # check to drop all texts without any ner_entity, or any true entity
    ner_entities_missing = [i for i, text in enumerate(ner_data) if not text[1]]
    true_entities_missing = [i for i, text in enumerate(label_data) if not text[1]]
    
    # drop entries if either does not have any entities
    new_ner_data = [text for i, text in enumerate(ner_data) if i not in ner_entities_missing+true_entities_missing]
    new_label_data = [text for i, text in enumerate(label_data) if i not in ner_entities_missing+true_entities_missing]
    return new_ner_data, new_label_data

In [4]:
def extract_entities_and_labels(data):
    """
    Given a data of wikipedia articles, extract two items, both in the format of a list of lists.
    Each entry in the outer list corresponds to a document. 
    For each document (inner list), the first entry is the actual text document, while the second entry is a dictionary.
    For the first item extracted, the dictionary has the true entities (wikidata links), 
    start characters, end characters, and true Wikidata entry IDs as the key of the dictionary.
    The dictionary has a vector of zeros, with ones at the positions of the corresponding token positions of the true entities
    i.e. ['Apple is a good company', {('Apple', 0, 6, 0101102086): [1,0,0,0,0]}]
    For the second item extracted, the item is largely the same. However, they contain the NER identified entities rather than
    the true entities. In the position of the true Wikidata entry IDs, they have an entity label instead.
    i.e. ['Apple is a good company', {('Apple', 0, 6, 'ORG'): [1,0,0,0,0]}]
    """
    import spacy
    import re
    import numpy as np
    true_entity = []
    ner_entity = []
    
    # for each section of text
    for sid in data['source_section_id'].unique():
        text = data.loc[data['source_section_id']==sid, 'section_text'].iloc[0]   
        
        # tokenizer+NER
        nlp = spacy.load('en_core_web_sm')
        nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
        tokenized_text = nlp(text)
        
        # NER entities
        ner_entity_dict = {}
        for entity in tokenized_text.ents:
            if entity.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
                continue
            ner_tokenized_vector = np.zeros(len(tokenized_text))
            char_to_token = tokenized_text.char_span(entity.start_char, entity.end_char)
            ner_tokenized_vector[char_to_token.start:char_to_token.end] = 1        
            ner_entity_dict[(entity.text, entity.start_char, entity.end_char, entity.label_)] = ner_tokenized_vector
    
        
        # actual wikidata links entities     
        # extract relevant data from dataframe 
        entity_data = (data[data['source_section_id']==sid]
                       .apply(lambda i: extract_entities_and_labels_helper(i, tokenized_text), axis=1))
              
        # form the correct data structure
        # wikidata link true entity
        # replace numbers with hash #
        # https://mlwhiz.com/blog/2019/01/17/deeplearning_nlp_preprocess/
        # these were originally kept for NER purposes
        new_text = re.sub('[0-9]', '#', text.lower())
        true_entity.append([new_text, dict()])
        for entity in list(entity_data):
            true_entity[-1][-1].update(entity)
            
        # ner identified entity
        ner_entity.append([new_text, ner_entity_dict])
    
    # drop text documents with either no entities identified by NER, or no entities with Wikidata links
    ner_entity, true_entity = drop_no_entities(ner_entity, true_entity)
    
    return true_entity, ner_entity

In [9]:
# approx 10-15 minutes for 5000 text...
true_entity, ner_entity = extract_entities_and_labels(cleaned_sample)

In [23]:
# with open('../data/sample_full_labels.pkl', 'wb') as f:
#     pickle.dump(ylabel, f, protocol=pickle.HIGHEST_PROTOCOL)

# with open('../data/sample_data.pkl', 'wb') as f:
#     pickle.dump(sample_ner, f, protocol=pickle.HIGHEST_PROTOCOL)

***

Let us now extract the NER entities that also have a corresponding 'true' Wikidata link.

In [10]:
def extract_01_vector(data, axis):
    """
    Given data in the format 
    i.e. [['Apple is a good company', {('Apple', 0, 6, 0101102086): [1,0,0,0,0]}],...]
    extract all the 0-1 vectors within each dictionary, 
    concatenate them together along a given axis, and repeat for every single entry in the data, 
    where each entry in the data corresponds to a text document. 
    The function then returns 2 items, the first a list of arrays corresponding to the concatenated
    0-1 vectors, the second a list (of the same length) of dictionaries where the key, value pairs 
    correspond to the index within the arrays (from the 1st item) and the matching key pair of entity
    matches from the data respectively
    """
    key_idx_dict_ls = []
    concat_vec_all = []
    for text in data:
        key_idx_dict = {}
        concat_vec = []
        # to check if there are entities in the data
        if not text[1]:
            raise Exception('Drop text documents without entities!')
        else:
            for i, (key, val) in enumerate(text[1].items()):
                concat_vec.append(val)
                key_idx_dict[i] = key
            concat_vec = np.stack(concat_vec, axis=axis)

            # each entry in the list corresponds to a text
            concat_vec_all.append(concat_vec)
            key_idx_dict_ls.append(key_idx_dict)
    return concat_vec_all, key_idx_dict_ls


In [20]:
def drop_entities(ner_entity, true_entity):
    # for each identified NER entity in each text
    ner_entities, ner_key_idx_dict_ls = extract_01_vector(ner_entity, axis=1)
    # possible true entities with wikidata links
    true_entities, true_key_idx_dict_ls = extract_01_vector(true_entity, axis=0)
    entity_check_idx_ls = []
    for i in np.arange(len(true_entities)):
        # FOR NOW, AS LONG AS THERE IS SOME OVERLAP, WE KEEP THE TRUE ENTITY
        entity_check = np.sum(true_entities[i] @ ner_entities[i], axis=1)
    
        # entities which have a Wikidata link, but do not have a corresponding NER entity
        entity_check_idx = np.where(entity_check==0)[0]
        entity_check_idx_ls.append(entity_check_idx)
        
    for data, idx_ls, key_idx in zip(true_entity, entity_check_idx_ls, true_key_idx_dict_ls):
        for i in idx_ls:
            del data[1][key_idx[i]]
        
    ner_entity, true_entity = drop_no_entities(ner_entity, true_entity)        
    # possible true entities with wikidata links
    # for each identified NER entity in each text
    ner_entities, ner_key_idx_dict_ls = extract_01_vector(ner_entity, axis=1)
    true_entities, true_key_idx_dict_ls = extract_01_vector(true_entity, axis=0)
    entity_check_idx_ls = []
    for i in np.arange(len(ner_entities)):
        # FOR NOW, AS LONG AS THERE IS SOME OVERLAP, WE KEEP THE TRUE ENTITY
        entity_check = np.sum(true_entities[i] @ ner_entities[i], axis=0)
    
        # entities which have a Wikidata link, but do not have a corresponding NER entity
        entity_check_idx = np.where(entity_check==0)[0]
        entity_check_idx_ls.append(entity_check_idx)
        
    # drop all those NER entities with no corresponding wikidata link
    for data, idx_ls, key_idx in zip(ner_entity, entity_check_idx_ls, ner_key_idx_dict_ls):
        for i in idx_ls:
            del data[1][key_idx[i]]

    # drop document if there is no entity
    ner_entity, true_entity = drop_no_entities(ner_entity, true_entity)
    return ner_entity, true_entity

ner_entity, true_entity = drop_entities(ner_entity, true_entity)

In [8]:
# # for each identified NER entity in each text
# ner_entities, ner_key_idx_dict_ls = extract_01_vector(ner_entity, axis=1)
# # possible true entities with wikidata links
# true_entities, true_key_idx_dict_ls = extract_01_vector(true_entity, axis=0)

# # for each true entity in each text, check for a corresponding identified NER entity
# # if there is no corresponding identified NER entity, drop the true entity
# # check for the percentage of entities correctly identified by NER
# n_true_ent = 0
# n_ner_ent = 0
# entity_check_idx_ls = []
# for i in np.arange(len(true_entities)):
#     # FOR NOW, AS LONG AS THERE IS SOME OVERLAP, WE KEEP THE TRUE ENTITY
#     entity_check = np.sum(true_entities[i] @ ner_entities[i], axis=1)
    
#     # entities which have a Wikidata link, but do not have a corresponding NER entity
#     entity_check_idx = np.where(entity_check==0)[0]
#     entity_check_idx_ls.append(entity_check_idx)
    
#     # percentage of entities identified
#     n_true_ent += entity_check.shape[0]
#     n_ner_ent += entity_check.shape[0]-len(entity_check_idx)

# print('Percentage of True Entities identified by NER: {}'.format(n_ner_ent/n_true_ent))

# # drop all those not identified entities in the true entities list
# # note that we did not drop NER identified entities without a corresponding true entity
# for data, idx_ls, key_idx in zip(true_entity, entity_check_idx_ls, true_key_idx_dict_ls):
#     for i in idx_ls:
#         del data[1][key_idx[i]]
        
# ner_entity, true_entity = drop_no_entities(ner_entity, true_entity)        
# # possible true entities with wikidata links
# # for each identified NER entity in each text
# ner_entities, ner_key_idx_dict_ls = extract_01_vector(ner_entity, axis=1)
# true_entities, true_key_idx_dict_ls = extract_01_vector(true_entity, axis=0)

# # for each NER entity in each text, check for a corresponding true entity
# # if there is no corresponding true entity, drop the NER entity
# n_true_ent = 0
# n_ner_ent = 0
# entity_check_idx_ls = []
# for i in np.arange(len(ner_entities)):
#     # FOR NOW, AS LONG AS THERE IS SOME OVERLAP, WE KEEP THE TRUE ENTITY
#     entity_check = np.sum(true_entities[i] @ ner_entities[i], axis=0)
    
#     # entities which have a Wikidata link, but do not have a corresponding NER entity
#     entity_check_idx = np.where(entity_check==0)[0]
#     entity_check_idx_ls.append(entity_check_idx)
    
#     # percentage of entities identified
#     n_true_ent += entity_check.shape[0]
#     n_ner_ent += entity_check.shape[0]-len(entity_check_idx)

# print('Percentage of identified NER Entities with corresponding true link: {}'.format(n_ner_ent/n_true_ent))

# # drop all those NER entities with no corresponding wikidata link
# for data, idx_ls, key_idx in zip(ner_entity, entity_check_idx_ls, ner_key_idx_dict_ls):
#     for i in idx_ls:
#         del data[1][key_idx[i]]

# # drop document if there is no entity
# ner_entity, true_entity = drop_no_entities(ner_entity, true_entity)        
# # for each identified NER entity in each text
# ner_entities, ner_key_idx_dict_ls = extract_01_vector(ner_entity, axis=1)
# # possible true entities with wikidata links
# true_entities, true_key_idx_dict_ls = extract_01_vector(true_entity, axis=0)

Percentage of True Entities identified by NER: 0.646969742859739


***

## Doc2Vec

For now, drop NER entities without corresponding hyperlinks.

In [13]:
with open('../data/sample_labels.pkl', 'wb') as f:
    pickle.dump(true_entity, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../data/sample_data.pkl', 'wb') as f:
    pickle.dump(ner_entity, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
# get list of documents
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
docs = []
for text in ner_entity:
    docs.append(text[0])
tagged_docs = [TaggedDocument(words=word_tokenize(word), tags=[str(i)]) for i, word in enumerate(docs)]

max_epochs = 100
vec_size = 1000
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha = alpha,
                min_alpha = 0.0025,
                min_count = 1,
                dm = 1)
model.build_vocab(tagged_docs)

for epoch in range(max_epochs):
    print('Iteration {}'.format(epoch))
    model.train(tagged_docs,
               total_examples=model.corpus_count,
               epochs=model.epochs)
    # decrease learning rate over time
    model.alpha -= 0.0002
    # ensure no decay
    model.min_alpha = model.alpha
# save model
model.save('../data/doc2vec.model')

In [33]:
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('../data/doc2vec.model')

In [54]:
model.docvecs.most_similar('6')

[('3683', 0.6052637100219727),
 ('1395', 0.6032894849777222),
 ('1399', 0.6031019687652588),
 ('3139', 0.5970553159713745),
 ('2870', 0.5903152227401733),
 ('1456', 0.5884307622909546),
 ('2714', 0.588376522064209),
 ('3467', 0.5871361494064331),
 ('2715', 0.5861045122146606),
 ('3593', 0.5852940082550049)]

In [55]:
docs[6]

'transport in north korea is constrained by economic problems and government restrictions public transport predominates and most of it is electrified'

In [56]:
docs[3683]

'annerley road is an arterial road in brisbane queensland australia it was formerly known as boggo road'

In [34]:
# match ner entity with true corresponding entity
true_entity_idx_ls = []
# for each text
for i in np.arange(len(ner_entities)):
    entity_arr = true_entities[i] @ ner_entities[i]
    true_entity_idx = []
    for j in np.arange(entity_arr.shape[1]):
        true_entity_idx.append(list(np.where(entity_arr[:,j]>0)[0]))
    true_entity_idx_ls.append(true_entity_idx)
    
# # form doc2vec data
text_doc2vec_ls = []
for i, (ner_dict, idx_ls, true_dict) in enumerate(zip(ner_key_idx_dict_ls, true_entity_idx_ls, true_key_idx_dict_ls)):
    text_doc2vec = [model.docvecs[str(i)], dict()]
    for j, idx in enumerate(idx_ls):
        for k in idx:
            text_doc2vec[-1][(ner_dict[j][0], true_dict[k][0])] = true_dict[k][-1]
    text_doc2vec_ls.append(text_doc2vec)

In [35]:
with open('../data/sample_doc2vec_data.pkl', 'wb') as f:
    pickle.dump(text_doc2vec_ls, f, protocol=pickle.HIGHEST_PROTOCOL)