In [1]:
import pandas as pd
import numpy as np
import csv

import spacy
spacy.prefer_gpu()

import pickle
import sqlite3
import re

Do data manipulation in SQL instead.

In [2]:
try:
    # reuse the cached sample of plaintext_link rows if it has already been exported
    sample = pd.read_csv('../data/plaintext_link_sample.csv')

except FileNotFoundError:

    # connect to database
    db = sqlite3.connect('../data/kensho.db')
    c = db.cursor()

    # visualize query into pandas dataframe
    def viz_tables(cols, query):
        """Run `query` on the open cursor and return its rows as a DataFrame with columns `cols`."""
        q = c.execute(query).fetchall()
        framelist = dict()
        for i, col_name in enumerate(cols):
            framelist[col_name] = [col[i] for col in q]
        return pd.DataFrame.from_dict(framelist)

    # get a sample of the data to work on pandas:
    # every link row belonging to 5000 randomly chosen sections.
    # ORDER BY random() draws a different sample on each run; the CSV written below is
    # what keeps later runs of the notebook on the same sample.
    query2 = """
    SELECT *
    FROM plaintext_link 
    WHERE source_section_id IN (SELECT DISTINCT(source_section_id) 
                                FROM plaintext_link 
                                ORDER BY random() 
                                LIMIT 5000)
    """

    try:
        # column names (second field of each PRAGMA table_info row)
        plaintext_link_cols = [col[1] for col in c.execute("PRAGMA table_info(plaintext_link)")]

        sample = viz_tables(plaintext_link_cols, query2)
        sample.to_csv('../data/plaintext_link_sample.csv', index=False)
    finally:
        # release the cursor and connection even if the query or the CSV export fails
        c.close()
        db.close()

In [3]:
# # update database
# # note we only save the introduction of all the wikipedia articles for now due to memory limitations
# query = """
# CREATE TABLE plaintext_link AS
# WITH link_intro AS (
#     SELECT 
#         l.link_id, l.source_section_id, l.source_wikidata_numeric_id, l.link_anchor, 
#         l.link_offset_start, l.link_offset_end, l.target_wikidata_numeric_id
#     FROM link l
#     WHERE l.source_section_num = 0
# )  
# SELECT 
#     l.link_id, l.source_section_id, l.source_wikidata_numeric_id, l.link_anchor, 
#     l.link_offset_start, l.link_offset_end, l.target_wikidata_numeric_id, 
#     p.section_text, p.section_len, 
#     r.anchor_target_count, r.anchor_frac, r.target_frac
# FROM link_intro l LEFT JOIN plaintext p
# ON p.section_id = l.source_section_id
# LEFT JOIN raw_anchor r
# ON l.link_anchor = r.anchor_text
# AND l.target_wikidata_numeric_id = r.target_wikidata_numeric_id
# """
# c.execute(query)
# db.commit()

In [4]:
# wikidata entries, keeping only the rows that have no missing values
wikidata = pd.read_csv('../data/wikipages_cleaned.csv').dropna()

In [5]:
# Normalise the wikidata page titles: remove parenthesised qualifiers, turn underscores
# into spaces and trim surrounding whitespace.
# NOTE: a leading `the` is NOT removed by this step.
pattern = re.compile(r'\([^)]*\)|_')  # not used by clean_text; kept in case other cells reference it
_PARENTHESISED = re.compile(r'\([^)]*\)')


def clean_text(text):
    """Return `text` without parenthesised parts, with '_' replaced by ' ' and stripped."""
    return _PARENTHESISED.sub("", text).replace('_', ' ').strip()


# transform the single column directly instead of building a Series for every row
wikidata.loc[:, 'target_page_title'] = wikidata['target_page_title'].map(clean_text)

In [6]:
# we need to replace link anchors with actual wikidata page titles
def replace_anchor_text(text, replace_text, replace_start, replace_end):
    """
    Replace text[replace_start:replace_end] with replace_text.

    If the replaced range stops in the middle of a word, the range is first extended
    to the right over any directly following ASCII letters, so that the whole word is
    replaced.

    Parameters
    ----------
    text : str
        Text to edit.
    replace_text : str
        Phrase/word to insert.
    replace_start, replace_end : int
        Character range (end exclusive) of the original phrase in `text`.

    Returns
    -------
    list
        [new_text, new_start, new_end, offset] where new_start/new_end delimit
        replace_text inside new_text and offset is the number of characters by which
        everything after the replaced range has shifted (negative when the text shrank).
    """
    import string
    new_replace_end = replace_start + len(replace_text)
    # replace full words; stop at the end of the text instead of indexing past it
    text_len = len(text)
    while replace_end < text_len and text[replace_end] in string.ascii_letters:
        replace_end += 1
    offset = new_replace_end - replace_end
    new_text = text[:replace_start] + replace_text + text[replace_end:]
    return [new_text, replace_start, new_replace_end, offset]

# get corresponding wikidata page titles
# A left merge emits one output row per matching right-hand row, so any repeated
# (id, title) pair in wikidata multiplies the sample rows. Deduplicate the lookup
# table before merging instead of cleaning up afterwards.
title_lookup = wikidata[['wikidata_numeric_id', 'target_page_title']].drop_duplicates()
merged_sample = sample.merge(title_lookup, 'left',
                             left_on='target_wikidata_numeric_id', right_on='wikidata_numeric_id')
# the sample itself may still contain fully identical rows
merged_sample = merged_sample.drop_duplicates()

# fail early with a clear message: a link without a title would otherwise only
# surface as a TypeError inside replace_anchor_text (len() of NaN)
missing_title = merged_sample['target_page_title'].isna()
if missing_title.any():
    raise ValueError('{} link(s) have no matching wikidata page title'.format(int(missing_title.sum())))

replaced_sections = []
# replace link anchors with actual wikidata page titles
# extract replaced text and indexes
# groupby(sort=False) visits the sections in order of first appearance, in a single pass
for sid, section_links in merged_sample.groupby('source_section_id', sort=False):
    # links must be processed left to right so the running offset stays valid
    replace_section = section_links.sort_values('link_offset_start').reset_index(drop=True)
    text = replace_section['section_text'].iloc[0]
    offset = 0  # cumulative shift caused by the replacements made so far in this section
    new_starts, new_ends = [], []
    for title, start, end in zip(replace_section['target_page_title'],
                                 replace_section['link_offset_start'],
                                 replace_section['link_offset_end']):
        text, new_start, new_end, shift = replace_anchor_text(text, title, start + offset, end + offset)
        new_starts.append(new_start)
        new_ends.append(new_end)
        offset += shift
    replace_section['link_offset_start'] = new_starts
    replace_section['link_offset_end'] = new_ends
    replace_section['section_text'] = text
    replaced_sections.append(replace_section)

# get back dataframe
replaced_sample = pd.concat(replaced_sections)
# edit section length and link anchors
# (.values avoids index alignment: the concatenated frame repeats index labels per section)
replaced_sample['link_anchor'] = replaced_sample['target_page_title'].values
replaced_sample['section_len'] = replaced_sample['section_text'].str.len().values

# drop redundant columns
replaced_sample = replaced_sample.drop(['wikidata_numeric_id', 'target_page_title'], axis=1)

# check whether link anchor corresponds to position in text
check_anchor = lambda row_data: row_data['section_text'][row_data['link_offset_start']:row_data['link_offset_end']] == row_data['link_anchor']
assert all(replaced_sample.apply(check_anchor, axis=1)), 'link offsets do not point at the link anchor'
# 5000 is the number of sections drawn by the sampling query (LIMIT 5000)
assert len(replaced_sample['section_text'].unique())==5000, 'expected 5000 distinct section texts'

In [7]:
# preview the first rows: anchors and offsets now refer to the rewritten section text
replaced_sample.head()

Unnamed: 0,link_id,source_section_id,source_wikidata_numeric_id,link_anchor,link_offset_start,link_offset_end,target_wikidata_numeric_id,section_text,section_len,anchor_target_count,anchor_frac,target_frac
0,24000001404979,24000000296809,6662070.0,Order of the British Empire,33,60,14420,"Llewellyn Heycock, Baron Heycock Order of the ...",981,5283.0,0.994728,0.181478
1,24000001404980,24000000296809,6662070.0,Wales,106,111,25,"Llewellyn Heycock, Baron Heycock Order of the ...",981,3620.0,0.351866,0.174231
2,24000001404981,24000000296809,6662070.0,Life peer,143,152,2914468,"Llewellyn Heycock, Baron Heycock Order of the ...",981,1613.0,0.995679,0.729534
3,24000001404982,24000000296809,6662070.0,Margam,182,188,6759079,"Llewellyn Heycock, Baron Heycock Order of the ...",981,104.0,0.832,0.971963
4,24000001404983,24000000296809,6662070.0,Great Western Railway,239,260,843251,"Llewellyn Heycock, Baron Heycock Order of the ...",981,3320.0,0.812531,0.902419


In [8]:
# spaCy pipelines are expensive to load, so each model name is loaded at most once
_SPACY_MODELS = {}


def _get_spacy_model(name='en_core_web_sm'):
    """Return the spaCy pipeline `name`, loading it only the first time it is requested."""
    if name not in _SPACY_MODELS:
        import spacy
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def extract_ground_truth_helper(section, doc=None):
    """
    Helper function for extract_ground_truth

    Builds the token-level 0/1 vector for a single link row.

    Parameters
    ----------
    section : row-like
        Must expose section_text, link_anchor, link_offset_start, link_offset_end and
        target_wikidata_numeric_id as attributes.
    doc : spaCy Doc, optional
        Already tokenized section_text. When omitted the text is tokenized here, which
        is much slower if the same section is processed for several links.

    Returns
    -------
    dict
        {(link_anchor, link_offset_start, link_offset_end, target_wikidata_numeric_id): vector}
        where vector has one entry per token and is 1 on the tokens covered by the link.
    """
    import numpy as np
    if doc is None:
        doc = _get_spacy_model()(section.section_text)
    # base vector
    tokenized_vector = np.zeros(len(doc))

    start_char, end_char = section.link_offset_start, section.link_offset_end
    # exact match of the character range with token boundaries
    span = doc.char_span(start_char, end_char)
    if span is not None:
        first_token, end_token = span.start, span.end
    else:
        # The range does not line up with token boundaries: widen it to whole tokens.
        # Start at the last token beginning at or before start_char and stop before the
        # first token beginning at or after end_char. Working with token indices avoids
        # guessing how many whitespace characters separate two tokens, and when no token
        # starts after the anchor the span simply runs to the last token of the text.
        token_starts = np.array([token.idx for token in doc])
        first_token = max(int(np.searchsorted(token_starts, start_char, side='right')) - 1, 0)
        end_token = int(np.searchsorted(token_starts, end_char, side='left'))
    tokenized_vector[first_token:end_token] = 1

    # extract other relevant information
    data = {(section.link_anchor, section.link_offset_start,
             section.link_offset_end,
             section.target_wikidata_numeric_id): tokenized_vector}

    return data


def extract_ground_truth(data):
    """
    Given a dataframe of wikipedia link rows, build one entry per section of the form
    [section_text, {key: vector, ...}] where each key is
    (link_anchor, start character, end character, true Wikidata entry ID) and each
    vector holds zeros with ones at the token positions of that true entity
    i.e. ['Apple is a good company', {('Apple', 0, 5, 312): [1,0,0,0,0]}]

    Sections are returned in order of first appearance of their source_section_id.
    All rows of a section are assumed to carry the same section_text, so the text is
    tokenized once per section.
    """
    sample_truth = []

    # for each section of text
    for _, section_rows in data.groupby('source_section_id', sort=False):
        text = section_rows['section_text'].iloc[0]
        doc = _get_spacy_model()(text)

        # form the correct data structure
        labels = dict()
        for _, row in section_rows.iterrows():
            labels.update(extract_ground_truth_helper(row, doc=doc))
        sample_truth.append([text, labels])
    return sample_truth

In [9]:
# ground-truth labels: one [section_text, {link key: token vector}] entry per section
ylabel = extract_ground_truth(replaced_sample)

***

## NER

In [10]:
def entity_or_not(arr):
    """
    Collapse SpaCy 'ENT_IOB' codes into a binary entity mask.

    Input codes: 3 = token begins an entity, 1 = token inside an entity,
    2 = outside any entity, 0 = no entity tag.
    Output: numpy array with 1 where the code is 1 or 3 and 0 everywhere else.
    """
    import numpy as np
    entity_codes = (1, 3)
    mask = []
    for code in arr:
        mask.append(1 if code in entity_codes else 0)
    return np.array(mask)

# spaCy pipelines are expensive to load, so the NER pipeline is loaded at most once per name
_NER_PIPELINES = {}

# numeric / temporal entity labels that are skipped by extract_nlp_data
_SKIPPED_ENTITY_LABELS = frozenset(
    ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'])


def _get_ner_pipeline(name='en_core_web_sm'):
    """Return the spaCy pipeline `name`, loading it only the first time it is requested."""
    if name not in _NER_PIPELINES:
        import spacy
        _NER_PIPELINES[name] = spacy.load(name)
    return _NER_PIPELINES[name]


def extract_nlp_data(text, nlp=None):
    """
    Given a text string, run SpaCy NER on the text string,
    and extract the identified entities, start characters, end characters, and the entities labels
    correspondingly as the key of a dictionary with a vector of zeros, with ones at the positions of the
    corresponding token positions of the identified entities
    i.e. ['Apple is a good company', {('Apple', 0, 5, 'ORG'): [1,0,0,0,0]}]

    Entities labelled DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL or CARDINAL are skipped.

    Parameters
    ----------
    text : str
        Text to analyse.
    nlp : callable, optional
        spaCy pipeline to use; defaults to a cached 'en_core_web_sm' pipeline.

    Returns
    -------
    list
        [text, {(entity text, start char, end char, label): token vector}]
    """
    import numpy as np
    if nlp is None:
        nlp = _get_ner_pipeline()
    spacy_res = nlp(text)
    n_tokens = len(spacy_res)

    ner_res = {}
    for entity in spacy_res.ents:
        if entity.label_ in _SKIPPED_ENTITY_LABELS:
            continue
        tokenized_vector = np.zeros(n_tokens)
        # an entity is already a token span, so its token bounds can be used directly
        tokenized_vector[entity.start:entity.end] = 1
        ner_res[(entity.text, entity.start_char, entity.end_char, entity.label_)] = tokenized_vector

    return [text, ner_res]

In [11]:
# for all text data, do NER and extract all useful data
# NOTE: later cells pair sample_ner[i] with ylabel[i] by position. unique() yields the
# texts in order of first appearance, so this lines up with ylabel (built per
# source_section_id) only as long as every section has a distinct text.
sample_ner = [extract_nlp_data(text) for text in replaced_sample['section_text'].unique()]

In [12]:
def drop_no_entities(ner_data, label_data):
    """
    Given NER data and true labels data, both in the format
    i.e. [['Apple is a good company', {('Apple', 0, 5, 312): [1,0,0,0,0]}],...]
    check if there are text documents with either no entities identified by NER, or no
    entities with Wikidata links. Drop all of these text documents, and returns the NER 
    and true labels data.

    Documents are matched by position: index i is removed from both lists when the
    entity dictionary at index i is empty in either list.

    Returns
    -------
    (new_ner_data, new_label_data) : tuple of lists
    """
    # positions of all texts without any ner_entity, or any true entity;
    # a set gives constant-time membership tests in the filters below
    drop_idx = {i for i, text in enumerate(ner_data) if not text[1]}
    drop_idx.update(i for i, text in enumerate(label_data) if not text[1])

    # drop entries if either does not have any entities
    new_ner_data = [text for i, text in enumerate(ner_data) if i not in drop_idx]
    new_label_data = [text for i, text in enumerate(label_data) if i not in drop_idx]
    return new_ner_data, new_label_data

# keep only the documents that have both NER entities and true entities
sample_ner, ylabel = drop_no_entities(sample_ner, ylabel)

# persist the full label set and the matching NER data
for pkl_path, payload in (('../data/sample_full_labels.pkl', ylabel),
                          ('../data/sample_data.pkl', sample_ner)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

***

Let us now extract the NER entities that also have a corresponding 'true' Wikidata link.

In [13]:
def extract_01_vector(data, axis):
    """
    Given data in the format 
    i.e. [['Apple is a good company', {('Apple', 0, 5, 312): [1,0,0,0,0]}],...]
    stack, for every text document, all the 0-1 vectors of its dictionary along `axis`.

    Parameters
    ----------
    data : list
        One [text, {key: 0-1 vector}] entry per text document.
    axis : int
        Axis passed to np.stack: 0 gives one row per entity, 1 one column per entity.

    Returns
    -------
    concat_vec_all : list of arrays
        The stacked 0-1 vectors, one array per text document.
    key_idx_dict_ls : list of dicts
        Same length as concat_vec_all; each dict maps the position of an entity within
        the stacked array to its key in the data.

    Raises
    ------
    AssertionError
        If a text document has no entities.
    """
    key_idx_dict_ls = []
    concat_vec_all = []
    for text in data:
        entities = text[1]
        # to check if there are entities in the data
        if not entities:
            raise AssertionError('Drop text documents without entities!')

        # position within the stacked array -> entity key (dict insertion order)
        key_idx_dict_ls.append(dict(enumerate(entities)))
        # each entry in the list corresponds to a text
        concat_vec_all.append(np.stack(list(entities.values()), axis=axis))
    return concat_vec_all, key_idx_dict_ls

# for each identified NER entity in each text
# axis=1 stacks the token vectors as columns: one (n_tokens, n_ner_entities) array per text
ner_entities, _ = extract_01_vector(sample_ner, axis=1)

# possible true entities with wikidata links
# axis=0 stacks the token vectors as rows: one (n_true_entities, n_tokens) array per text
# key_idx_dict_ls maps each row position back to the entity key inside ylabel
true_entities, key_idx_dict_ls = extract_01_vector(ylabel, axis=0)

In [14]:
# Go through the documents one by one. For every true entity, count the tokens it shares
# with any identified NER entity; true entities sharing no token at all are recorded so
# that they can be dropped afterwards.
# The totals give the share of true entities that NER found (a fraction between 0 and 1).
entity_check_idx_ls = []
n_true_ent, n_ner_ent = 0, 0
for doc_idx, true_matrix in enumerate(true_entities):
    # FOR NOW, AS LONG AS THERE IS SOME OVERLAP, WE KEEP THE TRUE ENTITY
    # (true entities x tokens) @ (tokens x NER entities), summed over the NER entities
    shared_tokens = (true_matrix @ ner_entities[doc_idx]).sum(axis=1)

    # entities which have a Wikidata link, but do not have a corresponding NER entity
    unmatched_idx = np.where(shared_tokens == 0)[0]
    entity_check_idx_ls.append(unmatched_idx)

    # running totals for the share of entities identified
    n_in_doc = shared_tokens.shape[0]
    n_true_ent += n_in_doc
    n_ner_ent += n_in_doc - len(unmatched_idx)


print('Percentage of True Entities identified by NER: {}'.format(n_ner_ent/n_true_ent))

Percentage of True Entities identified by NER: 0.8430122402333764


In [17]:
# drop all those not identified entities in the true entities list
# pop(..., None) instead of del keeps this cell safe to re-run: the entity dictionaries
# inside ylabel are modified in place, so on a second run the keys are already gone
for doc_labels, missing_idx, idx_to_key in zip(ylabel, entity_check_idx_ls, key_idx_dict_ls):
    for i in missing_idx:
        doc_labels[1].pop(idx_to_key[i], None)

In [18]:
# save the true labels that remain after removing the entities NER did not identify
with open('../data/sample_labels.pkl', 'wb') as f:
    pickle.dump(ylabel, f, protocol=pickle.HIGHEST_PROTOCOL)

***