In [5]:
import pandas as pd
import sqlite3
import knowledge_graph_extraction
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

## Node2Vec Graphical Embeddings

In [2]:
# connect to database
# `db` is the sqlite3 connection and `c` a cursor on it; viz_tables runs its queries through `c`
db = sqlite3.connect('../data/kensho.db')
c = db.cursor()

# visualize query into pandas dataframe
def viz_tables(cols, query):
    '''
    Run `query` on the module-level cursor `c` and return the rows as a DataFrame.

    cols: column labels, in order; label i is paired with element i of every row.
    query: SQL text handed to c.execute.
    '''
    rows = c.execute(query).fetchall()
    columns = {
        label: [row[position] for row in rows]
        for position, label in enumerate(cols)
    }
    return pd.DataFrame.from_dict(columns)

In [2]:
# c.execute("""DROP TABLE IF EXISTS title_triplets""")

# query = """
# CREATE TABLE 
#     title_triplets AS 
# SELECT w1.target_page_title, t.edge_property_id, w2.target_page_title
# FROM wikipage_triplets t LEFT JOIN wikipages_cleaned as w1
# ON t.source_item_id = w1.wikidata_numeric_id
# LEFT JOIN wikipages_cleaned as w2
# ON t.target_item_id = w2.wikidata_numeric_id
# """
# c.execute(query)
# db.commit()

# query = """
# SELECT *
# FROM title_triplets t
# LIMIT 1000
# """
# triplets = viz_tables(['source_title', 'edge_property', 'target_title'], query)
# triplets.head()

# Path to the SQLite file. It gets its own name so that `db`, which holds the open
# sqlite3 connection, is not overwritten by a string (a later `db.close()` would
# otherwise fail with AttributeError).
db_path = '../data/kensho.db'

# Pull every (source, edge property, target) id triple from wikipage_triplets.
query = """
SELECT source_item_id, edge_property_id, target_item_id
FROM wikipage_triplets
"""

# NOTE(review): the result is indexed by column name when the edge list is written,
# so conduct_sql_query presumably returns a DataFrame — confirm.
triplets = knowledge_graph_extraction.conduct_sql_query(db_path, query)

In [3]:
# Write the (source, target) id pairs as a space-separated edge list, without header or index.
# mode='w' replaces the file on every run; with mode='a' each re-run of this cell appended
# another full copy of the edges to whatever was already in the file.
triplets[['source_item_id', 'target_item_id']].to_csv('../data/sample_graph.edgelist',
                                                     header=None,
                                                     index=False,
                                                     sep=' ',
                                                     mode='w')

In [3]:
# Close the cursor, then the connection it belongs to. The connection is taken from the
# cursor rather than from the name `db`, so this still works if `db` has been rebound to
# something other than the connection object (e.g. a path string).
connection = c.connection
c.close()
connection.close()

In [6]:
from node2vec import node2vec
import networkx as nx
from gensim.models import Word2Vec

def read_graph(data, weighted=False, directed=False):
    '''
    Load an edge list into a networkx graph in which every edge has a 'weight'.

    data: edge list source handed to nx.read_edgelist; node labels are parsed as int.
    weighted: when true, a float weight is read from each line of the edge list;
        otherwise every edge is assigned weight 1.
    directed: when false (the default), the directed graph is converted to an
        undirected one before it is returned.
    '''
    read_options = dict(nodetype=int, create_using=nx.DiGraph())
    if weighted:
        read_options['data'] = (('weight', float),)

    graph = nx.read_edgelist(data, **read_options)

    if not weighted:
        # unweighted input: give every edge the same unit weight
        for source, target in graph.edges():
            graph[source][target]['weight'] = 1

    return graph if directed else graph.to_undirected()

def learn_embeddings(walks, size=128, window_size=10, min_count=0, workers=1, epochs=1):
    '''
    Train a skip-gram Word2Vec model on the random walks and return the model.

    walks: iterable of walks, each an iterable of nodes; nodes are converted to str.
    size: number of embedding dimensions.
    window_size: context size for optimization.
    min_count: passed through to Word2Vec as min_count.
    workers: number of parallel workers.
    epochs: number of epochs in SGD.
    '''
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names (gensim 4 renamed
    # them to `vector_size` and `epochs`) — confirm the installed gensim version.
    sentences = [[str(node) for node in walk] for walk in walks]
    return Word2Vec(
        sentences,
        size=size,
        window=window_size,
        min_count=min_count,
        sg=1,  # skip-gram
        workers=workers,
        iter=epochs,
    )

In [None]:
# https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
# not the author, but gives good introduction to node2vec

# node2vec settings
is_directed = False  # directed or not
return_p = 1         # return hyperparameter: probability of returning to previous node
in_out_q = 1         # in-out hyperparameter: probability of exploring undiscovered parts of graph
num_walks = 10       # how many random walks for each node in graph
walk_length = 80     # how many nodes in each random walk

# read input network in networkx
nx_G = read_graph('../data/sample_graph.edgelist')

# precompute transition probabilities, simulate the walks, then fit the embeddings
G = node2vec.Graph(nx_G, is_directed, return_p, in_out_q)
G.preprocess_transition_probs()
walks = G.simulate_walks(num_walks, walk_length)
model = learn_embeddings(walks, workers=4)

In [None]:
# persist the model returned by learn_embeddings so it can be reloaded without re-training
model.save('../data/graph_embedding.model')

## Entity Text Embeddings

Currently the embeddings are trained with Word2Vec on each text's sequence of entities only, not on the full Wikipedia text.

In [2]:
import pickle
# NOTE: pickle.load can execute arbitrary code — only load this file if it comes from a trusted source.
# Later cells read data[i][0] as text and call .items() on data[i][1], using key[0] as the
# entity string — presumably each record is a (text, entity mapping) pair; confirm.
with open('../data/sample_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [8]:
# get all entities: one list of entity names per text in `data`
# spaces are replaced by underscores so each phrase is a single token for phrase2vec
all_entities = [
    [key[0].replace(' ', '_') for key, _ in text[1].items()]
    for text in data
]

In [9]:
# using only a sequence of entities and not full sentences for now
entity_model = Word2Vec(
    all_entities,  # one entity list per text
    size=128,
    window=10,
    min_count=1,
    workers=8,
    iter=100,
)

In [10]:
# persist the entity Word2Vec model to disk
entity_model.save('../data/entity_embedding.model')

## Word Embeddings

In [None]:
def adjust_link_offset(*args, **kwargs):
    '''
    Placeholder: this function was left as an unfinished `def` line, which is a SyntaxError.

    NOTE(review): presumably meant to recompute anchor-link offsets after text
    preprocessing — the intended signature and behavior are not defined yet; confirm.

    Raises: NotImplementedError, always.
    '''
    # TODO: implement; the catch-all signature is only a stand-in until the real one is decided
    raise NotImplementedError('adjust_link_offset is not implemented yet')

In [32]:
# Preprocessing removes characters, so the positions of the anchor links have to be
# recalculated afterwards; only then can each token be matched to the position of its link.
def preprocess_text(text):
    '''
    Clean raw text and return it as a flat list of word tokens.

    Steps, in order: remove HTML-encoded entities, remove every character that is not
    an ASCII letter, digit or whitespace, replace each digit with '#', then tokenize
    with NLTK.

    text: raw input string.
    Returns: list of token strings; empty when nothing is left after cleaning.
    '''
    import re
    import nltk

    # remove html encoded strings
    processed_text = re.sub(r'&\w+;|&#[0-9]+;|&#[xX][a-fA-F0-9]+;', '', text)
    # remove any other special characters
    processed_text = re.sub(r'[^a-zA-Z0-9\s]', '', processed_text)
    # replace numbers with hash #
    # https://mlwhiz.com/blog/2019/01/17/deeplearning_nlp_preprocess/
    # (no search guard needed: re.sub leaves the text unchanged when there are no digits)
    processed_text = re.sub('[0-9]', '#', processed_text)

    # Collect the tokens of every sentence. Taking only element [0] of the per-sentence
    # lists raised IndexError when no sentence was found (e.g. empty input) and dropped
    # all tokens after the first sentence whenever more than one was detected.
    processed_words = [
        word
        for sentence in nltk.sent_tokenize(processed_text)
        for word in nltk.word_tokenize(sentence)
    ]

    # Stopword removal is intentionally disabled for now. To re-enable it, import
    # nltk.corpus.stopwords, strip non-alphanumeric characters (apostrophes) from the
    # English stopword list and filter processed_words against it.
    return processed_words
 

In [40]:
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
tst = preprocess_text(data[0][0])
# get back one long sentence string
tst_sentence = ' '.join(tst)

# ner tagging
import spacy
from spacy import displacy
ner = spacy.load('en_core_web_sm')
tokens = ner(tst_sentence)

# lemmatizing tokens: lowercased, stripped lemma, except that a "-PRON-" lemma
# is replaced by the token's own lowercased text
lemmas = [
    tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
    for tok in tokens
]
lemmas

['llewellyn',
 'heycock',
 'baron',
 'heycock',
 'order',
 'of',
 'the',
 'british',
 'empire',
 '#',
 '#',
 'august',
 '#',
 '#',
 '#',
 '#',
 '#',
 '#',
 'march',
 '#',
 '#',
 '#',
 '#',
 'be',
 'a',
 'wales',
 'local',
 'politician',
 'who',
 'become',
 'a',
 'life',
 'peer',
 'in',
 '#',
 '#',
 '#',
 '#',
 'heycock',
 'be',
 'bear',
 'in',
 'margam',
 'and',
 'begin',
 'his',
 'career',
 'as',
 'an',
 'engine',
 'driver',
 'with',
 'the',
 'great',
 'western',
 'railway',
 'he',
 'subsequently',
 'rise',
 'to',
 'a',
 'powerful',
 'position',
 'in',
 'south',
 'wales',
 'local',
 'politic',
 'through',
 'his',
 'trade',
 'union',
 'connection',
 'and',
 'membership',
 'of',
 'the',
 'labour',
 'party',
 'a',
 'personality',
 'of',
 'transcendent',
 'authority',
 'despite',
 'have',
 'himself',
 'receive',
 'little',
 'formal',
 'education',
 'he',
 'become',
 'chairman',
 'of',
 'the',
 'glamorganshire',
 'education',
 'committee',
 'in',
 'april',
 '#',
 '#',
 '#',
 '#',
 'he',
 '

In [43]:
# render the named entities found in the preprocessed sentence, inline in the notebook
displacy.render(ner(tst_sentence), jupyter=True, style='ent')

In [7]:
# same entity rendering, but on the raw text data[0][0] without preprocessing
displacy.render(ner(data[0][0]), jupyter=True, style='ent')