In [1]:
import pandas as pd
import sqlite3
import knowledge_graph_extraction
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

## Node2Vec Graphical Embeddings

In [2]:
# Open the SQLite database at ../data/kensho.db (the path is relative to the
# notebook's working directory) and create one cursor on that connection.
# `c` is read as a global by viz_tables; `db` is handed to
# knowledge_graph_extraction.conduct_sql_query in a later cell.
db = sqlite3.connect('../data/kensho.db')
c = db.cursor()

# visualize query into pandas dataframe
def viz_tables(cols, query):
    """Run ``query`` on the global cursor ``c`` and return the rows as a DataFrame.

    Parameters
    ----------
    cols : sequence of str
        Labels for the output columns. The i-th label is paired with the
        i-th field of every fetched row.
    query : str
        SQL text passed to ``c.execute``.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``cols`` and one row per fetched record.
    """
    rows = c.execute(query).fetchall()
    # Transpose the row tuples into {label: [values...]} for DataFrame.from_dict.
    columns = {
        label: [row[position] for row in rows]
        for position, label in enumerate(cols)
    }
    return pd.DataFrame.from_dict(columns)

In [33]:
# --- Disabled experiment, kept for reference ---------------------------------
# The commented-out statements below rebuilt a `title_triplets` table by joining
# wikipage_triplets to wikipages_cleaned twice (once on the source item id, once
# on the target item id) and then previewed up to 1000 rows through viz_tables.
# c.execute("""DROP TABLE IF EXISTS title_triplets""")

# query = """
# CREATE TABLE 
#     title_triplets AS 
# SELECT w1.target_page_title, t.edge_property_id, w2.target_page_title
# FROM wikipage_triplets t LEFT JOIN wikipages_cleaned as w1
# ON t.source_item_id = w1.wikidata_numeric_id
# LEFT JOIN wikipages_cleaned as w2
# ON t.target_item_id = w2.wikidata_numeric_id
# """
# c.execute(query)
# db.commit()

# query = """
# SELECT *
# FROM title_triplets t
# LIMIT 1000
# """
# triplets = viz_tables(['source_title', 'edge_property', 'target_title'], query)
# triplets.head()

# db = '../data/kensho.db'

# Active query: every (source id, edge property id, target id) row of
# wikipage_triplets, with no LIMIT.
query = """
SELECT source_item_id, edge_property_id, target_item_id
FROM wikipage_triplets
"""

# NOTE(review): `db` is the open sqlite3 connection from the first cell, whereas
# the commented-out line above would have rebound it to the database path
# string — confirm which of the two conduct_sql_query expects.
# Presumably returns a pandas DataFrame, since the next cell selects columns
# from `triplets` and calls .to_csv on the result.
triplets = knowledge_graph_extraction.conduct_sql_query(db, query)

In [34]:
# Write the source/target id pairs as a space-separated edge list, one edge per
# line with no header row and no index column, for read_graph to load later.
# mode='w' replaces the file on every run. The previous mode='a' appended a
# full copy of the edges each time the cell was re-executed, so the file kept
# growing with duplicate lines.
triplets[['source_item_id', 'target_item_id']].to_csv('../data/sample_graph.edgelist',
                                                     header=False,
                                                     index=False,
                                                     sep=' ',
                                                     mode='w')

In [3]:
# Release the cursor first, then the connection it was created from. After this
# cell `c` and `db` are closed, so viz_tables (which reads the global `c`)
# cannot be used again without reconnecting.
c.close()
db.close()

In [35]:
from node2vec import node2vec
import networkx as nx
from gensim.models import Word2Vec

def read_graph(data, weighted=False, directed=False):
    '''
    Load an edge list into a networkx graph with integer node ids.

    data: path or file handle passed straight to nx.read_edgelist.
    weighted: when True, the extra field on each line is parsed as a float
        'weight' attribute; when False, every edge is assigned weight 1.
    directed: when False (the default), the parsed DiGraph is converted to an
        undirected graph before being returned.
    '''
    if not weighted:
        graph = nx.read_edgelist(data, nodetype=int, create_using=nx.DiGraph())
        # Unweighted input: give every edge the same unit weight.
        for src, dst in graph.edges():
            graph[src][dst]['weight'] = 1
    else:
        graph = nx.read_edgelist(data, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph())

    return graph if directed else graph.to_undirected()

def learn_embeddings(walks, size=128, window_size=10, min_count=0, workers=1, epochs=1):
    '''
    Train a skip-gram Word2Vec model on random walks and return the model.

    walks: iterable of walks, each an iterable of node ids.
    size: number of embedding dimensions.
    window_size: context window used during optimization.
    min_count: minimum token frequency forwarded to Word2Vec.
    workers: number of parallel workers.
    epochs: number of training passes, forwarded as Word2Vec's `iter`.

    NOTE(review): `size` and `iter` are the keyword names used by gensim
    releases before 4.0 — confirm the installed gensim version accepts them.
    '''
    # Word2Vec works on string tokens, so stringify every node id in every walk.
    sentences = [[str(node) for node in walk] for walk in walks]
    w2v_kwargs = dict(
        size=size,
        window=window_size,
        min_count=min_count,
        sg=1,  # 1 selects the skip-gram objective
        workers=workers,
        iter=epochs,
    )
    return Word2Vec(sentences, **w2v_kwargs)

In [None]:
# https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
# not the author, but gives good introduction to node2vec
# Load the edge list written by the export cell. With read_graph's defaults the
# result is an undirected graph whose edges all carry weight 1.
nx_G = read_graph('../data/sample_graph.edgelist')
G = node2vec.Graph(nx_G, # network graph
                   False, # directed or not (matches read_graph's undirected default)
                   1, # return hyperparameter: controls how readily a walk steps back to the previous node
                   1) # in-out hyperparameter: controls how readily a walk moves on to unexplored parts of the graph
# Transition probabilities are computed before any walks are sampled.
G.preprocess_transition_probs()
walks = G.simulate_walks(10, # num of walks: how many random walks for each node in graph 
                         80) # walk length: how many nodes in each random walk
# Train with 8 parallel workers; every other setting is learn_embeddings'
# default (128 dimensions, window 10, min_count 0, 1 epoch).
model = learn_embeddings(walks, workers=8)

In [None]:
# Print the title of one page followed by its nearest neighbours in the
# graph-embedding space. Node ids are strings because learn_embeddings
# stringifies every node before training.
# NOTE(review): `id_title` and `wikidata` are not defined in any visible cell of
# this notebook — confirm where they come from before a fresh Run All.
page_id = '31'
print('Page Title: {}'.format(id_title(wikidata, page_id)))
neighbours = model.wv.most_similar(page_id)
for neighbour_id, similarity in neighbours:
    neighbour_title = id_title(wikidata, neighbour_id)
    print('Similar to {}, with similarity score {}'
          .format(neighbour_title, similarity))
    

In [None]:
# Persist the trained graph-embedding Word2Vec model to disk.
model.save('../data/graph_embedding.model')

## Entity Text Embeddings

Currently, Word2Vec is trained only on the sequence of entities extracted from each text, rather than on the full Wikipedia text.

In [4]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load sample_data.pkl if it was produced by a trusted step of this project.
# Presumably `data` is a sequence whose items hold a dict at index 1, keyed by
# indexable objects whose first element is a string (that is how the next cell
# reads it) — TODO confirm against whatever wrote the pickle.
with open('../data/sample_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [8]:
# Collect one list of entity tokens per text in `data`.
# Each key of text[1] contributes key[0]; spaces are replaced with underscores
# so that a multi-word phrase stays a single token for phrase2vec.
import re
all_entities = [
    [key[0].replace(' ', '_') for key, _ in text[1].items()]
    for text in data
]

In [9]:
# using only a sequence of entities and not full sentences for now
# Each inner list of all_entities is passed to Word2Vec as one "sentence".
# min_count=1 keeps entities that occur only once. The model has 128
# dimensions, a context window of 10, 8 workers, and iter=100.
# NOTE(review): `size` and `iter` are the keyword names used by gensim releases
# before 4.0 — confirm the installed gensim version accepts them.
entity_model = Word2Vec(all_entities, size=128, window=10, min_count=1, workers=8, iter=100)

In [10]:
# Persist the trained entity-embedding Word2Vec model to disk.
entity_model.save('../data/entity_embedding.model')