In [1]:
# Parameters

# Root directory on the local machine; the output and temporary folders live under it
output_path = "/Users/pedroszekely/Downloads/kypher"

# Folder names, relative to output_path, for the outputs and for temporary files
output_folder = "schwarzenegger"
temp_folder = "temp.schwarzenegger"

# Full path of the output folder
data_folder = "/".join([output_path, output_folder])

# Directory that holds the cache database for kypher
cache_path = "/Users/pedroszekely/Downloads/kypher/temp.schwarzenegger"

# Shortcut for the kgtk command line; this variant is prefixed with `time` and passes --debug
kgtk = "time kgtk --debug"
# Variant without the `time` prefix:
# kgtk = "kgtk --debug"

In [2]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

from IPython.display import display, HTML

import papermill as pm

In [25]:
# The names of files in the KGTK Wikidata distribution that we will use in this notebook.
file_names = {
    "all": "Q2685.graph.all.tsv.gz",
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands.
# Every variable we set is also recorded in this list so that it can be printed at the end of the cell.
kgtk_environment_variables = []

for key, value in file_names.items():
    variable = key.upper()
    os.environ[variable] = data_folder + "/" + value
    kgtk_environment_variables.append(variable)

# KGTK creates a SQLite database to index the knowledge graph.
# Use the explicit cache location when one is configured, otherwise fall back to the temporary folder.
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    os.environ['STORE'] = "{}/{}/wikidata.sqlite3.db".format(output_path, temp_folder)
kgtk_environment_variables.append('STORE')

# We will create many temporary files, so set up a folder for outputs and one for the temporary files.
os.environ['TEMP'] = "{}/{}".format(output_path, temp_folder)
os.environ['OUT'] = "{}/{}".format(output_path, output_folder)
kgtk_environment_variables.append('TEMP')
kgtk_environment_variables.append('OUT')

# Path of the label file, stored in the KGTK_LABEL_FILE environment variable.
os.environ['KGTK_LABEL_FILE'] = "{}/{}".format(os.environ['OUT'], "Q2685.graph.label.tsv.gz")
# Record the same name that was just set: the printing loop below looks every
# recorded name up in os.environ and raises KeyError for a name that was never set.
kgtk_environment_variables.append('KGTK_LABEL_FILE')

# Environment variables with shortcuts to the commands we use often
os.environ['kgtk'] = kgtk
# Use for debugging, but careful as it causes import to dataframes to break
os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
# os.environ['kypher'] = "time kgtk query --graph-cache " + os.environ['STORE']
#os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
kgtk_environment_variables.append('kgtk')
kgtk_environment_variables.append('kypher')

# We'll save the current working directory so we can call into other example notebooks later
os.environ["EXAMPLES_DIR"] = os.getcwd()
kgtk_environment_variables.append('EXAMPLES_DIR')

# Show everything that was configured, in alphabetical order.
kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

ALL: "/Users/pedroszekely/Downloads/kypher/schwarzenegger/Q2685.graph.all.tsv.gz"
EXAMPLES_DIR: "/Users/pedroszekely/Downloads/kypher"
KGTK_LABEL: "/Users/pedroszekely/Downloads/kypher/schwarzenegger/Q2685.graph.label.tsv.gz"
OUT: "/Users/pedroszekely/Downloads/kypher/schwarzenegger"
STORE: "/Users/pedroszekely/Downloads/kypher/temp.schwarzenegger/wikidata.sqlite3.db"
TEMP: "/Users/pedroszekely/Downloads/kypher/temp.schwarzenegger"
kgtk: "time kgtk --debug"
kypher: "time kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.schwarzenegger/wikidata.sqlite3.db"


In [4]:
def kgtk_to_dataframe(kgtk):
    """Build a DataFrame from KGTK output given as a list of tab-separated lines.

    Parameters
    ----------
    kgtk : sequence of str
        The first line holds the tab-separated column names; every following
        line is one tab-separated row.

    Returns
    -------
    pandas.DataFrame
        One column per header field, with all values kept as strings.
        An empty input yields an empty DataFrame instead of an IndexError.
    """
    # A captured command that printed nothing is an empty list: there is no header to read.
    if len(kgtk) == 0:
        return pd.DataFrame()

    columns = kgtk[0].split("\t")
    rows = [line.split("\t") for line in kgtk[1:]]
    return pd.DataFrame(rows, columns=columns)

In [5]:
# Switch the notebook's working directory to output_path (the value is interpolated by IPython).
%cd {output_path}

/Users/pedroszekely/Downloads/kypher


Load the graph file into the Kypher graph cache under the alias `all`, so that the queries below can refer to it with `-i all`

In [6]:
# Read the graph file at $ALL, register it in the graph cache under the alias `all`,
# and print the first 3 edges.
!$kypher \
-i "$ALL" --as all \
--limit 3

[2021-09-28 12:13:50 sqlstore]: DROP graph data table graph_1 from all
[2021-09-28 12:13:53 sqlstore]: IMPORT graph directly into table graph_1 from /Users/pedroszekely/Downloads/kypher/schwarzenegger/Q2685.graph.all.tsv.gz ...
[2021-09-28 12:14:06 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id
P10	datatype	commonsMedia	P10-datatype
P1000	P31	Q18608871	P1000-P31-Q18608871-093affb5-0
P1001	P1647	P276	P1001-P1647-P276-e4e44f83-0
       17.05 real        21.87 user         2.82 sys


# To Do
- Partition the graph into the usual Wikidata files
- Compute the derived files

# Explore The Arnold Schwarzenegger Graph `Q2685`



In [7]:
# Count the distinct nodes that have a P31 edge pointing to Q5.
# NOTE(review): presumably P31 is "instance of" and Q5 is "human", making this a count of people; confirm.
!$kypher -i all \
--match '(n1)-[:P31]->(:Q5)' \
--return 'count(distinct n1)'

[2021-09-28 12:14:07 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."node1")
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P31', 'Q5']
---------------------------------------------
[2021-09-28 12:14:07 sqlstore]: CREATE INDEX on table graph_1 column label ...
[2021-09-28 12:14:08 sqlstore]: ANALYZE INDEX on table graph_1 column label ...
[2021-09-28 12:14:08 sqlstore]: CREATE INDEX on table graph_1 column node2 ...
[2021-09-28 12:14:11 sqlstore]: ANALYZE INDEX on table graph_1 column node2 ...
count(DISTINCT graph_1_c1."node1")
18446
        4.82 real         4.15 user         0.58 sys


Count organizations

Number of edges in the graph. The query below counts every distinct edge id in the file, so qualifier edges are included in this total.

In [10]:
# Count the distinct edge ids over all edges of the graph (the pattern places no restriction on the edge).
!$kypher -i all \
--match '()-[l]->()' \
--return 'count(distinct l)'

[2021-09-28 12:16:30 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."id")
     FROM graph_1 AS graph_1_c1
  PARAS: []
---------------------------------------------
count(DISTINCT graph_1_c1."id")
2421622
        3.56 real         2.80 user         0.71 sys


Count the number of qualifier edges

In [11]:
# Count the distinct edges q whose node1 is the id of another edge l, i.e. edges attached to an edge.
!$kypher -i all \
--match '()-[l]->(), (l)-[q]->()' \
--return 'count(distinct q)'

[2021-09-28 12:17:34 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c2."id")
     FROM graph_1 AS graph_1_c1
     INNER JOIN graph_1 AS graph_1_c2
     ON graph_1_c1."id" = graph_1_c2."node1"
  PARAS: []
---------------------------------------------
[2021-09-28 12:17:34 sqlstore]: CREATE INDEX on table graph_1 column id ...
[2021-09-28 12:17:36 sqlstore]: ANALYZE INDEX on table graph_1 column id ...
count(DISTINCT graph_1_c2."id")
630986
        4.25 real         3.48 user         0.68 sys


In [8]:
# Count the distinct nodes that have a P31 edge pointing to Q43229.
# NOTE(review): presumably Q43229 is "organization" and this is the "Count organizations" query; confirm.
!$kypher -i all \
--match '(n1)-[:P31]->(:Q43229)' \
--return 'count(distinct n1)'

[2021-09-28 12:14:12 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."node1")
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P31', 'Q43229']
---------------------------------------------
count(DISTINCT graph_1_c1."node1")
618
        1.05 real         0.85 user         0.16 sys


Schwarzenegger films. The list may be incomplete: a complete query needs to use P279star, but we don't have those files yet.

In [40]:
# Find the distinct nodes with a P161 edge pointing to Q2685, add labels to the result,
# convert it to HTML, and capture the output lines in h.
h = !$kypher -i all \
--match ' \
    (film)-[:P161]->(:Q2685)' \
--return 'distinct film as id' \
/ add-labels / html

# Join the captured lines into one HTML string and render it in the notebook.
display(HTML(" ".join(h)))

id,id;label
Q15140437,'Terminator Genisys'@en
Q162255,'The Terminator'@en
Q170564,'Terminator 2: Judgment Day'@en
Q200804,'Predator'@en
Q29054009,'Terminator 3: Rise of the Machines'@en
Q309003,'Conan the Barbarian'@en
Q39072454,'Terminator: Dark Fate'@en
Q728267,'T2-3D: Battle Across Time'@en
Q740516,'Conan the Destroyer'@en
Q858840,'Last Action Hero'@en


In [None]:
# Q630018 is the Bambi Award (used as the award in the next query)

In [26]:
# Find the distinct nodes with a P166 edge pointing to Q630018 and add their labels.
# The match variable is called `recipient` because the matched nodes are the subjects of
# the P166 edge; the returned column is still named `id`.
!$kypher -i all \
--match ' \
    (recipient)-[:P166]->(:Q630018)' \
--return 'distinct recipient as id' \
/ add-labels

[2021-09-28 12:59:37 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_1_c1."node1" "_aLias.id"
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P166', 'Q630018']
---------------------------------------------
id	id;label
Q162389	'Tony Curtis'@en
Q212648	'Rudy Giuliani'@en
Q2685	'Arnold Schwarzenegger'@en
Q342617	'Ben Whishaw'@en
Q450675	'Francis'@en
Q66286	'Rita Süssmuth'@en
Q74268	'Bernhard Vogel'@en
        2.25 real         2.63 user         0.44 sys
