In [1]:
from dataclasses import asdict
import json

from IPython.display import display, Markdown
import pandas as pd

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
from deepsearch.cps.client.queries import Query
from deepsearch.cps.client.queries.task import Value
from deepsearch.cps.client.queries.query_tasks import ForEach, Workflow

from deepsearch.core.client import DeepSearchKeyAuth, DeepSearchConfig
from deepsearch.cps.client.api import CpsApi, CpsApiClient

## CPS Authentication

In [3]:
# Load the CPS credentials from a local JSON file; it must provide the
# "email" and "api_key" entries that are read below.
auth_filename = "cps-auth.json" # this file should be populated with the credentials
with open(auth_filename) as auth_file:  # the context manager closes the handle after parsing
    auth_data = json.load(auth_file)

auth = DeepSearchKeyAuth(
    username=auth_data["email"],
    api_key=auth_data["api_key"],
)

config = DeepSearchConfig(
    host="https://cps.foc-deepsearch.zurich.ibm.com", # IBM internal system
    auth=auth,
)

# Build the client from the config and wrap it in the CpsApi object used by the following cells
client = CpsApiClient(config)
api = CpsApi(client)

## Select KG

In [4]:
# Select the knowledge graph (KG) that is passed as `coordinates` to the queries below
proj_key = "f21574fe745fa0b8213cb08d0c4166513108158c"
bag_key = "80511ed4a524fe446dc102e704fd7577f2c2a26c"
kg = api.knowledge_graphs.get(proj_key, bag_key)

## Foreach example 1

For a given time period (January 2020 to March 2020), find the conferences where IBMers published papers, the challenges those papers were related to (count & list), and the researchers who authored those papers (count & list).


In [5]:
query = Query()

## 1. Find conferences in time period
##   i) Select 2020_1, 2020_2, 2020_3 nodes
##  ii) Traverse to all papers in that period
## iii) Traverse to conferences hosting these papers
find_conf_filename = './workflows/conferences_in_time_period.json'
with open(find_conf_filename) as wf_file:  # close the handle once the JSON is parsed
    find_conf_wf_query = json.load(wf_file)

find_conf_wf = query.add(
    "Workflow",
    parameters={
        "workflow": find_conf_wf_query['template']
    },
    coordinates=kg
)

# Expose every output declared in the workflow file under its configured name
for k, spec in find_conf_wf_query.get("outputs", {}).items():
    find_conf_wf.output(k).output_as(spec["name"])


## 2. For each conference make stats
##   i) Select current conference
##  ii) Traverse from conference to papers
## iii) Traverse from papers to challenges
##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": -1}}}
##      ^ this will output all nodes, without any pagination

# Project the "nodes" field out of the conferences output; the ForEach iterates over it
projection = query.add(
    "Projection",
    inputs={"nodes": find_conf_wf.output("conferences")},
    parameters={"projections": {"nodes": {"field_path": ["nodes"]}}},
)

wf_for_each = query.add(
    ForEach(id="", query=Query(), items=projection.output("nodes")),
)

conf_stats_filename = './workflows/conference_stats_1.json'
with open(conf_stats_filename) as wf_file:  # close the handle once the JSON is parsed
    conf_stats_wf_query = json.load(wf_file)

# set the input such that it uses the current element in the foreach iteration
conf_stats_wf_query['template'][0]['operation']['parameters']['ids'] = [wf_for_each.current_element()]

# The per-conference workflow is added to the ForEach's own nested query
conf_stats_wf = wf_for_each.query.add(
    "Workflow",
    parameters={
        "workflow": conf_stats_wf_query['template']
    },
    coordinates=kg
)

for k, spec in conf_stats_wf_query.get('outputs', {}).items():
    conf_stats_wf.output(k).output_as(spec['name'])

# Publish the collected ForEach result as the "for_each_node" output
wf_for_each.outputs.result.output_as("for_each_node")


In [6]:
# Execute the query: submit it through the CPS API and keep the returned
# object in `result`; its `outputs` are read by the following cells
result = api.queries.run(query)

In [7]:
# Render every named query output as a table under its own heading
for name, output in result.outputs.items():
    display(Markdown(f"## Output '{name}'"))
    # outputs carrying a 'nodes' entry are tabulated from it; anything else is tabulated as-is
    table_source = output['nodes'] if 'nodes' in output else output
    display(pd.json_normalize(table_source))

## Output 'conferences'

Unnamed: 0,_categories,_db,_hash,_id,_name,_synonyms,index,weight
0,[conferences],7279829564703121795,334c0c8f7d90cac194518789ad9e072f,617fbbcbc3987115676c504c,International Conference on Computer Vision an...,[International Conference on Computer Vision a...,421322,27
1,[conferences],7279829564703121795,cfb1c3bbed5164da81f023fd32d55a79,617fbbcbc3987115676c505a,Conference on Artificial Intelligence 2020,[Conference on Artificial Intelligence 2020],421348,21
2,[conferences],7279829564703121795,4daf3ce80c5924a360a874a4a9f73998,617fbbcb1602dbef2bdebc89,"International Conference on Acoustics, Speech,...","[International Conference on Acoustics, Speech...",421339,11
3,[conferences],7279829564703121795,48b6b01822100adac5f2741475c9b47a,617fbbccc3987115676c5075,SPIE Advanced Lithography 2020,[SPIE Advanced Lithography 2020],421399,11
4,[conferences],7279829564703121795,bb1e688f3d78d7ebd55d9d9424526b67,617fbbccc3987115676c5085,ACM Conference on Human Factors in Computing S...,[ACM Conference on Human Factors in Computing ...,421422,8


## Output 'for_each_node'

Unnamed: 0,index,item._categories,item._db,item._hash,item._id,item._name,item._synonyms,item.index,item.weight,outputs.challenges.#-found-nodes,outputs.challenges.nodes
0,0,[conferences],7279829564703121795,334c0c8f7d90cac194518789ad9e072f,617fbbcbc3987115676c504c,International Conference on Computer Vision an...,[International Conference on Computer Vision a...,421322,27,4,"[{'_categories': ['challenges'], '_db': '14660..."
1,1,[conferences],7279829564703121795,cfb1c3bbed5164da81f023fd32d55a79,617fbbcbc3987115676c505a,Conference on Artificial Intelligence 2020,[Conference on Artificial Intelligence 2020],421348,21,30,"[{'_categories': ['challenges'], '_db': '14660..."
2,2,[conferences],7279829564703121795,4daf3ce80c5924a360a874a4a9f73998,617fbbcb1602dbef2bdebc89,"International Conference on Acoustics, Speech,...","[International Conference on Acoustics, Speech...",421339,11,5,"[{'_categories': ['challenges'], '_db': '14660..."
3,3,[conferences],7279829564703121795,48b6b01822100adac5f2741475c9b47a,617fbbccc3987115676c5075,SPIE Advanced Lithography 2020,[SPIE Advanced Lithography 2020],421399,11,2,"[{'_categories': ['challenges'], '_db': '14660..."
4,4,[conferences],7279829564703121795,bb1e688f3d78d7ebd55d9d9424526b67,617fbbccc3987115676c5085,ACM Conference on Human Factors in Computing S...,[ACM Conference on Human Factors in Computing ...,421422,8,1,"[{'_categories': ['challenges'], '_db': '14660..."


In [8]:
# Cleaner visualization: one row per conference with its challenge labels
clean_results = [
    {
        'conference': entry['item']['_name'],
        # each challenge `_name` holds the full title; keep only the part before the first ':'
        'challenges': ', '.join(
            node['_name'].split(':')[0]
            for node in entry['outputs']['challenges']['nodes']
        ),
    }
    for entry in result.outputs['for_each_node']
]
df = pd.json_normalize(clean_results)
display(df)

Unnamed: 0,conference,challenges
0,International Conference on Computer Vision an...,"challenge [15], challenge [2540], challenge [1..."
1,Conference on Artificial Intelligence 2020,"challenge [965], challenge [1292], challenge [..."
2,"International Conference on Acoustics, Speech,...","challenge [1491], challenge [1906], challenge ..."
3,SPIE Advanced Lithography 2020,"challenge [1499], challenge [1302]"
4,ACM Conference on Human Factors in Computing S...,challenge [2029]


## Foreach example 2

Initially the same as example 1, but the ForEach produces only counts:
- Count of challenges related to the conference
- Count of researchers who contributed to the conference


In [9]:
query = Query()

## 1. Find conferences in time period
##   i) Select 2020_1, 2020_2, 2020_3 nodes
##  ii) Traverse to all papers in that period
## iii) Traverse to conferences hosting these papers
find_conf_filename = './workflows/conferences_in_time_period.json'
with open(find_conf_filename) as wf_file:  # close the handle once the JSON is parsed
    find_conf_wf_query = json.load(wf_file)

find_conf_wf = query.add(
    "Workflow",
    parameters={
        "workflow": find_conf_wf_query['template']
    },
    coordinates=kg
)

# Expose every output declared in the workflow file under its configured name
for k, spec in find_conf_wf_query.get("outputs", {}).items():
    find_conf_wf.output(k).output_as(spec["name"])


## 2. For each conference make stats
##   i) Select current conference
##  ii) Traverse from conference to papers
##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": 0}}}
##       ^ this allows returning just the total number; it reduces the data transfer when the nodes are not needed.
## iii) Traverse from papers to authors
##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": 0}}}
##       ^ this allows returning just the total number; it reduces the data transfer when the nodes are not needed.
##  iv) Traverse from papers to challenges
##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": 0}}}
##       ^ this allows returning just the total number; it reduces the data transfer when the nodes are not needed.

# Project the "nodes" field out of the conferences output; the ForEach iterates over it
projection = query.add(
    "Projection",
    inputs={"nodes": find_conf_wf.output("conferences")},
    parameters={"projections": {"nodes": {"field_path": ["nodes"]}}},
)

wf_for_each = query.add(
    ForEach(id="", query=Query(), items=projection.output("nodes")),
)

conf_stats_filename = './workflows/conference_stats_2.json'
with open(conf_stats_filename) as wf_file:  # close the handle once the JSON is parsed
    conf_stats_wf_query = json.load(wf_file)

# set the input such that it uses the current element in the foreach iteration
conf_stats_wf_query['template'][0]['operation']['parameters']['ids'] = [wf_for_each.current_element()]

# The per-conference workflow is added to the ForEach's own nested query
conf_stats_wf = wf_for_each.query.add(
    "Workflow",
    parameters={
        "workflow": conf_stats_wf_query['template']
    },
    coordinates=kg
)

for k, spec in conf_stats_wf_query.get('outputs', {}).items():
    conf_stats_wf.output(k).output_as(spec['name'])

# Publish the collected ForEach result as the "for_each_node" output
wf_for_each.outputs.result.output_as("for_each_node")


In [10]:
# Execute the query: submit it through the CPS API and keep the returned
# object in `result`; its `outputs` are read by the following cells
result = api.queries.run(query)

In [11]:
# Render every named query output as a table under its own heading
for name, output in result.outputs.items():
    display(Markdown(f"## Output '{name}'"))
    # outputs carrying a 'nodes' entry are tabulated from it; anything else is tabulated as-is
    table_source = output['nodes'] if 'nodes' in output else output
    display(pd.json_normalize(table_source))

## Output 'conferences'

Unnamed: 0,_categories,_db,_hash,_id,_name,_synonyms,index,weight
0,[conferences],7279829564703121795,334c0c8f7d90cac194518789ad9e072f,617fbbcbc3987115676c504c,International Conference on Computer Vision an...,[International Conference on Computer Vision a...,421322,27
1,[conferences],7279829564703121795,cfb1c3bbed5164da81f023fd32d55a79,617fbbcbc3987115676c505a,Conference on Artificial Intelligence 2020,[Conference on Artificial Intelligence 2020],421348,21
2,[conferences],7279829564703121795,4daf3ce80c5924a360a874a4a9f73998,617fbbcb1602dbef2bdebc89,"International Conference on Acoustics, Speech,...","[International Conference on Acoustics, Speech...",421339,11
3,[conferences],7279829564703121795,48b6b01822100adac5f2741475c9b47a,617fbbccc3987115676c5075,SPIE Advanced Lithography 2020,[SPIE Advanced Lithography 2020],421399,11
4,[conferences],7279829564703121795,bb1e688f3d78d7ebd55d9d9424526b67,617fbbccc3987115676c5085,ACM Conference on Human Factors in Computing S...,[ACM Conference on Human Factors in Computing ...,421422,8


## Output 'for_each_node'

Unnamed: 0,index,item._categories,item._db,item._hash,item._id,item._name,item._synonyms,item.index,item.weight,outputs.papers.#-found-nodes,outputs.papers.nodes,outputs.challenges.#-found-nodes,outputs.challenges.nodes,outputs.researchers.#-found-nodes,outputs.researchers.nodes
0,0,[conferences],7279829564703121795,334c0c8f7d90cac194518789ad9e072f,617fbbcbc3987115676c504c,International Conference on Computer Vision an...,[International Conference on Computer Vision a...,421322,27,36,[],4,[],60,[]
1,1,[conferences],7279829564703121795,cfb1c3bbed5164da81f023fd32d55a79,617fbbcbc3987115676c505a,Conference on Artificial Intelligence 2020,[Conference on Artificial Intelligence 2020],421348,21,74,[],30,[],294,[]
2,2,[conferences],7279829564703121795,4daf3ce80c5924a360a874a4a9f73998,617fbbcb1602dbef2bdebc89,"International Conference on Acoustics, Speech,...","[International Conference on Acoustics, Speech...",421339,11,19,[],5,[],72,[]
3,3,[conferences],7279829564703121795,48b6b01822100adac5f2741475c9b47a,617fbbccc3987115676c5075,SPIE Advanced Lithography 2020,[SPIE Advanced Lithography 2020],421399,11,13,[],2,[],75,[]
4,4,[conferences],7279829564703121795,bb1e688f3d78d7ebd55d9d9424526b67,617fbbccc3987115676c5085,ACM Conference on Human Factors in Computing S...,[ACM Conference on Human Factors in Computing ...,421422,8,10,[],1,[],31,[]


In [12]:
# Cleaner visualization: keep only the conference name and the three count columns
count_columns = [
    'item._name',
    'outputs.researchers.#-found-nodes',
    'outputs.challenges.#-found-nodes',
    'outputs.papers.#-found-nodes',
]
df = pd.json_normalize(result.outputs['for_each_node'])
display(df[count_columns])

Unnamed: 0,item._name,outputs.researchers.#-found-nodes,outputs.challenges.#-found-nodes,outputs.papers.#-found-nodes
0,International Conference on Computer Vision an...,60,4,36
1,Conference on Artificial Intelligence 2020,294,30,74
2,"International Conference on Acoustics, Speech,...",72,5,19
3,SPIE Advanced Lithography 2020,75,2,13
4,ACM Conference on Human Factors in Computing S...,31,1,10


## Foreach example 3

Repeat the query multiple times, paginating through the results of the first (conference-finding) query.


In [13]:
# List of all results
page_size = 5
num_pages = 4
all_results = []

find_conf_filename = './workflows/conferences_in_time_period.json'
conf_stats_filename = './workflows/conference_stats_2.json'

# Read both workflow files once, closing the handles right away. Each page
# re-parses the text below, so the per-page edits always start from a fresh
# copy of the templates.
with open(find_conf_filename) as wf_file:
    find_conf_wf_text = wf_file.read()
with open(conf_stats_filename) as wf_file:
    conf_stats_wf_text = wf_file.read()


for page_no in range(num_pages):
    offset = page_no * page_size
    print(f"Querying {page_size} elements from {offset}...")

    query = Query()

    ## 1. Find conferences in time period
    ##   i) Select 2020_1, 2020_2, 2020_3 nodes
    ##  ii) Traverse to all papers in that period
    ## iii) Traverse to conferences hosting these papers
    find_conf_wf_query = json.loads(find_conf_wf_text)

    # Set the pagination parameters (see JSON workflow for more context)
    find_conf_wf_query['template'][2]['operation']['outputs']['nodes'] = {"type": "NODE_LIST", "parameters": {"limit": page_size, "offset": offset}}

    find_conf_wf = query.add(
        "Workflow",
        parameters={
            "workflow": find_conf_wf_query['template']
        },
        coordinates=kg
    )

    # Expose every output declared in the workflow file under its configured name
    for k, spec in find_conf_wf_query.get("outputs", {}).items():
        find_conf_wf.output(k).output_as(spec["name"])


    ## 2. For each conference make stats
    ##   i) Select current conference
    ##  ii) Traverse from conference to papers
    ##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": 0}}}
    ##       ^ this allows returning just the total number; it reduces the data transfer when the nodes are not needed.
    ## iii) Traverse from papers to authors
    ##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": 0}}}
    ##       ^ this allows returning just the total number; it reduces the data transfer when the nodes are not needed.
    ##  iv) Traverse from papers to challenges
    ##      "outputs": {"nodes": {"type": "NODE_LIST", "parameters": {"limit": 0}}}
    ##       ^ this allows returning just the total number; it reduces the data transfer when the nodes are not needed.

    # Project the "nodes" field out of the conferences output; the ForEach iterates over it
    projection = query.add(
        "Projection",
        inputs={"nodes": find_conf_wf.output("conferences")},
        parameters={"projections": {"nodes": {"field_path": ["nodes"]}}},
    )

    wf_for_each = query.add(
        ForEach(id="", query=Query(), items=projection.output("nodes")),
    )

    conf_stats_wf_query = json.loads(conf_stats_wf_text)

    # set the input such that it uses the current element in the foreach iteration
    conf_stats_wf_query['template'][0]['operation']['parameters']['ids'] = [wf_for_each.current_element()]

    # The per-conference workflow is added to the ForEach's own nested query
    conf_stats_wf = wf_for_each.query.add(
        "Workflow",
        parameters={
            "workflow": conf_stats_wf_query['template']
        },
        coordinates=kg
    )

    for k, spec in conf_stats_wf_query.get('outputs', {}).items():
        conf_stats_wf.output(k).output_as(spec['name'])

    # Publish the collected ForEach result as the "for_each_node" output
    wf_for_each.outputs.result.output_as("for_each_node")

    # Execute the query and accumulate this page's rows
    result = api.queries.run(query)
    all_results.extend(result.outputs['for_each_node'])

print(f"Done. Got {len(all_results)} elements.")

Querying 5 elements from 0...
Querying 5 elements from 5...
Querying 5 elements from 10...
Querying 5 elements from 15...
Done. Got 20 elements.


In [14]:
# Cleaner visualization: keep only the conference name and the three count columns
count_columns = [
    'item._name',
    'outputs.researchers.#-found-nodes',
    'outputs.challenges.#-found-nodes',
    'outputs.papers.#-found-nodes',
]
df = pd.json_normalize(all_results)
display(df[count_columns])

Unnamed: 0,item._name,outputs.researchers.#-found-nodes,outputs.challenges.#-found-nodes,outputs.papers.#-found-nodes
0,International Conference on Computer Vision an...,60,4,36
1,Conference on Artificial Intelligence 2020,294,30,74
2,"International Conference on Acoustics, Speech,...",72,5,19
3,SPIE Advanced Lithography 2020,75,2,13
4,ACM Conference on Human Factors in Computing S...,31,1,10
5,International Conference on Learning Represent...,49,7,17
6,Conference on Human Factors in Computing Syste...,64,4,11
7,Intelligent User Interfaces 2020,40,4,9
8,International Joint Conference on Artificial I...,75,7,24
9,The World Congress of Medical and Health Infor...,17,0,5
