In [1]:
from dataclasses import asdict
import json

from IPython.display import display, Markdown
import pandas as pd

import urllib3
# Silence urllib3's InsecureRequestWarning for the rest of the kernel session.
# NOTE(review): presumably needed because the HTTPS requests made below are not
# certificate-verified -- confirm, and prefer fixing verification over hiding the warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
from deepsearch.cps.client.queries import Query
from deepsearch.cps.client.queries.task import Value
from deepsearch.cps.client.queries.query_tasks import Workflow
from deepsearch.cps.client.builders.wf_builder import WorkflowBuilder, WorkflowTaskOperation

from deepsearch.core.client import DeepSearchKeyAuth, DeepSearchConfig
from deepsearch.cps.client.api import CpsApi, CpsApiClient

## CPS Authentication

In [3]:
# Load the CPS credentials from a local JSON file.
# The file must provide the keys "email" and "api_key", which are read below.
auth_filename = "cps-auth.json" # this file should be populated with the credentials
# Use a context manager so the file handle is closed as soon as the JSON is parsed
with open(auth_filename) as auth_file:
    auth_data = json.load(auth_file)

# Key-based authentication built from the credentials file
auth = DeepSearchKeyAuth(
    username=auth_data["email"],
    api_key=auth_data["api_key"],
)

# Connection settings: target host plus the authentication object
config = DeepSearchConfig(
    host="https://cps.foc-deepsearch.zurich.ibm.com", # IBM internal system
    auth=auth,
)

# `api` is the entry point used by all the following cells
client = CpsApiClient(config)
api = CpsApi(client)

## Select KG

In [4]:
# Coordinates identifying the knowledge graph to query
kg_coordinates = (
    "f21574fe745fa0b8213cb08d0c4166513108158c",  # proj_key
    "80511ed4a524fe446dc102e704fd7577f2c2a26c",  # bag_key
)

# Fetch the KG handle; it is passed as `coordinates` to the queries below
kg = api.knowledge_graphs.get(*kg_coordinates)

## Inspect the KG topology

In [5]:
# Get the KG topology
topo_resp = kg.get_topology()

# One row per node category: its name and its size
topo_nodes = [
    {
        'name': node['name'],
        'size': node['size'],
    }
    for node in topo_resp['nodes']['categories']
]

# One row per edge: its name, the source/target categories and its size.
# The size is read from the edge matrix 'NNZ' field
# (presumably the number of non-zero entries -- TODO confirm).
topo_edges = [
    {
        'name': edge['name'],
        'source': edge['categories']['source'],
        'target': edge['categories']['target'],
        'size': edge['matrix']['NNZ'],
    }
    for edge in topo_resp['edges']
]


# Render both tables, each preceded by a bold heading
display(Markdown('**Nodes**'), pd.json_normalize(topo_nodes))
display(Markdown('**Edges**'), pd.json_normalize(topo_edges))

**Nodes**

Unnamed: 0,name,size
0,accomplishments,85
1,authors,6814
2,challenges,1777
3,conferences,764
4,data,229
5,key-phrases,209651
6,papers,2498
7,pillars,9
8,recognition,2244
9,subthemes,166


**Edges**

Unnamed: 0,name,source,target,size
0,accomplishments-to-authors,"[accomplishments, authors]","[accomplishments, authors]",3904
1,accomplishments-to-texts,"[accomplishments, texts]","[accomplishments, texts]",2886
2,challenges-to-authors,"[authors, challenges]","[authors, challenges]",27068
3,challenges-to-pillars,"[challenges, pillars]","[challenges, pillars]",3554
4,challenges-to-subthemes,"[challenges, subthemes]","[challenges, subthemes]",2548
5,challenges-to-texts,"[challenges, texts]","[challenges, texts]",50404
6,challenges-to-themes,"[challenges, themes]","[challenges, themes]",3554
7,papers-to-authors,"[authors, papers]","[authors, papers]",24472
8,papers-to-challenges,"[challenges, papers]","[challenges, papers]",2374
9,papers-to-conferences,"[conferences, papers]","[conferences, papers]",4996


## Query 1

1. Create a Workflow with:
    1. Search for a term
    2. Traverse to texts nodes (similar to paragraphs)
2. Project the nodeData to retrieve the text

In [6]:
query = Query()
builder = WorkflowBuilder()

# Workflow step 1 (input of the workflow): SEARCH for the term "Modernization"
search_op = WorkflowTaskOperation(
    type="SEARCH",
    parameters={"names": ["Modernization"], "type": "contains"},
)
input_task = builder.add(search_op, type="INPUT")

# Workflow step 2: EDGE-TRAVERSAL along the "texts-to-terms" edge,
# starting from the result of the search task
traversal_op = WorkflowTaskOperation(
    type="EDGE-TRAVERSAL",
    parameters={
        "edges": [
            {"index": builder.index(input_task), "name": "texts-to-terms"},
        ],
    },
)
term_to_text = builder.add(traversal_op, inputs=[input_task])

# Register the workflow in the query, bound to the selected KG
wf = query.add(Workflow(id="", builder=builder, inputs={}, coordinates=kg))

# Expose the traversal result as the "raw_nodes" output.
# It is only an intermediate result: the next task does not need it to be
# exported, so it can be dropped to save bandwidth.
wf.output(term_to_text).output_as("raw_nodes")

# Fetch the "text" field of the nodes reached by the traversal
node_data = query.add(
    "RetrieveNodeData",
    inputs={"nodes": wf.output(term_to_text)},
    parameters={"projection": {"text": True}},
    coordinates=kg,
)
node_data.output("nodes").output_as("nodes_data")


# For debug: print the whole query
# print(json.dumps(query.to_flow(), indent=2))

In [7]:
# Run the query against the API
result = api.queries.run(query)

# Shape of the result:
# `result.outputs` is a dict with one entry per output declared on the query.
# Here it contains `result.outputs["nodes_data"]`, because the previous cell
# declared it via `node_data.output("nodes").output_as("nodes_data")`.

# Show every output as a table under its own heading
for name, output in result.outputs.items():
    heading = Markdown(f"## Output '{name}'")
    display(heading)
    table = pd.json_normalize(output)
    display(table)

## Output 'raw_nodes'

Unnamed: 0,_categories,_db,_hash,_id,index,weight
0,[texts],658248536671399613,012793e1cbcb2643e3199b1f93e8562d,617fbbeda3ea97f4f0d61031,20383,0.997979
1,[texts],658248536671399613,92a386a36d405b069d1d583d46a42dbd,617fbc25a3ea97f4f0d75560,34906,0.817296
2,[texts],658248536671399613,ceb50d7ad8b5cbf05091a74d5d012357,617fbc24a3ea97f4f0d7515e,34305,0.807669
3,[texts],658248536671399613,ed8417685ce9f270b576be6ded836c8c,617fbc25a3ea97f4f0d75552,34901,0.799954
4,[texts],658248536671399613,42b55a5e2b4b5416e44c933088d9a472,617fbbeda3ea97f4f0d609c8,19208,0.779406
5,[texts],658248536671399613,9cfcd1d280f6f005f1020957708491be,617fbbe8a3ea97f4f0d5687b,3237,0.720268
6,[texts],658248536671399613,b712afc3e35540ed12e84b0635a57099,617fbbe8a3ea97f4f0d552ee,2693,0.720241
7,[texts],658248536671399613,61af3b2cd488e05e2494f36d9745d9f2,617fbbe8a3ea97f4f0d565fe,3166,0.705901
8,[texts],658248536671399613,795a3c2fda651e45a037c57be6bca6a4,617fbbeba3ea97f4f0d5d820,15404,0.57288
9,[texts],658248536671399613,f9dc4f9aadb11ab4f9b009f966eb4c68,617fbbe8a3ea97f4f0d55c5d,2898,0.540779


## Output 'nodes_data'

Unnamed: 0,_id,text,_db
0,617fbbe8a3ea97f4f0d54c68,Yorktown Audio Visual Event Support/Modernization,658248536671399613
1,617fbbe8a3ea97f4f0d54c7a,China A/V and Telephony Support and Modernization,658248536671399613
2,617fbbe8a3ea97f4f0d54d22,Modernization and securing of Haifa File Stora...,658248536671399613
3,617fbbe8a3ea97f4f0d54f28,MAM: AI assisted Mainframe Application Moderni...,658248536671399613
4,617fbbe8a3ea97f4f0d552ee,Automation of Data Inventory Model Discovery &...,658248536671399613
5,617fbbe8a3ea97f4f0d55c5d,Intelligent Application Insights for Continuou...,658248536671399613
6,617fbbe8a3ea97f4f0d565f4,Mainframe Application Modernization using AI a...,658248536671399613
7,617fbbe8a3ea97f4f0d565fb,AI infused App Modernization Execution and Val...,658248536671399613
8,617fbbe8a3ea97f4f0d565fe,Data Modernization,658248536671399613
9,617fbbe8a3ea97f4f0d5687b,TargetDB Advisor for Data Modernization,658248536671399613


## Query 2

1. Run `RetrieveNodeData` directly on an explicit list of nodes (each given by its `_db` and `_id`), without any workflow step

In [8]:
# Nodes to retrieve, each identified by its `_db` and `_id` values
# (the same fields that appear in the outputs of Query 1)
nodes = [
    {
        "_db": "658248536671399613",
        "_id": "617fbbe8a3ea97f4f0d565f4"
    },
    {
        "_db": "658248536671399613",
        "_id": "617fbbeba3ea97f4f0d5e1b8"
    }
]

query = Query()

# No workflow is involved here: the node list is fed to the task as a literal
# value, so no WorkflowBuilder is created for this query.
node_data = query.add(
    "RetrieveNodeData",
    inputs={"nodes": Value(nodes)},
    parameters={"projection": {
        "text": True
    }},
    coordinates=kg
)
# Expose the task's "nodes" result under the output name "nodes_data"
node_data.output("nodes").output_as("nodes_data")


# For debug: print the whole query
#print(json.dumps(query.to_flow(), indent=2))

In [9]:
# Run the query against the API
result = api.queries.run(query)

# Show every output as a table under its own heading
for name, output in result.outputs.items():
    heading = Markdown(f"## Output '{name}'")
    display(heading)
    table = pd.json_normalize(output)
    display(table)

## Output 'nodes_data'

Unnamed: 0,_id,text,_db
0,617fbbe8a3ea97f4f0d565f4,Mainframe Application Modernization using AI a...,658248536671399613
1,617fbbeba3ea97f4f0d5e1b8,Our vision is to consolidate and to integrate ...,658248536671399613
