In [1]:
from dataclasses import asdict
import json

from IPython.display import display, Markdown
import pandas as pd

import urllib3
# Silence urllib3's InsecureRequestWarning for the rest of the session.
# NOTE(review): presumably needed because requests to the CPS host are made
# without TLS certificate verification — confirm before reusing elsewhere.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
from deepsearch.cps.client.queries import Query
from deepsearch.cps.client.queries.task import Value
from deepsearch.cps.client.queries.query_tasks import Workflow
from deepsearch.cps.client.builders.wf_builder import WorkflowBuilder, WorkflowTaskOperation

from deepsearch.core.client import DeepSearchKeyAuth, DeepSearchConfig
from deepsearch.cps.client.api import CpsApi, CpsApiClient

## CPS Authentication

In [3]:
# Credentials are kept out of the notebook: they are read from a local JSON
# file which must provide the "email" and "api_key" entries.
auth_filename = "cps-auth.json" # this file should be populated with the credentials

# Read the file inside a context manager so the handle is closed as soon as
# the JSON has been parsed (a bare `open()` would leave it dangling).
with open(auth_filename) as auth_file:
    auth_data = json.load(auth_file)

auth = DeepSearchKeyAuth(
    username=auth_data["email"],
    api_key=auth_data["api_key"],
)

config = DeepSearchConfig(
    host="https://cps.foc-deepsearch.zurich.ibm.com", # IBM internal system
    auth=auth,
)

# `api` is the handle used by the following cells to reach the knowledge
# graphs and to run queries.
client = CpsApiClient(config)
api = CpsApi(client)

## Select KG

In [4]:
# Select the KG the queries below run against, identified by its project key
# and its bag key.
proj_key = "f21574fe745fa0b8213cb08d0c4166513108158c"
bag_key = "80511ed4a524fe446dc102e704fd7577f2c2a26c"

kg = api.knowledge_graphs.get(proj_key, bag_key)

## Paginated query 1

1. List the first 10 elements in the `texts` category
2. List the next 10 elements


In [5]:
## First page: the first 10 elements of the `texts` category

query = Query()
builder = WorkflowBuilder()

# The `nodes` output of the search is requested as a NODE_LIST with a limit
# of 10 entries and no offset.
first_page_nodes = {"type": "NODE_LIST", "parameters": {"limit": 10}}
search_operation = WorkflowTaskOperation(
    type="SEARCH",
    parameters={"categories": ["texts"]},
    outputs={"nodes": first_page_nodes},
)
list_task = builder.add(search_operation, type="INPUT")

# Bind the workflow to the selected KG and publish the search result under
# the output name `raw_nodes_page`.
workflow = Workflow(id="", builder=builder, inputs={}, coordinates=kg)
wf = query.add(workflow)
wf.output(list_task).output_as("raw_nodes_page")

In [6]:
# Execute the query through the CPS API and keep the returned object in
# `result`; its `outputs` mapping is what the next cell renders.
result = api.queries.run(query)

In [7]:
# Render every named output of the query: a markdown heading carrying the
# output name, followed by its `nodes` flattened into a table.
for name, output in result.outputs.items():
    heading = Markdown(f"## Output '{name}'")
    display(heading)
    nodes_table = pd.json_normalize(output['nodes'])
    display(nodes_table)

## Output 'raw_nodes_page'

Unnamed: 0,_categories,_db,_hash,_id,index,weight
0,[texts],658248536671399613,1261337fb523b6cd475338b082b21f29,617fbbe8a3ea97f4f0d54c51,2498,1
1,[texts],658248536671399613,c58714f96331aa5f26373144f03fc3a1,617fbbe8a3ea97f4f0d54c62,2499,1
2,[texts],658248536671399613,1b3ddb104c4d6417cb4557b67baf3edf,617fbbe8a3ea97f4f0d54c68,2500,1
3,[texts],658248536671399613,9bc5cb1e0f50b685dbeb5a6946fb6354,617fbbe8a3ea97f4f0d54c6a,2501,1
4,[texts],658248536671399613,25bec7e67bf534afcea16d0b05e0d7fb,617fbbe8a3ea97f4f0d54c6e,2502,1
5,[texts],658248536671399613,f74558b7bdd6f345a3eaad42124ca03b,617fbbe8a3ea97f4f0d54c6f,2503,1
6,[texts],658248536671399613,5ce7343d257f5123eb21b14173fb2d73,617fbbe8a3ea97f4f0d54c72,2504,1
7,[texts],658248536671399613,ac5941995a0fe67539bae027453ec392,617fbbe8a3ea97f4f0d54c74,2505,1
8,[texts],658248536671399613,bb1f6e2ff4dac1746f823c707773cf5e,617fbbe8a3ea97f4f0d54c7a,2506,1
9,[texts],658248536671399613,a7b334fbb2a730d1e386ad659d0e8535,617fbbe8a3ea97f4f0d54c7c,2507,1


In [8]:
## Second page: elements 10 to 19 of the `texts` category
## Compare the `index` column with the previous output to see that this page
## continues where the first one ended.

query = Query()
builder = WorkflowBuilder()

# Same search as for the first page; only the NODE_LIST parameters differ,
# adding an offset of 10 to the limit of 10.
second_page_nodes = {"type": "NODE_LIST", "parameters": {"limit": 10, "offset": 10}}
search_operation = WorkflowTaskOperation(
    type="SEARCH",
    parameters={"categories": ["texts"]},
    outputs={"nodes": second_page_nodes},
)
list_task = builder.add(search_operation, type="INPUT")

# Bind the workflow to the selected KG and publish the search result under
# the output name `raw_nodes_page`.
workflow = Workflow(id="", builder=builder, inputs={}, coordinates=kg)
wf = query.add(workflow)
wf.output(list_task).output_as("raw_nodes_page")

In [9]:
# Execute the query through the CPS API; `result` is overwritten with the
# outcome of this second, offset query.
result = api.queries.run(query)

In [10]:
# Render every named output of the query: a markdown heading carrying the
# output name, followed by its `nodes` flattened into a table.
for name, output in result.outputs.items():
    heading = Markdown(f"## Output '{name}'")
    display(heading)
    nodes_table = pd.json_normalize(output['nodes'])
    display(nodes_table)

## Output 'raw_nodes_page'

Unnamed: 0,_categories,_db,_hash,_id,index,weight
0,[texts],658248536671399613,bb24d16a10b1245972ade74ea8f50733,617fbbe8a3ea97f4f0d54c7e,2508,1
1,[texts],658248536671399613,e4cf745721de0e62069e878e3ac6095e,617fbbe8a3ea97f4f0d54c81,2509,1
2,[texts],658248536671399613,726d9decbf2fc748dfd0a9cf961c561d,617fbbe8a3ea97f4f0d54c84,2510,1
3,[texts],658248536671399613,c33e8a6f7d666d2fa7ca8b6a2510128a,617fbbe8a3ea97f4f0d54c86,2511,1
4,[texts],658248536671399613,424ba0a7c8adb677946c3ab0dd14cff6,617fbbe8a3ea97f4f0d54c89,2512,1
5,[texts],658248536671399613,e6d3721cd9c562d59dda697c7370ea36,617fbbe8a3ea97f4f0d54c8c,2513,1
6,[texts],658248536671399613,0cf4ef7d849470cfb38dd6184a30e7ba,617fbbe8a3ea97f4f0d54c8e,2514,1
7,[texts],658248536671399613,96246b5919dbe473587d6b6127e942a4,617fbbe8a3ea97f4f0d54c91,2515,1
8,[texts],658248536671399613,145bc0841c6590b3ebe919a4b9e7fbe4,617fbbe8a3ea97f4f0d54c93,2516,1
9,[texts],658248536671399613,dffaafb6dfb7b7fbd1a241fb2869d748,617fbbe8a3ea97f4f0d54c95,2517,1


## Paginated query 2


Retrieve the `text` of the nodes with the `RetrieveNodeData` task, looping over the results in pages of 10 elements.

In [11]:
def _build_text_page_query(kg, limit, offset):
    """Build the query that retrieves the text of one page of `texts` nodes.

    The query chains three tasks:

    1. a SEARCH workflow on the `texts` category whose `nodes` output is a
       NODE_LIST restricted by ``limit`` and ``offset``;
    2. a ``Projection`` task fed with the search output and configured with
       the field path ``["nodes"]``;
    3. a ``RetrieveNodeData`` task requesting the ``text`` field of the
       projected nodes, published as the ``nodes_data`` output.

    Args:
        kg: Knowledge graph used as coordinates for the workflow and for the
            node data retrieval.
        limit: Number of nodes requested for the page.
        offset: Position of the first node of the page.

    Returns:
        The assembled ``Query``, ready to be passed to ``api.queries.run``.
    """
    query = Query()
    builder = WorkflowBuilder()

    list_task = builder.add(
        WorkflowTaskOperation(
            type="SEARCH",
            parameters={
                "categories": ["texts"],
            },
            outputs={"nodes": {"type": "NODE_LIST", "parameters": {"limit": limit, "offset": offset}}}
        ),
        type="INPUT",
    )

    wf = query.add(Workflow(id="", builder=builder, inputs={}, coordinates=kg))
    # Useful for debugging: also expose the raw search result
    #wf.output(list_task).output_as("raw_nodes_page")

    projection = query.add(
        "Projection",
        inputs={"nodes": wf.output(list_task)},
        parameters={"projections": {"nodes": {"field_path": ["nodes"]}}},
    )

    node_data = query.add(
        "RetrieveNodeData",
        inputs={"nodes": projection.output("nodes")},
        parameters={"projection": {
            "text": True
        }},
        coordinates=kg
    )
    node_data.output("nodes").output_as("nodes_data")

    return query


# Pagination settings: at most `num_pages` pages of `page_size` elements each
page_size = 10
num_pages = 10

# List of all results, accumulated page after page
all_results = []

for page_no in range(num_pages):
    offset = page_no * page_size
    print(f"Querying {page_size} elements from {offset}...")

    # Build and execute the query for this page
    query = _build_text_page_query(kg, limit=page_size, offset=offset)
    result = api.queries.run(query)

    page_results = result.outputs['nodes_data']
    if not page_results:
        # An empty page means the category is exhausted: do not keep sending
        # requests for the remaining pages.
        print(f"No elements returned from {offset}, stopping.")
        break
    all_results.extend(page_results)

print(f"Done. Got {len(all_results)} elements.")

Querying 10 elements from 0...
Querying 10 elements from 10...
Querying 10 elements from 20...
Querying 10 elements from 30...
Querying 10 elements from 40...
Querying 10 elements from 50...
Querying 10 elements from 60...
Querying 10 elements from 70...
Querying 10 elements from 80...
Querying 10 elements from 90...
Done. Got 100 elements.


In [12]:
# Flatten the accumulated node records into a single table and render it
all_results_table = pd.json_normalize(all_results)
display(all_results_table)

Unnamed: 0,_id,text,_db
0,617fbbe8a3ea97f4f0d54c51,Align IBM Journal of Research & Development th...,658248536671399613
1,617fbbe8a3ea97f4f0d54c62,Data Center Information Management (DCIM),658248536671399613
2,617fbbe8a3ea97f4f0d54c68,Yorktown Audio Visual Event Support/Modernization,658248536671399613
3,617fbbe8a3ea97f4f0d54c6a,Modernize Yorktown VOIP telephony sytems inclu...,658248536671399613
4,617fbbe8a3ea97f4f0d54c6e,IBM Research Developer Tools Catalog,658248536671399613
...,...,...,...
95,617fbbe8a3ea97f4f0d54ec7,DARPA Knowledge-directed Artificial Intelligen...,658248536671399613
96,617fbbe8a3ea97f4f0d54f1c,Advanced Speech Recognition for Customer Care,658248536671399613
97,617fbbe8a3ea97f4f0d54f1e,AI based Network Insights Solutions,658248536671399613
98,617fbbe8a3ea97f4f0d54f1f,AI Great Challenges in the IBM/USP/FAPESP Cent...,658248536671399613
