In [1]:
import json

from IPython.display import display, Markdown
import pandas as pd

import urllib3
# Turn off urllib3's InsecureRequestWarning so it does not clutter the cell outputs.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
from deepsearch.core.client import DeepSearchKeyAuth, DeepSearchConfig
from deepsearch.cps.client.api import CpsApi, CpsApiClient

from deepsearch.cps.client.components.elastic import (
    ElasticDataCollectionSource,
    ElasticProjectDataCollectionSource,
)
from deepsearch.cps.queries import DataQuery

## CPS Authentication

In [3]:
# JSON file holding the CPS credentials; it must provide the "email" and
# "api_key" entries that are read below.
auth_filename = "cps-auth.json"

# Read the credentials inside a context manager so the file handle is closed
# as soon as the JSON has been parsed (a bare open() would leave it open).
with open(auth_filename) as auth_file:
    auth_data = json.load(auth_file)

# Key-based authentication built from the loaded credentials.
auth = DeepSearchKeyAuth(
    username=auth_data["email"],
    api_key=auth_data["api_key"],
)

config = DeepSearchConfig(
    host="https://cps.foc-deepsearch.zurich.ibm.com", # IBM internal system
    auth=auth,
)

# `api` is the entry point used by the cells that follow.
client = CpsApiClient(config)
api = CpsApi(client)

## List data collections

See also the CLI command:
```console
$ deepsearch cps elastic-data list
```

In [4]:


# Retrieve the data collections and render them as a flattened table.
collections = api.elastic.list()

# Each collection is converted to a plain dict before normalisation.
collection_rows = list(map(dict, collections))
df_collections = pd.json_normalize(collection_rows)
display(df_collections)

Unnamed: 0,instance,index,display_name,aliases,documents,health,metadata
1,default,annual-report,Annual Reports metadata,[.production],48883,green,description='Annual reports metadata organized...
2,default,arxiv,arXiv full documents,"[.production, arxiv]",1820368,green,description='arXiv® is a curated research-shar...
3,default,arxiv-abstract,arXiv abstracts,"[.production, arxiv-abstract]",2027813,green,description='arXiv® is a curated research-shar...
4,default,crossref,Crossref,"[.production, crossref]",129007816,green,description='Crossref is an official digital o...
5,default,patent,Patents from USPTO,"[.production, patent]",10557545,green,description='Patents from the US Patent and Tr...
6,default,pubmed,PubMed,"[.production, pubmed]",4207965,green,description='PubMed® comprises more than 32 mi...
7,default,semantic-scholar,Semantic Scholar Open Research Corpus metadata,"[.production, semantic-scholar]",203627381,green,"description='Semantic Scholar is a free, AI-po..."
8,materials,chemrxiv,ChemRxiv,"[.production, chemrxiv]",8972,green,"description=""ChemRxiv (pronounced 'chem-archiv..."


## Query data collection

See also the CLI command:
```console
$ deepsearch query data-query
```

In [5]:
# Run a "bromide" query against the "pubchem" index of the "materials" instance.
# NOTE(review): source=["can_smiles"] presumably limits the returned fields to
# that one attribute -- confirm against the DataQuery documentation.
pubchem_source = ElasticDataCollectionSource(elastic_id="materials", index_key="pubchem")
query = DataQuery("bromide", source=["can_smiles"], coordinates=pubchem_source)
results = api.queries.run(query)

# Summarise the hit count and the aggregated size (scaled by 1024**2 for the MB figure).
outputs = results.outputs
record_count = outputs["data_count"]
total_size_mb = outputs["data_aggs"]["deepsearch_total_size"]["value"] / 1024**2
display(Markdown(f"The query found {record_count} records, for a total of {total_size_mb:.2f} MB."))

# Show the returned records as a flattened table.
display(pd.json_normalize(outputs["data_outputs"]))

The query found 1071 records, for a total of 3.86 MB.

Unnamed: 0,_index,_type,_id,_score,sort,_source.can_smiles
0,pubchem-20220223,_doc,00238fee2b4c544f740624796ef38886ca623cd36e64b1...,,[00238fee2b4c544f740624796ef38886ca623cd36e64b...,[Br[Au]]
1,pubchem-20220223,_doc,00860205b7bdd8046614885bf53a61511ef7ae5e9c7e7a...,,[00860205b7bdd8046614885bf53a61511ef7ae5e9c7e7...,[CN1C2=C(N=C1Br)N(C(=O)N(C2=O)C)C]
2,pubchem-20220223,_doc,012ea7b069acb799816a2570f2c585d854ead82ddab974...,,[012ea7b069acb799816a2570f2c585d854ead82ddab97...,[C[P+](C)(C)C.[Br-]]
3,pubchem-20220223,_doc,01db85d0966befb8c72ca6a96edf47c4e4540655bdfa7e...,,[01db85d0966befb8c72ca6a96edf47c4e4540655bdfa7...,[CCCCCCCCCCCCCC[N+](C)(C)C.CCCCCCCCCCCCCC[N+](...
4,pubchem-20220223,_doc,01fbf55d21f59e8f8f27e0ca2b4eb9fcb9e7f6d72e9acd...,,[01fbf55d21f59e8f8f27e0ca2b4eb9fcb9e7f6d72e9ac...,[CCCC[N+]1(C2CC(CC1C3C2O3)OC(=O)C(CO)C4=CC=CC=...
5,pubchem-20220223,_doc,02077e17da74844d73080ce01b6b177cbe2e7a03c89c29...,,[02077e17da74844d73080ce01b6b177cbe2e7a03c89c2...,[CC(=O)OC1CC2CCC3C(C2(CC1[N+]4(CCCCC4)C)C)CCC5...
6,pubchem-20220223,_doc,022dcaabd91950fd273f219ed8e27675fe7b57a4b6c595...,,[022dcaabd91950fd273f219ed8e27675fe7b57a4b6c59...,[CC[N+](C)(CC)C1=CC(=CC=C1)O]
7,pubchem-20220223,_doc,0258177156a9969969739bb1fd01f8fa55490e292db988...,,[0258177156a9969969739bb1fd01f8fa55490e292db98...,[C(Br)Br]
8,pubchem-20220223,_doc,02b3f1a49826082d115cd88271a7a7aa2b88ed6f0b4f7e...,,[02b3f1a49826082d115cd88271a7a7aa2b88ed6f0b4f7...,[C[N+](C)(C)C.I[I-]I]
9,pubchem-20220223,_doc,0308aa8e5559269263deb28b3f225e2cc80fcee00b5f19...,,[0308aa8e5559269263deb28b3f225e2cc80fcee00b5f1...,[CC(C)[N+]1(C2CCC1CC(C2)OC(=O)C(CO)C3=CC=CC=C3...


### Paginate all results

In [6]:
# Same "bromide" query as a paginated run.
# NOTE(review): limit=50 presumably sets the number of records per page -- confirm.
pubchem_source = ElasticDataCollectionSource(elastic_id="materials", index_key="pubchem")
query = DataQuery("bromide", source=["can_smiles"], limit=50, coordinates=pubchem_source)
cursor = api.queries.run_paginated_query(query)

# Walk the cursor page by page, accumulating every page's records in one list.
all_results = []
for page_number, result_page in enumerate(cursor, start=1):
    print('Fetching page', page_number)
    all_results += result_page.outputs["data_outputs"]

print(f'Finished fetching all data. Total is {len(all_results)} records.')
display(pd.json_normalize(all_results))

Fetching page 1
Fetching page 2
Fetching page 3
Fetching page 4
Fetching page 5
Fetching page 6
Fetching page 7
Fetching page 8
Fetching page 9
Fetching page 10
Fetching page 11
Fetching page 12
Fetching page 13
Fetching page 14
Fetching page 15
Fetching page 16
Fetching page 17
Fetching page 18
Fetching page 19
Fetching page 20
Fetching page 21
Fetching page 22
Fetching page 23
Fetching page 24
Finished fetching all data. Total is 1071 records.


Unnamed: 0,_index,_type,_id,_score,sort,_source.can_smiles
0,pubchem-20220223,_doc,00238fee2b4c544f740624796ef38886ca623cd36e64b1...,,[00238fee2b4c544f740624796ef38886ca623cd36e64b...,[Br[Au]]
1,pubchem-20220223,_doc,00860205b7bdd8046614885bf53a61511ef7ae5e9c7e7a...,,[00860205b7bdd8046614885bf53a61511ef7ae5e9c7e7...,[CN1C2=C(N=C1Br)N(C(=O)N(C2=O)C)C]
2,pubchem-20220223,_doc,012ea7b069acb799816a2570f2c585d854ead82ddab974...,,[012ea7b069acb799816a2570f2c585d854ead82ddab97...,[C[P+](C)(C)C.[Br-]]
3,pubchem-20220223,_doc,01db85d0966befb8c72ca6a96edf47c4e4540655bdfa7e...,,[01db85d0966befb8c72ca6a96edf47c4e4540655bdfa7...,[CCCCCCCCCCCCCC[N+](C)(C)C.CCCCCCCCCCCCCC[N+](...
4,pubchem-20220223,_doc,01fbf55d21f59e8f8f27e0ca2b4eb9fcb9e7f6d72e9acd...,,[01fbf55d21f59e8f8f27e0ca2b4eb9fcb9e7f6d72e9ac...,[CCCC[N+]1(C2CC(CC1C3C2O3)OC(=O)C(CO)C4=CC=CC=...
...,...,...,...,...,...,...
1066,pubchem-20220223,_doc,ff487d7290ffe42f2ecc6161680f6060680f9585b007ff...,,[ff487d7290ffe42f2ecc6161680f6060680f9585b007f...,[CCCC[N+]1=CC=CC(=C1)C.[Br-]]
1067,pubchem-20220223,_doc,ff805620597e92258a4fdf2324268ecb9704a8d0600924...,,[ff805620597e92258a4fdf2324268ecb9704a8d060092...,[CN(C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)C]
1068,pubchem-20220223,_doc,ff909fb7f274ca25567ee71757727cded1dea3f2530271...,,[ff909fb7f274ca25567ee71757727cded1dea3f253027...,[[Br-].[Br-].[Br-].[Br-]]
1069,pubchem-20220223,_doc,ffa66bbb09aca87a7fbf9b1be6de7599c1f9fa464050b2...,,[ffa66bbb09aca87a7fbf9b1be6de7599c1f9fa464050b...,[CCCCCCCC[N+](C)(C)C.[Br-]]
