In [1]:
import json
import deepsearch as ds

## Authentication

In this example, we initialize the Deep Search client from the credentials
contained in the file `cps-auth.json`.

In [2]:
# Build an authenticated Deep Search API handle from the credentials file.
auth_filename = "cps-auth.json"  # this file should be populated with the credentials

# Read the credentials inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(auth_filename) as auth_file:
    auth_data = json.load(auth_file)

# The credentials file must provide the "email" and "api_key" entries.
auth = ds.DeepSearchKeyAuth(
    username=auth_data["email"],
    api_key=auth_data["api_key"],
)

config = ds.DeepSearchConfig(
    # the host of the Deep Search instance you are using
    host="https://deepsearch-experience.res.ibm.com",

    # SSL certificate validation can be configured through verify_ssl.
    # NOTE(review): presumably verify_ssl=False is what skips validation — confirm
    # against the toolkit documentation before uncommenting and changing the value.
    # verify_ssl=True,

    # auth credentials
    auth=auth,
)

# `api` is the handle passed to the conversion helpers in the following cells.
client = ds.CpsApiClient(config)
api = ds.CpsApi(client)

## Convert your first file


In the next block we convert our first document. As an input, we use a URL to a PDF file.


In [3]:
# Key of the Deep Search project that receives the conversion task
# (placeholder value: replace it with your own project key).
PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"

# Convert one PDF that is referenced by its URL.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    urls="https://arxiv.org/pdf/2206.00785.pdf",
    progress_bar=True,
)

Submitting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.50s/it]
Converting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:22<00:00, 22.43s/it]


In [4]:
# The object returned by the conversion can now be used directly.

# Download every converted document into a local result directory.
result_dir = "./result_dir"
documents.download_all(
    result_dir=result_dir,
    progress_bar=True,
)

Downloading result: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.20s/it]


In [5]:
# We can also generate a report about the conversion task.
# The returned summary is printed below as a plain dictionary.
info = documents.generate_report(result_dir=result_dir)
print(info)

{'Total online documents': 1, 'Successfully converted documents': 1}


## More convert options

The Deep Search toolkit provides helper functions that can convert documents from different types of input:

- From a single url
- From a list of urls. In this case, the toolkit will launch a batch processing with all tasks.
- From a local PDF file
- From a local zip archive containing PDF files.
- From a local folder containing PDF files. In this case, the toolkit packages the files into batches and creates multiple zip archives.


In [6]:
# Convert several documents at once by passing a list of URLs.

PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    urls=[
        "https://arxiv.org/pdf/2206.00785.pdf",
        "https://arxiv.org/pdf/2206.01062.pdf",
    ],
    progress_bar=True,
)

Submitting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.79it/s]
Converting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:54<00:00, 27.03s/it]


In [7]:
# As before, the documents object can download all JSON files in one call.
# It can also be iterated to handle each converted document on its own.
for doc in documents:
    # URL of the converted document
    url_json = doc.url_json
    # Download the JSON of this single document
    doc.download(
        result_dir=result_dir,
        progress_bar=True,
    )

Downloading result: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]
Downloading result: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.11s/it]


In [8]:
# Convert a single PDF stored on the local filesystem.
# NOTE(review): source_path is an absolute path from the author's machine;
# point it at a PDF that exists in your own environment before running.

PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path="/Users/mis/Downloads/IEEE_CLOUD_2022_paper_20.pdf",
    progress_bar=True,
)

Processing input:   : 100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 173.31it/s]
Submitting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.09s/it]
Converting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:26<00:00, 26.93s/it]


In [9]:
# Convert all files found in a local folder.
# NOTE(review): source_path is an absolute path from the author's machine;
# point it at a folder that exists in your own environment before running.

PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path="/Users/mis/LM/Documents/pdf_docs/test/",
    progress_bar=True,
)

Processing input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 61.87it/s]
Submitting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.57s/it]
Converting input:   : 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:21<00:00, 10.58s/it]


In [10]:
# Generate and print a report for the `documents` object currently in scope,
# i.e. the result of the most recently executed conversion cell.
info = documents.generate_report(result_dir)
print(info)

{'Total files (pdf+zip)': 3, 'Total batches': 2, 'Successfully converted batches': 2}
