In [1]:
import json
import deepsearch as ds

# IBM Deep Search Document Conversion
# Minimum Working Example

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more information or a step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

In [None]:
# Minimum working example: authenticate, convert one local file, write a
# report and download the converted results.

# Deep Search instance to talk to and the project key passed to the conversion call.
host = "https://deepsearch-experience.res.ibm.com"
proj = "1234567890abcdefghijklmnopqrstvwyz123456"

# Replace both placeholders with your own credentials before running.
username = "<fill-in-your-username>"
api_key = "<fill-in-your-api-key>"

# Build the API handle step by step: key auth -> config -> client -> api.
auth = ds.DeepSearchKeyAuth(username=username, api_key=api_key)
config = ds.DeepSearchConfig(host=host, auth=auth)
client = ds.CpsApiClient(config)
api = ds.CpsApi(client)

# Single output location shared by the report and the downloaded results.
result_dir = "./converted_docs"

documents = ds.convert_documents(
    api=api,
    proj_key=proj,
    # Placeholder: replace with the path of the file you want to convert.
    # It is quoted so that the cell is valid Python even before it is filled in.
    source_path="<path-to-file>",
    progress_bar=True,
)

# Write a report about the conversion task and print the returned summary.
info = documents.generate_report(result_dir=result_dir)
print(info)

documents.download_all(result_dir=result_dir)

---

# There's more! 

The Deep Search Toolkit provides utility functions which can convert documents from different types of input:

- From a single url
- From a list of urls. In this case, the toolkit launches a batch job that processes all of them.
- From a local PDF file
- From a local zip archive containing PDF files.
- From a local folder containing PDF files. In this case, the toolkit packages the files into batches and creates multiple zip archives.


---

# Let's explore document conversion

### Authentication via stored credentials

In this example, we initialize the Deep Search client from the credentials
contained in the file `cps-auth.json`. See an example file [here](https://github.com/DS4SD/deepsearch-toolkit/blob/main/examples/notebooks/cps-auth.example.json).

In [2]:
# Load the credentials from a local JSON file and build the Deep Search API handle.
auth_filename = "cps-auth.json"  # this file should be populated with the credentials

# Use a context manager so the file handle is closed as soon as parsing is done.
with open(auth_filename) as auth_file:
    auth_data = json.load(auth_file)

# The credentials file is expected to provide the "email" and "api_key" entries.
auth = ds.DeepSearchKeyAuth(
    username=auth_data["email"],
    api_key=auth_data["api_key"],
)

config = ds.DeepSearchConfig(
    # the host of the Deep Search instance you are using
    host="https://deepsearch-experience.res.ibm.com",

    # if needed, the validation of the SSL certificate can be skipped by
    # uncommenting the line below (validation is presumably on by default — confirm)
    # verify_ssl=False,

    # auth credentials
    auth=auth,
)

client = ds.CpsApiClient(config)
api = ds.CpsApi(client)

In [3]:
# Key of the Deep Search project used by every conversion call in this notebook.
PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"

## Single URL

In [4]:
# Convert one online document, referenced by a single URL string.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    urls="https://arxiv.org/pdf/2206.00785.pdf",
    progress_bar=True,
)

Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:01<00:00,  1.67s/it][38;2;15;98;254m                                                             [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:29<00:00, 29.13s/it][38;2;15;98;254m                                                             [0m


In [5]:
# Inspect the outcome of the conversion task: a CSV report is generated and
# stored locally under `result_dir`, and the returned summary is printed.
result_dir = "./converted_docs/"
info = documents.generate_report(result_dir=result_dir)
print(info)

{'Total online documents': 1, 'Successfully converted documents': 1}


The saved report may help in debugging and analysing the conversion task.

In [6]:
# Fetch every converted document into `result_dir`, showing a progress bar.
documents.download_all(
    result_dir=result_dir,
    progress_bar=True,
)

Downloading result:   : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00,  1.02it/s][38;2;15;98;254m                                                             [0m


In [7]:
# Besides the results, `documents` also exposes per-task metadata such as
# the conversion statuses and the task ids (displayed here as a tuple).
(documents.statuses, documents.task_ids)

(['SUCCESS'], ['1766eaec-1caf-4952-8d88-0f478bb4eba9'])

## Multiple URLs

In [8]:
# The online documents we want to convert in one go.
urls = [
    "https://arxiv.org/pdf/2206.00785.pdf",
    "https://arxiv.org/pdf/2206.01062.pdf",
]

In [9]:
# Convert several online documents at once by passing the list of URLs.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    urls=urls,
    progress_bar=True,
)

Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 2/2 [00:01<00:00,  1.23it/s][38;2;15;98;254m                                                             [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 2/2 [00:44<00:00, 22.10s/it][38;2;15;98;254m                                                             [0m


In [10]:
# `documents.download_all(...)` would work here as before; alternatively the
# results can be iterated over and downloaded one document at a time.
for doc in documents:
    doc.download(
        result_dir=result_dir,
        progress_bar=True,
    )

Downloading result:   : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:01<00:00,  1.59s/it][38;2;15;98;254m                                                             [0m
Downloading result:   : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:01<00:00,  1.00s/it][38;2;15;98;254m                                                             [0m


## Process local file

In [11]:
# Convert a single local PDF file.
# NOTE: the path below is specific to the author's machine; point it to a
# PDF file that exists on your own system before running this cell.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path="/Users/mis/Downloads/IEEE_CLOUD_2022_paper_20.pdf",
    progress_bar=True,
)

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 146.39it/s][38;2;15;98;254m                                                            [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:04<00:00,  4.32s/it][38;2;15;98;254m                                                             [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:21<00:00, 21.62s/it][38;2;15;98;254m                                                             [0m


## Process folder of files

In [12]:
# Convert every file found in a local folder.
# NOTE: the path below is specific to the author's machine; point it to a
# folder of PDF files on your own system before running this cell.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path="/Users/mis/LM/Documents/pdf_docs/test/",
    progress_bar=True,
)

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 2/2 [00:00<00:00, 61.75it/s][38;2;15;98;254m                                                             [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 2/2 [00:06<00:00,  3.10s/it][38;2;15;98;254m                                                             [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 2/2 [00:31<00:00, 15.65s/it][38;2;15;98;254m                                                             [0m


In [13]:
# Generate the report for the folder conversion and print the returned summary.
# `result_dir` is passed by keyword, matching the other `generate_report`
# calls in this notebook, so the call does not depend on parameter order.
info = documents.generate_report(result_dir=result_dir)
print(info)

{'Total files (pdf+zip)': 3, 'Total batches': 2, 'Successfully converted batches': 2}


In [14]:
# Fetch all converted documents of the folder run into `result_dir`.
documents.download_all(
    result_dir=result_dir,
    progress_bar=True,
)

Downloading result:   : 100%|[38;2;15;98;254m██████████████████████████████[0m| 3/3 [00:03<00:00,  1.06s/it][38;2;15;98;254m                                                             [0m
