In [1]:
import json
from pathlib import Path
import deepsearch as ds
from deepsearch.documents.core.models import ConversionSettings, DefaultConversionModel, ProjectConversionModel, \
    OCRSettings

# IBM Deep Search Document Conversion

## Getting started

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more information or a step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

⚠️ Before running this notebook, generate the file `../../ds-auth.json` by running:
```shell
deepsearch login --output ../../ds-auth.json
```
More details in the [docs](https://ds4sd.github.io/deepsearch-toolkit/getting_started/#authentication).

In [2]:
# --- Connection settings ---------------------------------------------------
# `host` is kept for reference; nothing else in this cell reads it.
host = "https://deepsearch-experience.res.ibm.com"
# Key of the project whose conversion settings are loaded and used below.
proj = "1234567890abcdefghijklmnopqrstvwyz123456"

# Authentication file. Create it with
#     deepsearch login --output ../../ds-auth.json
# or start from the template ../../ds-auth.json.example
config_file = Path("../../ds-auth.json")

# Build the API handle step by step: config -> client -> api.
config = ds.DeepSearchConfig.parse_file(config_file)
client = ds.CpsApiClient(config)
api = ds.CpsApi(client)

# --- Conversion pipeline ---------------------------------------------------
# Start from the project's own settings, then override the OCR options.
cs = ConversionSettings.from_project(api, proj_key=proj)

cs.ocr.enabled = True  # set to False to disable OCR
# Optional (left disabled): pick how OCR cells are treated when mixed with
# programmatic content.
#     cs.ocr.merge_mode = "prioritize-ocr"
# Optional (left disabled): list the available OCR backends.
#     backends = OCRSettings.get_backends(api)
cs.ocr.backend = "alpine-ocr"  # OCR backend to use

# --- Convert, download, report ---------------------------------------------
# Same directory receives the downloaded results and is used for the report.
result_dir = "./converted_docs"

documents = ds.convert_documents(
    api=api,
    proj_key=proj,
    source_path="../../data/samples/2206.01062.pdf",
    conversion_settings=cs,
    progress_bar=True,
)
documents.download_all(result_dir=result_dir)
info = documents.generate_report(result_dir=result_dir)
print(info)

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 51.03it/s][38;2;15;98;254m                                                                                                    [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:06<00:00,  6.26s/it][38;2;15;98;254m                                                                                                    [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:33<00:00, 33.69s/it][38;2;15;98;254m                                                                                                    [0m


{'Total documents': 1, 'Successfully converted documents': 1}


---