# Bring Your Own PDFs

In this example we combine the document conversion capabilities of Deep Search with its data query capabilities.
From the Deep Search Workspace, we create a new project data index which can host our own PDF documents.
Once the upload is completed, we will be able to query the documents, similar to the public data which we
explored in the [Data query quick start example](../data_query_quick_start/). 
In the last steps of the example, we additionally export the converted documents as JSON files.


Sections
1. [Create data index and upload data](#Create-data-index-and-upload-data)
2. [Query your data](#Query-your-data)
3. [Download your data](#Download-your-data)
4. Custom upload settings
    1. [Enable OCR](#Enable-OCR)
    2. [Enable raw PDF cells](#Enable-raw-PDF-cells)


### Access required

The content of this notebook requires access to Deep Search capabilities which are not
available on the public access system.

[Contact us](https://ds4sd.github.io) if you are interested in exploring
these Deep Search capabilities.

### Set notebook parameters

In [None]:
from dsnotebooks.settings import ProjectNotebookSettings
from pathlib import Path

# Settings are resolved automatically from the .env file / environment variables
notebook_settings = ProjectNotebookSettings()

# Profile to use
PROFILE_NAME = notebook_settings.profile
# Project to use
PROJ_KEY = notebook_settings.proj_key
# Name of the data index to create
INDEX_NAME = notebook_settings.new_idx_name
# Whether to clean up the created resources at the end
CLEANUP = notebook_settings.cleanup

# Local inputs uploaded by the cells below
INPUT_FILES_FOLDER = Path("../../data/samples/")
INPUT_OCR_FILE = Path("../../data/scanned-samples/2206.00785-7.png")

############
# Resources to clean up at the end of the execution (if CLEANUP=True)
_GARBAGE_COLLECTOR = []

print(f"The example will be executed on the Deep Search instance {PROFILE_NAME}")

### Import example dependencies

In [None]:
# Import standard dependencies
from copy import deepcopy
import json
from tqdm.notebook import tqdm
import pandas as pd
import tempfile

# IPython utilities
from IPython.display import display, Markdown, HTML

# Import the deepsearch-toolkit
import deepsearch as ds
from deepsearch.documents.core.export import export_to_markdown
from deepsearch.cps.queries import DataQuery
from deepsearch.cps.data_indices import utils as data_indices_utils

### Connect to Deep Search

In [None]:
# Create the Deep Search API client for the profile selected in the notebook parameters
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

---

### Create data index and upload data

In [None]:
# Create the project data index which will host the uploaded documents
data_index = api.data_indices.create(name=INDEX_NAME, proj_key=PROJ_KEY)
index_key = data_index.source.index_key

# Remember the index, so that the Cleanup cell can delete it
_GARBAGE_COLLECTOR.append(data_index)

In [None]:
# Upload the local sample files into the new data index, where they get converted
data_indices_utils.upload_files(
    api=api,
    local_file=INPUT_FILES_FOLDER,
    coords=data_index.source,
)

In [None]:
# Show where the uploaded data can be browsed in the Deep Search UI
library_message = f"The data is now available. You can query it programmatically (see next section) or access it via the Deep Search UI at <br />{api.client.config.host}/projects/{PROJ_KEY}/library/private/{index_key}"
display(Markdown(library_message))

---

### Query your data

In [None]:
# Count the documents in the data index.
# limit=0 requests no documents back; only the total count is read from the outputs.
query = DataQuery(
    search_query="*",
    source=[""],
    limit=0,
    coordinates=data_index.source,
)
query_results = api.queries.run(query)

num_results = query_results.outputs["data_count"]
print(f"The data index contains {num_results} entries.")

In [None]:
# Find documents matching query
search_query = "speedup"
query = DataQuery(
    search_query,
    source=["file-info.filename", "description.title", "description.authors"],
    coordinates=data_index.source,
)

# The paginated cursor below fetches all results, so the query is not
# additionally executed with `api.queries.run` (its output would be unused).
all_results = []
cursor = api.queries.run_paginated_query(query)
for result_page in tqdm(cursor):
    # Iterate through the results of a single page, and add to the total list
    for row in result_page.outputs["data_outputs"]:
        # Default to an empty description, in case no title and authors are detected
        metadata = row["_source"].get("description", {})
        # Add row to results table
        all_results.append(
            {
                "Filename": row["_source"]["file-info"]["filename"],
                "Title": metadata.get("title", ""),
                "Authors": ", ".join(
                    [author["name"] for author in metadata.get("authors", [])]
                ),
            }
        )

num_results = len(all_results)
print(f"Finished fetching all data. Total is {num_results} records.")

In [None]:
# Build a table from the collected results and render its first rows
df = pd.json_normalize(all_results)

results_heading = Markdown(
    f"#### Results\nDocuments matching the search query '{search_query}':"
)
display(results_heading)

results_table = df.head().to_html(render_links=True)
display(HTML(results_table))

---

### Download your data

In [None]:
# Query all documents of the data index, requesting all fields of each document
query = DataQuery(search_query="*", source=["*"], coordinates=data_index.source)
cursor = api.queries.run_paginated_query(query)

# Using a temp dir for demo purposes; to persist instead, set output dir accordingly
temp_dir = tempfile.TemporaryDirectory()
output_dir = temp_dir.name

# Iterate through query results and export each document as JSON and Markdown.
# The documents themselves are not printed: with source=["*"] each row holds
# the full converted document, which would flood the notebook output.
all_results = []
for result_page in tqdm(cursor):
    for row in result_page.outputs["data_outputs"]:
        # Default to an empty description, in case no title and authors are detected
        metadata = row["_source"].get("description", {})

        # Download JSON file
        file_path_json = Path(output_dir) / f"{row['_id']}.json"
        with open(file_path_json, "w") as outfile:
            json.dump(row["_source"], outfile, indent=2)

        # Export JSON to Markdown
        file_path_md = Path(output_dir) / f"{row['_id']}.md"
        with open(file_path_md, "w") as outfile:
            outfile.write(export_to_markdown(row["_source"]))

        # Keep track of the exported files for the summary table
        all_results.append(
            {
                "Filename": row["_source"]["file-info"]["filename"],
                "Title": metadata.get("title", ""),
                "JSON Path": file_path_json,
                "Markdown Path": file_path_md,
            }
        )

print(f"Finished fetching all data. Total is {len(all_results)} records.")
print(f"Data downloaded in {output_dir}")

# Visualize a table listing document titles and locations
df = pd.json_normalize(all_results)
display(df)

In [None]:
# Preview the first 20 lines of the first exported document, in both formats
first_document = df.iloc[0]

with open(first_document["Markdown Path"]) as demo_file:
    # readline() returns "" once the file is exhausted, so short files are fine
    content = "".join([demo_file.readline() for _ in range(20)])

    display(Markdown("## Markdown content"))
    display(Markdown(content))

with open(first_document["JSON Path"]) as demo_file:
    content = "".join([demo_file.readline() for _ in range(20)])

    display(Markdown("## JSON content"))
    display(Markdown(f"<code>{content}</code>"))

---

## Enable OCR

This section is using the `ConversionSettings` object to enable OCR when converting PDF documents.

Refer to the [OCR settings documentation](https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/#modify-ocr-settings) for more details. 


In [None]:
from deepsearch.documents.core.models import ConversionSettings

In [None]:
# Create a separate data index for the documents converted with OCR
ocr_index_name = INDEX_NAME + "-ocr"
data_index = api.data_indices.create(name=ocr_index_name, proj_key=PROJ_KEY)

# Remember the index, so that the Cleanup cell can delete it
_GARBAGE_COLLECTOR.append(data_index)

In [None]:
# Start from the conversion settings of the project and switch OCR on
cs = ConversionSettings.from_project(api, proj_key=PROJ_KEY)
cs.ocr.enabled = True  # Enable or disable OCR

# Upload the scanned sample and convert it with the customized settings
data_indices_utils.upload_files(
    api=api,
    coords=data_index.source,
    local_file=INPUT_OCR_FILE,
    conv_settings=cs,
)

# Show where the converted document can be inspected in the Deep Search UI
ocr_message = f"#### Results\nThe data is now available. This file will now display the text from the scanned pages. Access it via the Deep Search UI at <br />{api.client.config.host}/projects/{data_index.source.proj_key}/library/private/{data_index.source.index_key}"
display(Markdown(ocr_message))

---

## Enable raw PDF cells

The document conversion pipeline produces a JSON file corresponding to each PDF document, where all document components have been grouped, classified and further inspected (e.g. table structure) for simple usage.

However, in some use cases it is convenient to rely on the raw text cells contained in the PDF document.
This is an auxiliary file that Deep Search makes available on demand.
The following section demonstrates how this is enabled.


In [None]:
from deepsearch.documents.core.models import TargetSettings

In [None]:
# Create a separate data index for the documents uploaded with raw PDF cells enabled
raw_index_name = INDEX_NAME + "-raw"
data_index = api.data_indices.create(name=raw_index_name, proj_key=PROJ_KEY)

# Remember the index, so that the Cleanup cell can delete it
_GARBAGE_COLLECTOR.append(data_index)

In [None]:
# Target settings requesting the raw pages in addition to the converted document
tsettings = TargetSettings(add_raw_pages=True)

# Upload and convert the sample documents using the custom target settings
data_indices_utils.upload_files(
    api=api,
    local_file=INPUT_FILES_FOLDER,
    coords=data_index.source,
    target_settings=tsettings,
)

In [None]:
# Query the filename and the raw pages entries of every document in the data index
query = DataQuery(
    search_query="*",
    source=["file-info.filename", "_s3_data.raw-pages"],
    coordinates=data_index.source,
)
cursor = api.queries.run_paginated_query(query)

# Collect one table row per raw page, with a link to the raw cells file
all_results = []
for result_page in cursor:
    for row in result_page.outputs["data_outputs"]:
        filename = row["_source"]["file-info"]["filename"]
        for raw_page in row["_source"]["_s3_data"]["raw-pages"]:
            all_results.append(
                {
                    "Filename": filename,
                    "Page": raw_page["page"],
                    "RAW file": f"<a target='_blank' href='{raw_page['url']}'>Link</a>",
                }
            )

# Nothing is written to disk in this cell (only links are listed), so no
# download location is reported here.
print(f"Finished fetching all data. Total is {len(all_results)} records.")
display(
    Markdown(
        "#### Results\nHere is the list of the files uploaded and the urls where to download the raw pdf cells details."
    )
)

# Visualize a table listing, for each page, the link to its raw cells file.
# escape=False keeps the <a> tags built above as clickable links.
df = pd.json_normalize(all_results)
display(HTML(df.to_html(render_links=True, escape=False)))

### Cleanup
If enabled, we will delete all the resources created in the example

In [None]:
# Delete every data index registered during the example (only if CLEANUP is set)
if CLEANUP:
    for data_index in _GARBAGE_COLLECTOR:
        index_coords = data_index.source
        api.data_indices.delete(index_coords)
        print(f"Data index {data_index.name} deleted")