# Manage index item attachments

This example explores how index item attachments can be managed.

Specifically, it uses a project data index of type "Generic"; note, however, that any
other type of project data index can be used just as well.

### Access required

The content of this notebook requires access to Deep Search capabilities which are not
available on the public access system.

[Contact us](https://ds4sd.github.io) if you are interested in exploring
these Deep Search capabilities.

### Set notebook parameters

In [None]:
from dsnotebooks.settings import ProjectNotebookSettings
import tempfile

# notebook settings auto-loaded from .env / env vars
notebook_settings = ProjectNotebookSettings()

PROFILE_NAME = notebook_settings.profile  # profile to use
PROJ_KEY = notebook_settings.proj_key  # project to use
INDEX_NAME = notebook_settings.new_idx_name  # index to create
CLEANUP = notebook_settings.cleanup  # whether to clean up
ATTACHMENT_KEY = "usr_attachments"  # format must be: "usr_<snake_case>"
# files that get attached to an index item later in this notebook
FILES_TO_ATTACH = [
    "../../data/samples/2206.00785.pdf",
    "../../data/samples/2206.01062.pdf",
]
# scratch directory for the JSON file uploaded to the index;
# it is removed in the cleanup step when CLEANUP is set
TMP_DIR = tempfile.TemporaryDirectory()

# seconds to sleep after a change so that it becomes visible to search
WAIT_S = 3

### Import example dependencies

In [None]:
import hashlib
import json
import random
import time
from pathlib import Path

import pandas as pd
from IPython.display import display, HTML
from slugify import slugify

import deepsearch as ds
from deepsearch.cps.queries import DataQuery

### Define helper functions

In [None]:
def find_index_item(
    api, coordinates, search_query="*", source=None, page_size=50, pred=None
):
    """Return the first index item accepted by `pred`, or None if there is none.

    Args:
        api: API client; its `queries.run_paginated_query` is used to run the query.
        coordinates: coordinates of the data index to search in.
        search_query: query string passed to the data query (default "*").
        source: fields to request for each item; when None, the fields
            "_id", "_name" and "_s3_data" are requested.
        page_size: passed as the query `limit`.
        pred: optional callable taking an item and returning a truthy value
            for a match; when None, the very first item is returned.
    """
    if source is None:
        fields = ["_id", "_name", "_s3_data"]
    else:
        fields = source

    pages = api.queries.run_paginated_query(
        DataQuery(
            search_query=search_query,
            source=fields,
            limit=page_size,
            coordinates=coordinates,
        )
    )

    # lazily walk all items page by page, so that iteration stops at the first match
    candidates = (item for page in pages for item in page.outputs["data_outputs"])
    return next(
        (item for item in candidates if pred is None or pred(item)),
        None,
    )


def list_item_attachments(api, coordinates, index_item_id, attch_key):
    """Return the attachment data stored under `attch_key` for the given index item.

    Args:
        api: API client, forwarded to `find_index_item`.
        coordinates: coordinates of the data index containing the item.
        index_item_id: value of the item's "_id" field.
        attch_key: attachment key to read from the item's "_s3_data" entry.

    Returns:
        The value found at item["_source"]["_s3_data"][attch_key].

    Raises:
        LookupError: if no item with the given id is found in the index.
        KeyError: if the item has no data under `attch_key`.
    """

    def has_requested_id(item):
        return item["_id"] == index_item_id

    item = find_index_item(api, coordinates, pred=has_requested_id)
    if item is None:
        # find_index_item returns None when nothing matches; fail with a clear
        # message instead of a TypeError raised by subscripting None
        raise LookupError(f"Index item not found: {index_item_id!r}")
    return item["_source"]["_s3_data"][attch_key]

### Connect to Deep Search

In [None]:
# create the API client from the environment, using the profile set in PROFILE_NAME
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

### Create project data index

In [None]:
# create a new project data index of type "Generic" and keep its index key,
# which is used later to look the index up again
print(f"{INDEX_NAME=}")
data_index = api.data_indices.create(proj_key=PROJ_KEY, name=INDEX_NAME, type="Generic")
index_key = data_index.source.index_key
print(f"{index_key=}")

### Prepare data to upload to index

In [None]:
# header row: six generic column names
headers = [f"FIELD {i}" for i in range(6)]
# characters the random cell values are drawn from
set1 = ["a", "b", "c", "d", "e", "1", "2", "3", "4"]

# the table starts with the header row, followed by four rows of random data;
# each cell is a 4-character string sampled without repetition from set1
table = [headers]
for _ in range(4):
    row = ["".join(random.sample(set1, 4)) for _ in headers]
    table.append(row)

In [None]:
# the first table row holds the column names, the remaining rows hold the data
df_raw = pd.DataFrame(table[1:], columns=table[0])
display(df_raw)

In [None]:
# slugify every column name (using "_" as separator) and print each mapping
column_renames = {}
for col_name in df_raw.columns:
    new_name = slugify(col_name, separator="_")
    print(f"{col_name} --> {new_name}")
    column_renames[col_name] = new_name

# a single rename() call returns a renamed copy and leaves df_raw untouched,
# so neither an explicit copy() nor one in-place rename per column is needed
df = df_raw.rename(columns=column_renames)
display(df)

### Upload data to index

In [None]:
# turn the DataFrame into a list of row dicts and enrich each of them with the
# extra metadata needed for the Deep Search upload
file_data = df.to_dict(orient="records")
for i, row in enumerate(file_data):
    # drop any existing "file-info" entry so that it is not part of the hash
    row.pop("file-info", None)
    # hash of the row content, serialized with sorted keys
    serialized = json.dumps(row, sort_keys=True).encode()
    h = hashlib.sha1(serialized).hexdigest()
    row.update(
        {
            "_name": f"row-{i:06d}-{h[:5]}",
            "file-info": {"document-hash": h},
        }
    )

In [None]:
# upload data to index
input_dir = Path(TMP_DIR.name)
input_filename = input_dir / "upload.json"
# write through a context manager so that the file is flushed and closed
# before it is handed over to the upload
with input_filename.open("w") as upload_file:
    json.dump(file_data, upload_file)

task = api.data_indices.upload(coords=data_index.source, source=input_filename)
# wait for the upload task before continuing
api.tasks.wait_for(PROJ_KEY, task.task_id)

In [None]:
time.sleep(WAIT_S)  # pause for WAIT_S seconds to allow recent changes to become visible to search

### Add attachments to an index item

In [None]:
# define item where to attach
item_name = row["_name"]  # last item iterated used as example
print(f"{item_name=}")
pred = lambda x: x["_source"]["_name"] == item_name
index_item = find_index_item(api, data_index.source, pred=pred)
if index_item is None:
    # find_index_item returns None when nothing matches; fail with a clear
    # message instead of a TypeError raised by subscripting None
    raise RuntimeError(
        f"Index item {item_name!r} not found; it may not be visible to search yet "
        "(consider increasing WAIT_S)"
    )
idx_item_id = index_item["_id"]
print(f"{idx_item_id=}")

In [None]:
# add attachment to index item
# look the index up by its key among the data indices of the project
indices = api.data_indices.list(proj_key=PROJ_KEY)
index = next((x for x in indices if x.source.index_key == index_key), None)
if index is None:
    # fail with a clear message instead of an AttributeError on None below
    raise RuntimeError(
        f"Data index with key {index_key!r} not found in project {PROJ_KEY!r}"
    )
for file_to_attach in FILES_TO_ATTACH:
    index.add_item_attachment(
        api=api,
        index_item_id=idx_item_id,
        attachment_path=file_to_attach,
        attachment_key=ATTACHMENT_KEY,
    )
    print(f'File "{file_to_attach}" attached.')

In [None]:
time.sleep(WAIT_S)  # pause for WAIT_S seconds to allow recent changes to become visible to search

### List item attachments

In [None]:
# get the attachment data stored under ATTACHMENT_KEY for the selected index item
attch_list = list_item_attachments(
    api=api,
    coordinates=data_index.source,
    index_item_id=idx_item_id,
    attch_key=ATTACHMENT_KEY,
)

In [None]:
# browse attachments: render one HTML download link per attachment
for attachment in attch_list:
    filename = Path(attachment["path"]).name  # keep only the last path component
    download_url = attachment["url"]
    display(HTML(f'&#128073; Download <a href="{download_url}">{filename}</a>'))
    print()  # empty line after each link

> Note that attachment filenames are slugified.

### Cleanup

In [None]:
# optional teardown, performed only when CLEANUP is set: delete the data index
# created by this notebook and remove the local temporary directory
if CLEANUP:
    api.data_indices.delete(data_index.source)
    print("Data index deleted")
    TMP_DIR.cleanup()
    print("Temporary directory deleted")