# Manage index item attachments

This example explores how index item attachments can be managed.

Specifically, it uses a project data index of type "Generic"; note, however, that any
other type of project data index can be used just as well.

### Access required

The content of this notebook requires access to Deep Search capabilities which are not
available on the public access system.

[Contact us](https://ds4sd.github.io/#unlimited-access) if you are interested in exploring
these Deep Search capabilities.

### Authentication via stored credentials

⚠️ Before running this notebook, ensure you have your auth configuration in `../../ds-auth.json`.

To generate it, run the following from this notebook's directory:

```shell
deepsearch login --output ../../ds-auth.json
```

More details in the [docs](https://ds4sd.github.io/deepsearch-toolkit/getting_started/#authentication).

### Notebook parameters

The following block defines the parameters used to execute the notebook:

- `CONFIG_FILE`: location of the Deep Search configuration file
- `PROJ_KEY`: the project to use
- `INDEX_NAME`: the name for the data index to create
- `ATTACHMENT_KEY`: the key under which to add attachments
- `FILES_TO_ATTACH`: the files to attach
- `WAIT_S`: the number of seconds to wait for recent changes to become visible to search

In [1]:
import datetime
import hashlib
import html
import json
import random
import time
from pathlib import Path

import pandas as pd
from IPython.display import display, HTML
from slugify import slugify

import deepsearch as ds
from deepsearch.cps.queries import DataQuery

In [2]:
# --- input parameters (update accordingly) ---

# location of the Deep Search configuration file holding the stored credentials
CONFIG_FILE = Path("../../ds-auth.json")

# key of the project in which the example data index gets created
PROJ_KEY = "62b0587f6bde1462dfc62200e45b7f421f199460"

# name of the data index to create, suffixed with a second-resolution timestamp
INDEX_NAME = f"example_{datetime.datetime.now():%Y%m%d%H%M%S}"

# key under which attachments are added; format must be: "usr_<snake_case>"
ATTACHMENT_KEY = "usr_attachments"

# files to attach to the index item
FILES_TO_ATTACH = [
    "../../data/samples/2206.00785.pdf",
    "../../data/samples/2206.01062.pdf",
]

# pause, in seconds, that lets recent changes become visible to search
WAIT_S = 3

### Define helper functions

In [3]:
def find_index_item(api, coordinates, search_query="*", source=None, page_size=50, pred=None):
    """Return the first index item accepted by ``pred``, or ``None`` if there is none.

    A paginated ``DataQuery`` is run against ``coordinates`` and the entries found
    under ``outputs["data_outputs"]`` of every result page are examined in order.

    Args:
        api: object exposing ``queries.run_paginated_query``.
        coordinates: coordinates handed over to the ``DataQuery``.
        search_query: search string handed over to the ``DataQuery``.
        source: fields to request for each item; when ``None``, the fields
            ``_id``, ``_name`` and ``_s3_data`` are requested.
        page_size: handed over to the ``DataQuery`` as its ``limit``.
        pred: optional callable receiving an item; when ``None``, the very first
            item encountered is returned.

    Returns:
        The first accepted item, or ``None`` when no item is accepted.
    """
    if source is None:
        requested_fields = ["_id", "_name", "_s3_data"]
    else:
        requested_fields = source

    pages = api.queries.run_paginated_query(
        DataQuery(
            search_query=search_query,
            source=requested_fields,
            limit=page_size,
            coordinates=coordinates,
        )
    )

    for page in pages:
        for candidate in page.outputs["data_outputs"]:
            # skip anything the predicate rejects; no predicate means accept all
            if pred is not None and not pred(candidate):
                continue
            return candidate
    return None

def list_item_attachments(api, coordinates, index_item_id, attch_key):
    """Return the attachments stored under ``attch_key`` for one index item.

    The item is located with ``find_index_item`` by comparing each item's ``_id``
    with ``index_item_id``.

    Args:
        api: API object, passed through to ``find_index_item``.
        coordinates: coordinates of the index to search, passed through to
            ``find_index_item``.
        index_item_id: ``_id`` of the index item whose attachments are wanted.
        attch_key: key inside the item's ``_s3_data`` to read.

    Returns:
        The value found at ``item["_source"]["_s3_data"][attch_key]``.

    Raises:
        LookupError: if no index item with the given id is found.
        KeyError: if the item has no data under ``attch_key``.
    """

    def has_requested_id(item):
        return item["_id"] == index_item_id

    item = find_index_item(api, coordinates, pred=has_requested_id)
    if item is None:
        # fail with a clear message instead of subscripting None further down
        raise LookupError(f"No index item found with id {index_item_id!r}")
    return item["_source"]["_s3_data"][attch_key]

### Connect to Deep Search

In [4]:
# initialize the Deep Search client from the config file
config = ds.DeepSearchConfig.parse_file(CONFIG_FILE)
client = ds.CpsApiClient(config)
api = ds.CpsApi(client)
# host taken from the client configuration; used further below to build the
# URL of the upload endpoint
cps_api_url = api.client.swagger_client.configuration.host

### Create project data index

In [5]:
# create a new data index of type "Generic" in the project PROJ_KEY
print(f"{INDEX_NAME=}")
data_index = api.data_indices.create(proj_key=PROJ_KEY, name=INDEX_NAME, type="Generic")
# keep the index key: it is used later to look the index up among the project's indices
index_key = data_index.source.index_key
print(f"{index_key=}")

INDEX_NAME='example_20230303170009'
index_key='d1ba4412a13fea1d65ece54de5d09ae0a3c6a457'


### Prepare data to upload to index

In [6]:
# build a small table: one header row followed by four rows of random values
set1 = ["a", "b", "c", "d", "e", "1", "2", "3", "4"]
headers = [f"FIELD {i}" for i in range(6)]
table = [headers]
for _ in range(4):
    # one 4-character cell per column, made of distinct characters drawn from set1
    row = ["".join(random.sample(set1, 4)) for _ in headers]
    table.append(row)

In [7]:
# the first table row provides the column names, the remaining rows the data
df_raw = pd.DataFrame(table[1:], columns=table[0])
display(df_raw)

Unnamed: 0,FIELD 0,FIELD 1,FIELD 2,FIELD 3,FIELD 4,FIELD 5
0,4ae3,2c41,b21c,eb1c,e14a,31dc
1,3da2,cd42,3a12,ecb1,2dba,12e3
2,c42d,b2a3,ad12,de3c,ad1b,2bce
3,3a2e,3eda,23ec,ba3e,a4cd,43ec


In [8]:
# Slugify the column names.
# The complete old -> new mapping is computed first and applied in one
# non-mutating `rename` call. Renaming in place inside the loop could rename a
# column twice whenever a freshly produced slug equals the label of a column
# that has not been processed yet.
column_slugs = {col_name: slugify(col_name, separator="_") for col_name in df_raw.columns}
for col_name in df_raw.columns:
    print(f"{col_name} --> {column_slugs[col_name]}")

# `rename` returns a new DataFrame, so `df_raw` is left untouched
df = df_raw.rename(columns=column_slugs)
display(df)

FIELD 0 --> field_0
FIELD 1 --> field_1
FIELD 2 --> field_2
FIELD 3 --> field_3
FIELD 4 --> field_4
FIELD 5 --> field_5


Unnamed: 0,field_0,field_1,field_2,field_3,field_4,field_5
0,4ae3,2c41,b21c,eb1c,e14a,31dc
1,3da2,cd42,3a12,ecb1,2dba,12e3
2,c42d,b2a3,ad12,de3c,ad1b,2bce
3,3a2e,3eda,23ec,ba3e,a4cd,43ec


### Upload data to index

In [9]:
# turn the DataFrame into a list of records and enrich every record with the
# extra metadata needed for the Deep Search upload
file_data = df.to_dict(orient="records")
for i, row in enumerate(file_data):
    # the hash covers the record content only, without any "file-info" entry
    row.pop("file-info", None)
    serialized = json.dumps(row, sort_keys=True).encode()
    digest = hashlib.sha1(serialized).hexdigest()
    row.update(
        {
            "_name": f"row-{i:06d}-{digest[:5]}",
            "file-info": {"document-hash": digest},
        }
    )

In [10]:
# upload data to index
r = api.client.session.post(
    f"{cps_api_url}/project/{data_index.source.proj_key}/data_indices/{data_index.source.index_key}/actions/upload",
    files={"file": ("input.json", json.dumps(file_data))}
)
r.raise_for_status()

resp = r.json()
print(f"Uploads: {resp['success']} successful and {resp['errors']} failures.")

Uploads: 4 successful and 0 failures.


In [11]:
# allow recent changes to become visible to search before the uploaded item is
# looked up below (fixed pause of WAIT_S seconds)
time.sleep(WAIT_S)

### Add attachments to an index item

In [12]:
# define the item to attach files to: the last uploaded record is used as example.
# The name is read from `file_data` rather than from the loop variable left over
# by an earlier cell, so this cell does not depend on which cell assigned `row` last.
item_name = file_data[-1]["_name"]
print(f"{item_name=}")


def pred(item):
    """Accept the index item whose `_name` equals `item_name`."""
    return item["_source"]["_name"] == item_name


index_item = find_index_item(api, data_index.source, pred=pred)
if index_item is None:
    # fail with a clear message instead of subscripting None on the next line
    raise LookupError(
        f"Item {item_name!r} not found in index {INDEX_NAME!r}; "
        "it may not be visible to search yet (consider increasing WAIT_S)"
    )
idx_item_id = index_item["_id"]
print(f"{idx_item_id=}")

item_name='row-000003-78241'
idx_item_id='mxUzqIYB2g1yadKIxiRq'


In [13]:
# add attachment to index item
indices = api.data_indices.list(proj_key=PROJ_KEY)
index = next((x for x in indices if x.source.index_key == index_key), None)
for file_to_attach in FILES_TO_ATTACH:
    index.add_item_attachment(
        api=api,
        index_item_id=idx_item_id,
        attachment_path=file_to_attach,
        attachment_key=ATTACHMENT_KEY,
    )
    print(f'File "{file_to_attach}" attached.')

File "../../data/samples/2206.00785.pdf" attached.
File "../../data/samples/2206.01062.pdf" attached.


In [14]:
# allow recent changes to become visible to search before the attachments are
# listed below (fixed pause of WAIT_S seconds)
time.sleep(WAIT_S)

### List item attachments

In [15]:
# get item attachments data under given key
# (the item is looked up again by its id, then its `_s3_data` entry for
# ATTACHMENT_KEY is returned)
attch_list = list_item_attachments(
    api=api,
    coordinates=data_index.source,
    index_item_id=idx_item_id,
    attch_key=ATTACHMENT_KEY,
)

In [16]:
# browse attachments: show the raw record, then a clickable download link
for attachment in attch_list:
    display(attachment)
    # escape both values before embedding them in markup: the URL carries "&"
    # separators in its query string, and neither value is guaranteed to be
    # markup-safe
    filename = html.escape(Path(attachment["path"]).name)
    download_url = html.escape(attachment["url"], quote=True)
    display(HTML(f'&#128073; Download <a href="{download_url}">{filename}</a>'))
    print()
    

{'date': '2023-03-03T16:00:15.348483+00:00',
 'path': 'attachments/83c0a0c4-22b3-4b77-9179-01fcaee03f67/2206-00785.pdf',
 'mime': 'application/pdf',
 'url': 'https://s3.eu-de.cloud-object-storage.appdomain.cloud/cps-dev-deepsearch-dev-bags/cps-dev-deepsearch-dev-projdatad1ba44/attachments/83c0a0c4-22b3-4b77-9179-01fcaee03f67/2206-00785.pdf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=b7d02de376e24e97904c2f698795e3d0%2F20230303%2Feu-de-standard%2Fs3%2Faws4_request&X-Amz-Date=20230303T160020Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=48c2b5b2cded793eceb40630708362e5740d42b0eb265b2a2d41d5bfa6af524f'}




{'date': '2023-03-03T16:00:17.411964+00:00',
 'path': 'attachments/e8d20361-7e4a-4521-bea9-bad156f952ac/2206-01062.pdf',
 'mime': 'application/pdf',
 'url': 'https://s3.eu-de.cloud-object-storage.appdomain.cloud/cps-dev-deepsearch-dev-bags/cps-dev-deepsearch-dev-projdatad1ba44/attachments/e8d20361-7e4a-4521-bea9-bad156f952ac/2206-01062.pdf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=b7d02de376e24e97904c2f698795e3d0%2F20230303%2Feu-de-standard%2Fs3%2Faws4_request&X-Amz-Date=20230303T160020Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=282188f3c3ff0b4a7336205d844b60fe068b0175173e60b561e3f26a4185df39'}




> Note that attachment filenames are slugified, e.g. `2206.00785.pdf` is stored as `2206-00785.pdf`.

### Remove project data index

In [17]:
# clean up: delete the data index that was created for this example
api.data_indices.delete(data_index.source)