# Generate JSON documents for submission to Gen3 indexd

This notebook creates indexd registration documents for the files stored in the S3 bucket `kf-seq-data-broad` for the Chung 2016 cohort and, when explicitly enabled, submits them to the Gen3 QA indexd endpoint.

In [None]:
import os
import uuid
import boto3
import requests
import pandas as pd
# Do not truncate cell contents when DataFrames are displayed. None means
# "no limit"; negative widths such as -1 were deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)

from dataservice.util.data_import.utils import (
    read_json,
    write_json
)

# Study accession written into each registration document
STUDY_ID = 'phs001110'

# S3 bucket holding the files, and the key prefix used to select them
BUCKET_NAME = 'kf-seq-data-broad'
PREFIX = 'fc-ff4e8f53-e153-4c78-b630-0ebe66030d80'

# Local working directory and the JSON file where the generated
# registration documents (keyed by UUID) are written and re-read
DATA_DIR = '/Users/singhn4/Projects/kids_first/data/Chung'
GF_BY_UUID_FP = os.path.join(DATA_DIR, 'genomic_files_by_uuid.json')

In [None]:
# Handle used below to list the bucket contents via client.Bucket(...).
# NOTE: despite the variable name, this is a boto3 *resource* object
# (created with boto3.resource), not one created with boto3.client.
client = boto3.resource('s3')

In [None]:
def generate_uuids(bucket_name=None, prefix=None, study_id=None, objects=None):
    """Build one indexd registration document per S3 object.

    Every object gets a freshly generated UUID4, which is used both as the
    document's ``did`` and as its key in the returned mapping.

    Args:
        bucket_name: Bucket name used to build the ``s3://`` URLs (and to
            list objects when ``objects`` is not given). Defaults to the
            module-level ``BUCKET_NAME``.
        prefix: Key prefix used to filter the bucket listing. Defaults to
            the module-level ``PREFIX``. Ignored when ``objects`` is given.
        study_id: Value stored under ``metadata['acls']``. Defaults to the
            module-level ``STUDY_ID``.
        objects: Optional iterable of items exposing ``key``, ``e_tag`` and
            ``size`` attributes. When omitted, the objects are listed from
            the bucket through the module-level ``client``.

    Returns:
        dict mapping each generated UUID string to its registration
        document. The dict is empty when there are no objects.
    """
    bucket_name = BUCKET_NAME if bucket_name is None else bucket_name
    study_id = STUDY_ID if study_id is None else study_id
    if objects is None:
        prefix = PREFIX if prefix is None else prefix
        objects = client.Bucket(bucket_name).objects.filter(Prefix=prefix)

    by_uuid = {}
    for obj in objects:
        # Use etag part 1 for md5sum since there are no md5sums for this
        # dataset: keep the text before the first '-' and drop the double
        # quotes surrounding the ETag value.
        # NOTE(review): for multipart uploads this prefix is presumably not
        # the MD5 of the file content - confirm that is acceptable.
        md5 = obj.e_tag.split('-')[0].strip('"')
        _id = str(uuid.uuid4())
        # Create file registration
        by_uuid[_id] = {
            'metadata': {'acls': study_id},
            'did': _id,
            'file_name': os.path.basename(obj.key),
            'form': 'object',
            'size': obj.size,
            'urls': ['s3://{}/{}'.format(bucket_name, obj.key)],
            'hashes': {
                'md5': md5
            }
        }
    # Return only after every object has been processed; returning from
    # inside the loop would register just the first object.
    return by_uuid

In [None]:
# Load the registration documents from the JSON file when it is already on
# disk; otherwise generate them and write them to that file.
if os.path.exists(GF_BY_UUID_FP):
    message = '{} already exists. Reading from file ...'.format(GF_BY_UUID_FP)
    print(message)
    data = read_json(GF_BY_UUID_FP)
else:
    print('Generating genomic files info and uuids and writing to file...')
    data = generate_uuids()
    write_json(data, GF_BY_UUID_FP)

# Bare expression: the notebook shows it as the cell output
data

In [None]:
# Submit files to Gen3 QA via indexd endpoint
# ENABLE is a safety switch: nothing is sent unless it is set to True.
ENABLE = False
if ENABLE:
    # Get auth: basic-auth username/password are read from the environment
    auth = (os.environ.get('KF_INDEXD_UNAME'), os.environ.get('KF_INDEXD_PWD'))
    if None in auth:
        # Fail before sending anything rather than POSTing every record
        # with missing credentials.
        raise RuntimeError(
            'KF_INDEXD_UNAME and KF_INDEXD_PWD must both be set to submit')
    # Submit one registration document per file, remembering which failed
    failed = []
    for body in data.values():
        resp = requests.post('https://gen3qa.kids-first.io/index/index/',
                             auth=auth,
                             json=body)
        print(resp.status_code)
        if not resp.ok:
            failed.append(body.get('did'))
    if failed:
        print('Submission finished with {} failed record(s): {}'.format(
            len(failed), failed))
    else:
        print('Completed submission!')
else:
    # Do not report completion when nothing was sent
    print('Submission skipped because ENABLE is False.')