# Generate JSON documents for submission to Gen3 indexd

This notebook creates indexd registration documents for the files stored in the S3 bucket `kf-seq-data-hudsonalpha` for the Rios_Wise_2016 cohort.

In [1]:
import os
import json
import uuid
from pprint import pprint

import requests
import boto3
import pandas as pd
# `None` turns off column-width truncation when displaying frames. The older
# spelling `-1` is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# Root directory of the local cohort data. Set the KF_DATA_DIR environment
# variable to run this notebook on another machine; the default is the
# original author's local path.
DATA_DIR = os.environ.get(
    'KF_DATA_DIR',
    '/Users/singhn4/Projects/kids_first/data/Rios_Wise_2016/')
# Sub-directory of DATA_DIR for genomic files
GF_DIR = os.path.join(DATA_DIR, 'genomic_files')
# JSON file holding the generated registration documents keyed by uuid
GF_BY_UUID_FP = os.path.join(DATA_DIR, 'genomic_files_by_uuid.json')
# Study identifier stored under metadata.acls of every registration document
STUDY_ID = 'phs001410'
# S3 bucket whose objects are registered
BUCKET_NAME = 'kf-seq-data-hudsonalpha'

In [2]:
# Despite the name, this is a boto3 *resource* handle for S3 (used below via
# `client.Bucket(...)`), not a low-level boto3 client.
client = boto3.resource(service_name='s3')

In [3]:
# Helper functions
def read_json(filepath):
    """Parse the JSON document stored at `filepath` and return the result."""
    with open(filepath, mode='r') as handle:
        contents = handle.read()
    return json.loads(contents)
    
def write_json(data, filepath):
    """Serialize `data` as JSON into `filepath`, replacing any existing file.

    Keys are sorted and the output is indented by four spaces, with ',' and
    ':' as separators (no space after the colon).
    """
    dump_options = {
        'sort_keys': True,
        'indent': 4,
        'separators': (',', ':'),
    }
    with open(filepath, mode='w') as out_file:
        json.dump(data, out_file, **dump_options)

In [34]:
# Load the Rios genomic file manifest and turn it into a dict keyed by library
filepath = os.path.join(DATA_DIR, 'manifests', 'manifest_171210.csv')
df = pd.read_csv(filepath)
# Keep only the text after the last ':' of each sample description
df['Sample Description'] = df['Sample Description'].map(
    lambda description: description.rsplit(':', 1)[-1].strip())
# One entry per 'Library' value: {library: {column: value, ...}}
df = df.set_index('Library')
gf_manifest_dict = df.to_dict(orient='index')

(395, 5)

In [None]:
# Iterate over all HudsonAlpha objects
# Build one indexd registration document per object whose parent folder (the
# second-to-last component of its key) is a library listed in the manifest.
# Documents are stored in `by_uuid`, keyed by a freshly generated uuid4.
by_uuid = {}
# Keys whose ETag carries a '-<part count>' suffix. S3 produces such ETags for
# multipart uploads, and their leading hex digest is NOT the MD5 of the object,
# so the 'md5' recorded for these keys needs independent verification.
multipart_keys = []
bucket = client.Bucket(BUCKET_NAME)
# Let S3 apply the 'hai' prefix filter instead of listing the whole bucket
for obj in bucket.objects.filter(Prefix='hai'):
    # Check if this file exists in the manifest. A key without a parent
    # folder cannot match any library (and indexing [-2] on it would raise).
    key_parts = obj.key.split('/')
    if len(key_parts) < 2 or key_parts[-2] not in gf_manifest_dict:
        print('{} has no matching entry in manifest'.format(obj.key))
        continue

    # Create file registration
    e_tag = obj.e_tag.strip('"')
    if '-' in e_tag:
        multipart_keys.append(obj.key)
    md5 = e_tag.split('-')[0]
    size = obj.size
    file_name = os.path.basename(obj.key)
    path = 's3://{}/{}'.format(BUCKET_NAME, obj.key)
    _id = str(uuid.uuid4())
    body = {
        'metadata': {'acls': STUDY_ID},
        'did': _id,
        'file_name': file_name,
        'form': 'object',
        'size': size,
        'urls': [path],
        'hashes': {
            'md5': md5
        }
    }
    by_uuid[_id] = body

if multipart_keys:
    print('WARNING: {} object(s) have multipart ETags; the md5 recorded for '
          'them is not the MD5 of the file content'.format(len(multipart_keys)))


In [None]:
# Persist the registration documents, but never overwrite an existing file:
# the uuids are random, so regenerating the file would assign new ids.
if os.path.exists(GF_BY_UUID_FP):
    print('{} already exists'.format(GF_BY_UUID_FP))
else:
    write_json(by_uuid, GF_BY_UUID_FP)

In [5]:
# Load the saved registration documents (uuid -> document)
data = read_json(GF_BY_UUID_FP)
# Tabular view of the documents, one row per file
df = pd.DataFrame([*data.values()])
# First (and, as written by this notebook, only) url of every document
urls = set(document['urls'][0] for document in data.values())

In [None]:
# Submit files to Gen3 QA via indexd endpoint
# Nothing is sent unless ENABLE is set to True.
ENABLE = False
if ENABLE:
    # Credentials are read from the environment (None when a variable is unset)
    auth = tuple(os.environ.get(name)
                 for name in ('KF_INDEXD_UNAME', 'KF_INDEXD_PWD'))
    # POST every registration document and show the HTTP status of each call
    for registration in data.values():
        resp = requests.post('https://gen3qa.kids-first.io/index/index/',
                             json=registration,
                             auth=auth)
        print(resp.status_code)

In [None]:
# Retrieve lost uuids :(
# Crawl the indexd index page by page, saving the payload of every record and
# ticking off the S3 urls registered by this notebook as they are found.
remaining_urls = set(urls)
auth = (os.environ.get('KF_INDEXD_UNAME'), os.environ.get('KF_INDEXD_PWD'))
API_URL = 'https://gen3qa.kids-first.io/index/index'

payloads = []
# Ids already processed. NOTE(review): it is not visible from this notebook
# whether `start` is inclusive or exclusive; skipping ids that were already
# seen makes the crawl correct (and terminating) under either behaviour.
seen_ids = set()
# None means "first page": no `start` parameter is sent, so the first record
# of the index is included rather than only used as a cursor.
current_id = None
count = 0
while True:
    # Get a page
    print('Page {}'.format(count))
    endpoint = API_URL + '/'
    if current_id is not None:
        endpoint += '?start=' + str(current_id)
    print('Get {}'.format(endpoint))
    r = requests.get(endpoint, auth=auth)
    # Fail loudly on HTTP errors instead of on a confusing JSON/KeyError
    r.raise_for_status()
    _ids = [_id for _id in r.json()['ids'] if _id not in seen_ids]
    # A page without unseen ids means the index is exhausted. Stop even if
    # some urls were never found, otherwise the loop would spin forever.
    if not _ids:
        break
    seen_ids.update(_ids)
    # For each id in page, get payload and save it
    for i, _id in enumerate(_ids):
        print('\tId #{} = {}'.format(i, _id))
        r1 = requests.get(API_URL + '/' + _id)
        r1.raise_for_status()
        b = r1.json()
        # Records without any url cannot match; keep their payload anyway
        record_urls = b.get('urls') or []
        url_key = record_urls[0] if record_urls else None
        # If this is a url we're looking for, mark that exact url as found
        # (a url registered more than once is only counted once)
        if url_key in urls:
            print('\t\tFound a url {}'.format(url_key))
            remaining_urls.discard(url_key)
        else:
            print('Url {}'.format(url_key))
        payloads.append(b)
    # Set next page
    current_id = _ids[-1]
    count += 1
    print('Payload total {}'.format(len(payloads)))
    print('Id Stack total {}'.format(len(remaining_urls)))

if remaining_urls:
    print('{} url(s) were not found in the index'.format(len(remaining_urls)))

In [32]:
# Announce the end of the crawl and save the collected payloads, if any
print('Complete!')
if len(payloads) > 0:
    write_json(payloads, os.path.join(DATA_DIR, 'payloads.json'))

Complete!


In [63]:
# Results
# Match the payloads retrieved from indexd against the documents generated by
# this notebook (joined on S3 path) so each document gets the did indexd holds.
in_payloads = read_json(os.path.join(DATA_DIR, 'payloads.json'))
df = pd.DataFrame(in_payloads)


def func(row):
    """Return the stripped study id stored in a payload row, or None.

    The study id is read from ``row['metadata']['acls']``. None is returned
    when the metadata is not a dict, has no 'acls' key, or the value is not a
    string.
    """
    metadata = row['metadata']
    if isinstance(metadata, dict) and isinstance(metadata.get('acls'), str):
        return metadata['acls'].strip()
    return None


df['study_id'] = df.apply(func, axis=1)
# Keep this study's records only; copy so that adding a column below does not
# write into a slice of the full frame (SettingWithCopyWarning)
df = df[df['study_id'] == STUDY_ID].copy()
df['s3_path'] = df['urls'].apply(lambda x: x[0])
df = df[['did', 's3_path']]

# Originals
orig = read_json(GF_BY_UUID_FP)
df0 = pd.DataFrame(list(orig.values()))
df0['s3_path'] = df0['urls'].apply(lambda x: x[0])
# Drop the locally generated did: the did found in indexd replaces it
del df0['did']

# Merge
merged_df = pd.merge(df, df0, on='s3_path')
del merged_df['s3_path']
# Index by did while keeping 'did' as a regular column of each document
merged_df['_index'] = merged_df['did']
merged_df.set_index('_index', inplace=True)
results = merged_df.to_dict(orient='index')

# Write to file
verified_gf_fp = os.path.join(DATA_DIR, 'verified_genomic_files_by_uuid.json')
write_json(results, verified_gf_fp)