# DSS-AZUL-INDEXING

### This Jupyter notebook runs a few test indexing operations using the DataExtractor class, the FileIndexer class, and the other indexer classes.

Below we import our modules and set up:
* An Elasticsearch client
* A dummy event payload
* The `bundle_uuid` and `bundle_version` parsed from that payload

In [1]:
from elasticsearch import Elasticsearch
from chalicelib.utils import DataExtractor
from chalicelib.indexer import FileIndexer, AssayOrientedIndexer, SampleOrientedIndexer,\
    ProjectOrientedIndexer, BundleOrientedIndexer
import json
from pprint import pprint

# Elasticsearch client for the node described below (localhost, port 9200)
es_nodes = [{'host': 'localhost', 'port': 9200}]
es = Elasticsearch(es_nodes)
# Dummy event payload; its "match" entry identifies the bundle to index
payload = {
    "query": {"query": {"match_all": {}}},
    "subscription_id": "ba50df7b-5a97-4e87-b9ce-c0935a817f0b",
    "transaction_id": "ff6b7fa3-dc79-4a79-a313-296801de76b9",
    "match": {
        "bundle_version": "2018-01-03T173141.039382Z",
        "bundle_uuid": "d8a4576b-66f5-4b10-aecf-f6c4025b9997",
    },
}
# Pull the bundle's UUID and version out of the payload's "match" entry
bundle_uuid, bundle_version = (
    payload['match'][match_key] for match_key in ('bundle_uuid', 'bundle_version')
)

Next, we will create an instance of the DataExtractor and use it to get the contents from the bundle referenced by the variable `payload`. We will be pulling from the AWS replica.

In [2]:
# Build a DataExtractor for the DSS endpoint below.
# NOTE(review): this was previously described as "HCA Staging", but the URL has
# no "staging" component - confirm which deployment is intended.
extractor = DataExtractor("https://dss.data.humancellatlas.org/v1")
# Fetch the bundle named in the dummy payload from the "aws" replica; the call
# hands back the metadata files and the data files as two separate objects
metadata_files, data_files = extractor.extract_bundle(payload, "aws")
# Pretty-print each result under its own three-line banner
banner_sections = (
    (
        (
            "\n#####################################################",
            "#                    PRINTING METADATA               #",
            "#####################################################",
        ),
        metadata_files,
    ),
    (
        (
            "\n#####################################################",
            "#                  PRINTING DATA FILES               #",
            "#####################################################",
        ),
        data_files,
    ),
)
for banner_lines, bundle_part in banner_sections:
    print(*banner_lines, sep="\n")
    pprint(bundle_part, indent=4)


#####################################################
#                    PRINTING METADATA               #
#####################################################
{   'assay.json': {   'content': {   'assay_id': 'assay_1',
                                     'core': {   'schema_url': 'https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay.json',
                                                 'schema_version': '4.6.1',
                                                 'type': 'assay'},
                                     'rna': {   'end_bias': 'five_prime_end',
                                                'library_construction': 'smart-seq2',
                                                'strand': 'both'},
                                     'seq': {   'instrument_model': 'HiSeq '
                                                                    '2500',
                                                'instrument_platform': 'Illumina',
      

Next we pass these results to the indexer classes (file-, bundle-, assay-, sample-, and project-oriented) to create index entries in the Elasticsearch instance running on `localhost:9200`. But first, we load the index settings and the index mapping configuration from their JSON files.

In [3]:
# Define helper method to open files
def open_and_return_json(file_path):
    """
    Opens and returns the contents of the json file given in file_path
    :param file_path: Path of a json file to be opened
    :return: Returns an obj with the contents of the json file
    :raises OSError: If the file cannot be opened
    :raises json.JSONDecodeError: If the file does not contain valid JSON
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (which open() would otherwise fall back to)
    with open(file_path, 'r', encoding='utf-8') as file_:
        loaded_file = json.load(file_)
    return loaded_file

# Get the index's settings
index_settings = open_and_return_json('chalicelib/settings.json')
# Get the index overall config
index_mapping_config = open_and_return_json('chalicelib/config.json')


def _build_indexer(indexer_class, index_name):
    """
    Instantiates one indexer with the arguments every indexer in this cell shares
    :param indexer_class: Indexer class to instantiate (e.g. FileIndexer)
    :param index_name: String passed as the fourth positional argument
        (e.g. "file_index_v4"); presumably the target index name - confirm
        against the indexer's signature
    :return: Returns the newly created indexer instance
    """
    # "doc" is passed unchanged to every indexer; presumably the document
    # type - TODO confirm against chalicelib.indexer
    return indexer_class(metadata_files,
                         data_files,
                         es,
                         index_name,
                         "doc",
                         index_settings=index_settings,
                         index_mapping_config=index_mapping_config)


# One indexer per orientation; they differ only in class and index name
file_indexer = _build_indexer(FileIndexer, "file_index_v4")
bundle_indexer = _build_indexer(BundleOrientedIndexer, "bundle_index_v4")
assay_indexer = _build_indexer(AssayOrientedIndexer, "assay_index_v4")
sample_indexer = _build_indexer(SampleOrientedIndexer, "sample_index_v4")
project_indexer = _build_indexer(ProjectOrientedIndexer, "project_index_v4")

# Index the same bundle with each indexer, in a fixed order
for indexer in (file_indexer,
                bundle_indexer,
                assay_indexer,
                sample_indexer,
                project_indexer):
    indexer.index(bundle_uuid, bundle_version)

print("INDEXING DONE")

PRINTING FILE INDEX DOCUMENT:

{'analysisComputationalMethod': ['None'],
 'analysisId': ['None'],
 'assayId': ['assay_1'],
 'bundles': [{'type': 'scRNA-Seq Upload',
              'uuid': 'd8a4576b-66f5-4b10-aecf-f6c4025b9997',
              'version': '2018-01-03T173141.039382Z'}],
 'es_uuid': 'd8a4576b-66f5-4b10-aecf-f6c4025b9997:5da5ace1-e8ca-4454-9d8f-d0105b984632',
 'files': {'content-type': 'application/gzip; dcp-type=data',
           'crc32c': '4ef74578',
           'format': 'fastq.gz',
           'indexed': False,
           'name': 'R1.fastq.gz',
           's3_etag': 'c7bbee4c46bbf29432862e05830c8f39',
           'sha1': '17f8b4be0cc6e8281a402bb365b1283b458906a3',
           'sha256': 'fe6d4fdfea2ff1df97500dcfe7085ac3abfb760026bff75a34c20fb97a4b2b29',
           'size': 125191,
           'uuid': '5da5ace1-e8ca-4454-9d8f-d0105b984632',
           'version': '2018-01-03T173137.798791Z'},
 'pairedEnds': [True],
 'projectContributorsEmail': ['mfreeberg@ebi.ac.uk'],
 'projectId'

PRINTING SAMPLE INDEX DOCUMENT:

defaultdict(<class 'list'>,
            {'analysisComputationalMethod': ['None'],
             'analysisId': ['None'],
             'assayId': ['assay_1'],
             'bundles': [{'type': 'scRNA-Seq Upload',
                          'uuid': 'd8a4576b-66f5-4b10-aecf-f6c4025b9997',
                          'version': '2018-01-03T173141.039382Z'}],
             'es_uuid': 'Q4_DEMO-donor_MGH30',
             'files': [{'content-type': 'application/gzip; dcp-type=data',
                        'crc32c': '4ef74578',
                        'format': 'fastq.gz',
                        'indexed': False,
                        'name': 'R1.fastq.gz',
                        's3_etag': 'c7bbee4c46bbf29432862e05830c8f39',
                        'sha1': '17f8b4be0cc6e8281a402bb365b1283b458906a3',
                        'sha256': 'fe6d4fdfea2ff1df97500dcfe7085ac3abfb760026bff75a34c20fb97a4b2b29',
                        'size': 125191,
                       