## Digest data sets to simplified json objects for GEO submission

This notebook exports (1) ExperimentSets and related (2) Experiments and (3) Biosamples as json files, compatible with GEO submission. Embedded objects and complex data structures are simplified in these output files.

* Part 0. Initialize all functions.
* Part 1. List all sets to export.
* Part 2. Generate simplified dictionaries for each ExpSet, Experiment and Biosample.
* Part 3. Save dictionaries as json files.
* Part 4. Patch items (ExpSets, Experiments, Biosamples, Files) with date of export for external submission.

**NOTES**

Things to check manually:
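* Files with status `restricted` (e.g. from HeLa) are not exported. However, this check does not prevent such files from being exported before release, while their status is still `uploaded` or `pre-release`. Be careful!
* Status is checked only for files, not for ExperimentSets, Experiments or Biosamples.
* If multiple biosources are linked to one biosample, only the biosource accessions are exported: fill in the biosource details manually.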

**ToDo**

* improve File provenance tracking
* support other_processed_files

### Part 0 - Initialize all functions

In [None]:
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from dcicutils import ff_utils
from functions.notebook_functions import get_key
from functions.geo_minimization import *

# Value returned by get_key for the key named 'andrea_hs'; passed as `key` to the ff_utils calls in this notebook
my_auth = get_key('andrea_hs')
# Written to external_submission.database when items are patched in Part 4
DB = 'GEO'

# File formats (file_format display_title) of processed files to be exported
FORMATS = ['mcool', 'pairs', 'bw']
# File statuses of raw and processed files to be exported; files with any other status are skipped
STATUSES = ['uploaded', 'pre-release', 'released', 'released to project']

# 'content' of the item with this uuid; simplify_expset attaches it to ExpSets without produced_in_pub
data_use_guidelines = ff_utils.get_metadata("621e8359-3885-40ce-965d-91894aa7b758", key=my_auth)['content']
# Used by simplify_expset as produced_in_pub for ExpSets without a publication (4DN white paper)
pmid_4dn_paper = '28905911'


def get_item_in_store(item):
    '''Return the object stored for item, fetching and caching it when missing.

    item is either an @id or a dictionary that has an '@id' key.
    When the @id is not a key of store yet, the object is retrieved with
    ff_utils.get_metadata and saved in store before being returned.
    '''
    item_id = item['@id'] if isinstance(item, dict) else item
    if item_id in store:
        return store[item_id]
    fetched = ff_utils.get_metadata(item_id, key=my_auth)
    store[item_id] = fetched
    return fetched


### Files
def boildown_files(files_list):
    '''Takes list of raw files and produces list of runs.

    Only files whose status is in STATUSES are exported.
    Each run is a pair of files (if paired end) or a single file (if single end).
    runs = [[pe1, pe2], ..., [se], ...]

    Files with paired_end '2' are only added through their paired_end '1' mate.
    A paired_end '1' file whose mate is not among the exported files is
    reported with a warning and added as a single-file run.

    Returns a tuple (runs, exported_files_ids), where exported_files_ids
    are the @id of all the files that passed the status filter.
    '''
    files_to_export = [f for f in files_list if f['status'] in STATUSES]
    exported_files = [boildown_file(get_item_in_store(f)) for f in files_to_export]
    exported_files_ids = [f['@id'] for f in files_to_export]
    runs = []
    for a_file in exported_files:
        paired_end = a_file.get('paired_end')
        if paired_end is None:
            runs.append([a_file])
        elif paired_end == '1':
            # the mate is looked up again for every file, so that a mate found
            # for a previous run is never reused by mistake
            # NOTE(review): assumes boildown_file stores the accession of the
            # paired file under 'related_files' - confirm in geo_minimization
            mate_accession = a_file.get('related_files')
            mate = None
            if mate_accession is not None:
                mate = next(
                    (other for other in exported_files
                     if other.get('accession') == mate_accession),
                    None,
                )
            if mate is None:
                print('WARNING: paired file not found among exported files for',
                      a_file.get('accession'))
                runs.append([a_file])
            else:
                runs.append([a_file, mate])
    return runs, exported_files_ids


def boildown_processed_files(processed_files_list):
    '''Simplify the processed files that have an exportable format and status.

    A file is kept when its file_format display_title is in FORMATS and
    its status is in STATUSES. Each kept file is taken from store and
    simplified with boildown_file.

    Returns a tuple (simplified files, @id of the kept files).
    '''
    simplified_files = []
    kept_ids = []
    for processed_file in processed_files_list:
        if processed_file['file_format']['display_title'] not in FORMATS:
            continue
        if processed_file['status'] not in STATUSES:
            continue
        full_object = get_item_in_store(processed_file)
        simplified_files.append(boildown_file(full_object))
        kept_ids.append(full_object['@id'])
    return simplified_files, kept_ids


### Experiment Set
# ExpSet keys whose value is copied to the output without any processing
expset_simple_values = [
    'accession',
    'description',
    'dataset_label',
    'condition',
    'public_release',
    'number_of_experiments',
]

# Maps an ExpSet key to the function that simplifies its value (see simplify_expset).
# Commented-out entries are keys that are currently not exported.
expset_function_dispatch = {
    '@id': atid2url,
    'lab': boildown_title,
    'contributing_labs': boildown_list_to_titles,
    'submitted_by': boildown_title,
    'award': boildown_award,
#     'last_modified': boildown_date_modified,
    # simplify_expset unpacks two values from this one: (export value, list of experiments)
    'experiments_in_set': boildown_experiments_in_set,  # gets exps list, replicates, exp type(s), !series_title
#     'documents': boildown_protocols,
    'external_references': boildown_external_references,  # use instead of dbxrefs because it is validated
    'produced_in_pub': boildown_publication,  # returns also !Series_citation if no PMID is available
    # simplify_expset unpacks two values from this one: (export value, list of file @id)
    'processed_files': boildown_processed_files,
}


def simplify_expset(expset_object):
    '''Build the simplified dictionary of an ExpSet.

    Only keys declared in expset_simple_values or expset_function_dispatch
    are exported; any other key is ignored, and so are empty values.
    When the ExpSet has no produced_in_pub, the data use guidelines and
    the PMID of the 4DN white paper are attached instead.
    The ExpSet accession is appended to series_title.

    Returns a tuple (simplified dictionary, file ids, experiment ids).
    '''
    simplified = {}
    collected_file_ids = []
    collected_exp_ids = []
    for field, raw in expset_object.items():
        if field in expset_simple_values:
            result = raw
        elif field == 'processed_files':
            result, ids = expset_function_dispatch[field](raw)
            collected_file_ids.extend(ids)
        elif field == 'experiments_in_set':
            result, ids = expset_function_dispatch[field](raw)
            collected_exp_ids.extend(ids)
        elif field in expset_function_dispatch:
            result = expset_function_dispatch[field](raw)
        else:
            continue  # key not declared: ignore it

        if result:
            simplified = add_to_output_dict(field, result, simplified)

    # no publication: attach data use guidelines and 4DN white paper
    if not expset_object.get('produced_in_pub'):
        simplified['data_use_guidelines'] = data_use_guidelines
        simplified['produced_in_pub'] = pmid_4dn_paper

    title_prefix = simplified.get('series_title', '')
    simplified['series_title'] = title_prefix + expset_object['accession']
    return simplified, collected_file_ids, collected_exp_ids


### Protocols
def boildown_experimental_protocol(experiment_object):
    '''Return experimental_protocol (list), by combining protocol,
    protocol_variation and cell_sorting_protocol.

    experiment_object is the whole Experiment. Each linked protocol is
    taken from store and the resulting list is simplified with
    boildown_protocols.

    Returns a dictionary {'experimental_protocol': list of protocols}.
    '''
    protocols = []
    if experiment_object.get('protocol'):
        protocols.append(get_item_in_store(experiment_object['protocol']))

    # The key is 'protocol_variation' in experiment_function_dispatch and in
    # simplify_experiment; the plural spelling is still accepted as a fallback.
    # The value is a list: each protocol must be retrieved individually,
    # since get_item_in_store takes a single @id (or dictionary) at a time.
    variations = (experiment_object.get('protocol_variation')
                  or experiment_object.get('protocol_variations')
                  or [])
    protocols.extend(get_item_in_store(variation) for variation in variations)

    if experiment_object.get('cell_sorting_protocol'):
        protocols.append(get_item_in_store(experiment_object['cell_sorting_protocol']))

    protocols_list = boildown_protocols(protocols)
    return {'experimental_protocol': protocols_list}


def boildown_cell_culture_details(biosample_object):
    ''' Return cell_culture_protocols (list), by combining SOP_cell_line from
    each Biosource and 'protocols_additional' from each item in
    cell_culture_details of the Biosample.

    Returns a dictionary {'cell_culture_protocols': list of protocols}.
    '''
    collected = []
    for biosource_ref in biosample_object['biosource']:
        biosource = get_item_in_store(biosource_ref)
        sop_cell_line = biosource.get('SOP_cell_line')
        if sop_cell_line:
            collected.append(get_item_in_store(sop_cell_line))
    for culture in biosample_object.get('cell_culture_details', []):
        for additional in culture.get('protocols_additional', []):
            collected.append(get_item_in_store(additional))
        # NOTE: 'authentication_protocols' are currently not exported
    return {'cell_culture_protocols': boildown_protocols(collected)}


def boildown_biosample_protocols(biosample_protocols):
    ''' Get each (non-empty) protocol from store and simplify
    the resulting list with boildown_protocols'''
    fetched = []
    for protocol_ref in biosample_protocols:
        if protocol_ref:
            fetched.append(get_item_in_store(protocol_ref))
    return boildown_protocols(fetched)


### Experiment

# note that some values appear in multiple schemas, but here are only listed once
# Experiment keys whose value is copied to the output without any processing,
# grouped by the schema where they are declared
experiment_simple_values = [
    # TODO: add 'other_processed_files'?

    # mixins
    'public_release',
    'library_prep_kit',
    'average_fragment_size',
    'fragment_size_range',
    'fragmentation_method',
    'fragment_size_selection_method',
    'pcr_cycles',
    'spikin_description',
    'antibody_lot_id',
    'antibody_dilution',

    # calcprops
    'experiment_summary',

    # experiment (generic)
    'accession',
    'description',

    # experiment_hi_c
    'crosslinking_method',
    'crosslinking_time',
    'crosslinking_temperature',
    'enzyme_lot_number',
    'digestion_time',
    'digestion_temperature',
    'tagging_method',
    'ligation_time',
    'ligation_temperature',
    'ligation_volume',
    'biotin_removed',

    # experiment_atacseq
    'transposase',
    'enzyme_incubation_time',
    'incubation_temperature',
    'primer_removal_method',

    # experiment_capture_c
    'rna_tag',

    # experiment_chiapet (no additional values)

    # experiment_damid
    'sap_treatment',
    'me_pcr_cycles',
    'y_ligation_dna_input',

    # experiment_repliseq
    'dna_label',
    'labeling_time',
    'cell_cycle_phase',
    'stage_fraction',
    'total_fractions_in_exp',

    # experiment_seq
    'tagging_rounds',
    'reaction_time',
    '3p_adenylation_time',
    '3p_adenylation_temperature',
    'strandedness',
    'molecule',

    # experiment_tsaseq
    'protocol_version',
    'resolution',
    'secondary_antibody_lot_id',
    'secondary_antibody_dilution',
    'tyramide_concentration',
    'reaction_buffer',
    'reaction_temperature',
    'affinity_rounds',
    'average_biotin_range',
    'biotinylated_spikein_source',
    'non_biotinylated_spikein_source',
    # TODO: add 'biotinylated_spikein_sequences', 'non_biotinylated_spikein_sequences'? (linkTo FileReferences)
]

# Maps an Experiment key to the function that simplifies its value (see simplify_experiment).
# Commented-out entries are keys that are currently not exported.
experiment_function_dispatch = {
    '@id': atid2url,
    'lab': boildown_title,
    'contributing_labs': boildown_list_to_titles,
    'submitted_by': boildown_title,
    'award': boildown_award,
    'display_title': boildown_exp_display_title,
    'experiment_type': boildown_experiment_type,
    'experiment_relation': boildown_experiment_relations,
#     'last_modified': boildown_date_modified,

#     'experiment_categorizer': boildown_exp_categorizer,  # 'combined' key (enzyme or target) + value
    'targeted_factor': boildown_list_to_titles,
    'targeted_regions': boildown_targeted_regions,  # Capture Hi-C Experiment
    'digestion_enzyme': boildown_title,
    'antibody': boildown_title,
    'secondary_antibody': boildown_title,
    'reporter_construct': boildown_title,
    
    # simplify_experiment unpacks two values from this one: (export value, biosample id)
    'biosample': boildown_biosample_name,
    # called by simplify_experiment with the whole Experiment, not with the value of the key
    'biosample_quantity': boildown_biosample_quantity,  # includes units
    
    # experimental protocol
    # these three are called by simplify_experiment with the whole Experiment
    'protocol': boildown_experimental_protocol,  # includes protocol_variation and cell_sorting_protocol # !Sample_extract_protocol,
    'protocol_variation': boildown_experimental_protocol,  # includes protocol and cell_sorting_protocol # !Sample_extract_protocol,
    'cell_sorting_protocol': boildown_experimental_protocol,  # includes protocol and protocol_variation # !Sample_extract_protocol,

    # simplify_experiment unpacks two values from these two: (export value, list of file @id)
    'files': boildown_files,
    'processed_files': boildown_processed_files,
}

def simplify_experiment(experiment_object):
    '''Build the simplified dictionary of an Experiment.

    Only keys declared in experiment_simple_values or
    experiment_function_dispatch are exported; any other key is ignored,
    and so are empty values.

    Returns a tuple (simplified dictionary, file ids, biosample id).
    The biosample id is an empty string if the Experiment has no
    'biosample' key.
    '''
    # keys whose function needs the entire Experiment, not only the value
    whole_object_keys = ('biosample_quantity', 'protocol', 'protocol_variation', 'cell_sorting_protocol')
    # keys whose function also returns the list of exported file ids
    file_keys = ('files', 'processed_files')

    simplified = {}
    collected_file_ids = []
    biosample_id = ''
    for field, raw in experiment_object.items():
        if field in experiment_simple_values:
            result = raw
        elif field in whole_object_keys:
            result = experiment_function_dispatch[field](experiment_object)
        elif field in file_keys:
            result, ids = experiment_function_dispatch[field](raw)
            collected_file_ids.extend(ids)
        elif field == 'biosample':
            result, biosample_id = experiment_function_dispatch[field](raw)
        elif field in experiment_function_dispatch:
            result = experiment_function_dispatch[field](raw)
        else:
            continue  # key not declared: ignore it

        if result:
            simplified = add_to_output_dict(field, result, simplified)
    return simplified, collected_file_ids, biosample_id


### Biosample
# Individual keys whose value is copied to the output without any processing
individual_simple_values = [
    'age',
    'age_units',
    'sex',
    'life_stage',
    'mouse_life_stage',
    'mouse_strain',
    'ethnicity',
    'health_status',
]

def boildown_individual(biosample_object):
    '''Simplify the individual of the first biosource of a Biosample.

    The individual is taken from store. Keys in individual_simple_values
    are copied as they are, 'organism' is taken from store and simplified
    with boildown_organism. Other keys and empty values are ignored.
    '''
    first_biosource = biosample_object['biosource'][0]
    individual = get_item_in_store(first_biosource['individual'])
    simplified = {}
    for field, raw in individual.items():
        if field in individual_simple_values:
            result = raw
        elif field == 'organism':
            result = boildown_organism(get_item_in_store(raw))
        else:
            continue
        if result:
            simplified = add_to_output_dict(field, result, simplified)
    return simplified


# Maps a Biosource key to the function that simplifies its value (see minimize_biosource).
biosource_function_dispatch = {
    'biosource_vendor': boildown_title,
    'cell_line': boildown_title,
    # the next two are called by minimize_biosource with the whole Biosample, not with the value of the key
    'individual': boildown_individual,
    'SOP_cell_line': boildown_cell_culture_details,  # also retrieved from Biosample if cell_culture_details exists
}

def minimize_biosource(biosample_object):
    ''' Simplify the biosource of a Biosample.
    With more than one Biosource, only the accessions are returned,
    as a comma-separated string. With a single Biosource (most cases),
    a dictionary with the keys declared in biosource_function_dispatch
    is returned; other keys and empty values are ignored.'''
    sources = biosample_object['biosource']
    if len(sources) > 1:
        return ', '.join(source['accession'] for source in sources)

    # keys whose function needs the entire Biosample, not only the value
    whole_object_keys = ('individual', 'SOP_cell_line')
    simplified = {}
    for field, raw in sources[0].items():
        if field in whole_object_keys:
            result = biosource_function_dispatch[field](biosample_object)
        elif field in biosource_function_dispatch:
            result = biosource_function_dispatch[field](raw)
        else:
            continue
        if result:
            simplified = add_to_output_dict(field, result, simplified)
    return simplified


# Biosample keys whose value is copied to the output without any processing
biosample_simple_values = [
    'accession',
    'biosource_summary',
    'biosample_type',
    'description',
    'modifications_summary',
    'treatments_summary',
]

# Maps a Biosample key to the function that simplifies its value (see simplify_biosample).
# Commented-out entries are keys that are currently not exported.
biosample_function_dispatch = {
    '@id': atid2url,
    'tissue_organ_info': boildown_tissue_organ_info,  # OK also with multiple biosources
    # called by simplify_biosample with the whole Biosample, not with the value of the key
    'biosource': minimize_biosource,
    'biosample_protocols': boildown_biosample_protocols,
    # called by simplify_biosample with the whole Biosample, not with the value of the key
    'cell_culture_details': boildown_cell_culture_details,  # returns cell_culture_protocols
#     'last_modified': boildown_date_modified,
#     'documents': boildown_protocols,
#     'external_references': boildown_external_references,  # dbxrefs
}


def simplify_biosample(biosample_object):
    '''Build the simplified dictionary of a Biosample.

    Only keys declared in biosample_simple_values or
    biosample_function_dispatch are exported; any other key is ignored,
    and so are empty values.
    '''
    # keys whose function needs the entire Biosample, not only the value
    whole_object_keys = ('cell_culture_details', 'biosource')
    simplified = {}
    for field, raw in biosample_object.items():
        if field in biosample_simple_values:
            result = raw
        elif field in whole_object_keys:
            result = biosample_function_dispatch[field](biosample_object)
        elif field in biosample_function_dispatch:
            result = biosample_function_dispatch[field](raw)
        else:
            continue  # key not declared: ignore it

        if result:
            simplified = add_to_output_dict(field, result, simplified)
    return simplified

### Part 1 - List all sets to export

In [None]:
# List ExpSets (or a search query) to export for GEO submission.
# sets_list takes precedence: search_url is used only when sets_list is empty.

sets_list = ['4DNESML2L8RP'] # ['4DNESRJ8KV4Q', '4DNESNMAAN97']
search_url = '' #'/search/?type=ExperimentSetReplicate&condition=Enzyme%20DpnII%20-%20in%20situ%20Hi-C%20on%20cells%20cultured%20prior%20to%204DN%20SOP'

time1 = time.time()
if sets_list:
    sets_to_export = [ff_utils.get_metadata(set_id, my_auth)['uuid'] for set_id in sets_list]
elif search_url:
    sets_to_export = [i['uuid'] for i in ff_utils.search_metadata(search_url, my_auth)]
else:
    # fail with a clear message, instead of a NameError on sets_to_export below
    raise ValueError('Nothing to export: fill in sets_list or search_url')

# Collect the sets and all the items linked to them
result_expand_es, uuids = ff_utils.expand_es_metadata(
    sets_to_export,
    my_auth,
    store_frame='embedded',
    add_pc_wfr=True,
    ignore_field=['experiment_relation', 'biosample_relation', 'references', 'experiment_type'],
)

experiment_sets = result_expand_es.get('experiment_set_replicate', []) + result_expand_es.get('experiment_set', [])
print(len(experiment_sets), 'exp sets collected')
print(len(uuids), 'items collected')
time2 = time.time()
print(round((time2-time1), 1), 'sec for collection')

# get date of metadata export (to be reported in the external_submissions)
date_exported = datetime.now(timezone.utc).isoformat()

# Reorder result in store, using @id as keys
store = {}  # key is @id, value is the object with frame=embedded
for item_type, items_list in result_expand_es.items():
    if item_type.startswith('experiment'):
        # we want to use get_metadata for these, due to the ignore_field in expand_es_metadata
        continue
    for item in items_list:
        store[item['@id']] = item

### Part 2 - Generate simplified dictionaries
Export simplified dictionaries for each ExpSet, Experiment, Biosample

In [None]:
exported_items = []  # list of exported simplified dictionaries (ExpSet, Exp, Biosample)
id_exported_items = []  # list of @id of items that need date_exported (ExpSet, Exp, Biosample, FileFastq)

# Experiment Sets
expsets_to_export = [es['@id'] for es in experiment_sets]
experiments_to_export = []
for expset in expsets_to_export:
    es_dictionary, file_ids, exp_ids = simplify_expset(get_item_in_store(expset))
    exported_items.append(es_dictionary)
    id_exported_items.append(expset)
    id_exported_items.extend(file_ids)
    experiments_to_export.extend(exp_ids)

# Experiments
# remove duplicates (an Experiment can be in more than one set) with dict.fromkeys,
# which keeps the original order: the export order is the same at every run
experiments_to_export = list(dict.fromkeys(experiments_to_export))
biosamples_to_export = []
for exp in experiments_to_export:
    ex_dictionary, file_ids, biosample_id = simplify_experiment(get_item_in_store(exp))
    exported_items.append(ex_dictionary)
    id_exported_items.append(exp)
    id_exported_items.extend(file_ids)
    if biosample_id:
        biosamples_to_export.append(biosample_id)
    else:
        # simplify_experiment returns '' if the Experiment has no biosample
        print('WARNING: no biosample found for', exp)

# Biosamples
biosamples_to_export = list(dict.fromkeys(biosamples_to_export))
for bs in biosamples_to_export:
    bs_dictionary = simplify_biosample(get_item_in_store(bs))
    exported_items.append(bs_dictionary)
    id_exported_items.append(bs)

# each item needs to be patched only once with the date of export
id_exported_items = list(dict.fromkeys(id_exported_items))

print('Exp Sets exported:\n', '\n'.join(expsets_to_export), sep='')
print('\nExperiments exported:\n', '\n'.join(experiments_to_export), sep='')
print('\nBiosamples exported:\n', '\n'.join(biosamples_to_export), sep='')

### Part 3 - Save dictionaries as json files

In [None]:
# set directory for output files
directory = Path("~/Documents/GEO/tsa_test").expanduser()
overwrite = False  # overwrites existing files if set to True

# create the output directory if missing, otherwise open() fails on the first file
directory.mkdir(parents=True, exist_ok=True)

# one json file per exported item, named after its accession
for item in exported_items:
    file_name = item['accession'] + '.json'
    full_path = directory / file_name
    if full_path.exists() and not overwrite:
        print(file_name, 'already exists in the folder', directory)
    else:
        with open(full_path, 'w', encoding='utf-8') as fp:
            json.dump(item, fp, indent=4, ensure_ascii=False)
        print(file_name, 'file saved')

### Part 4 - Patch export date for each item in external_submission

In [None]:
# dry run by default: the ids are only printed, and patched only if action is True
action = False

patch_body = {'external_submission': {'date_exported': date_exported, 'database': DB}}
count = 0  # number of items patched successfully
if action:
    for item_id in id_exported_items:
        res = ff_utils.patch_metadata(patch_body, item_id, key=my_auth)
        if res['status'] != 'success':
            # stop at the first failure and show the response
            print(res)
            break
        count += 1
else:
    for item_id in id_exported_items:
        print(item_id)
print('patched {} items'.format(count))