### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS
* If you would like to contribute to this notebook, make your changes to the copy in the useful_notebooks folder and run "Restart and Clear Output" before committing.

#### Unexpected Expansion Cases
###### Experiment set fetches unrelated labs and awards
Sometimes a fetch will pick up unrelated labs and awards. This happens because a lab can have multiple awards: each of these awards is visited, the awards have users linked to them, and those users in turn have labs, which are then collected as well. Hopefully all of these are already released/current.

###### Experiment set fetches unrelated biosample/experiment/set
So far this has been caused by the experiment and biosample relation fields, or by the references field that links to a publication. If you ignore these fields, it should be fine.
`ignore_field = ['experiment_relation', 'biosample_relation', 'references']`

## Initial Setup

In [None]:
from dcicutils import ff_utils
from functions.notebook_functions import *
from functions.cleanup import *
import time

# Numeric rank for every known item status, used to order/compare statuses
STATUS_LEVEL = {
    # standard_status
    'released': 10,
    'current': 10,
    'restricted': 10,
    'released to project': 9,
    'pre-release': 8,
    'planned': 6,
    'submission in progress': 6,
    'in review by lab': 4,
    'revoked': 0,
    'archived': 0,
    'deleted': 0,
    'obsolete': 0,
    'replaced': 0,
    'archived to project': 0,
    # additional file statuses
    'to be uploaded by workflow': 4,
    'uploading': 4,
    'uploaded': 4,
    'upload failed': 4,
    'draft': 4,
    'released to lab': 4,
}


# Credentials read from the local key file
my_auth = get_key('andyprod', keyfile='~/keypairs.json')

# Target status for this release run
change_status = 'released'
# Rank of the target status; a status missing from STATUS_LEVEL counts as 1
change_level = STATUS_LEVEL[change_status] if change_status in STATUS_LEVEL else 1
# No extra property changes unless a later cell fills this in
additional_changes = None


## helper function to deal with extra_files of file items
def check_extra_files(extra_files):
    """Build the patch payload for the extra_files of a file item.

    Every extra file is copied (restricted to the patchable fields) into the
    returned list. Extra files whose status level is at least 4 and below the
    target level get their status set to ``change_status``; all others are
    kept as they are.

    The list always contains ALL extra files: it is used to patch the whole
    ``extra_files`` property, so leaving an entry out of it would remove that
    extra file from the item.

    Uses the notebook globals STATUS_LEVEL, change_level and change_status.

    Args:
        extra_files: list of extra_file dicts from an embedded file item
            (None or an empty list is accepted).

    Returns:
        (chgcnt, extras): the number of extra files whose status is changed,
        and the full list of extra_file dicts to patch.
    """
    ef_fields = ['href', 'md5sum', 'file_size', 'use_for', 'status']
    extras = []
    chgcnt = 0
    for ef in extra_files or []:
        extra = {pname: ef[pname] for pname in ef_fields if pname in ef}

        # file_format is an embedded object here; the patch needs its uuid
        file_format = ef.get('file_format')
        if isinstance(file_format, dict):
            file_format = file_format.get('uuid')
        if file_format is not None:
            extra['file_format'] = file_format

        # a missing or unrecognised status yields None and is left untouched
        estatus_level = STATUS_LEVEL.get(ef.get('status'))
        if estatus_level is not None and 4 <= estatus_level < change_level:
            extra['status'] = change_status
            chgcnt += 1
        extras.append(extra)

    return (chgcnt, extras)  # count of extras being changed, along with the list of all extras
        

### (optional) Do you want to patch other properties, besides status?
Examples: add/remove viewing groups, tags, contributing labs, patch lab/award

In [34]:
# Use validate_change() to add each change to the list additional_changes
# Leave the list empty if only the status needs to change: the later cells
# only call change_additional_fields() when this list is non-empty.
# NOTE(review): validate_change is not defined in this notebook; presumably it
# comes from one of the star-imported `functions` modules - confirm.
# NOTE(review): level_min / level_max presumably refer to the values in
# STATUS_LEVEL, and `types` to the item type keys of `store` - confirm against
# validate_change before relying on this.
# To use one of the templates below, uncomment both the validate_change(...)
# call and the matching additional_changes.append(...) line.
additional_changes = []

# add_viewing_group = validate_change(
#     key='viewable_by', value=['IWG'], verb='add', level_min=6)
# additional_changes.append(add_viewing_group)

# remove_viewing_group = validate_change(
#     key='viewable_by', value=['IWG'], verb='remove', level_min=9, level_max=10)
# additional_changes.append(remove_viewing_group)

# tag_experiments = validate_change(
#     key='tags', value=['my_tag'], verb='add',  types=['experiment_hi_c'], level_max=10)
# additional_changes.append(tag_experiments)

# add_contributing_lab = validate_change(
#     key='contributing_labs', value=['828cd4fe-ebb0-4b36-a94a-d2e3a36cc989'], verb='add',
#     types=['file_fastq', 'file_processed'])
# additional_changes.append(add_contributing_lab)

# patch_award = validate_change(
#     key='award', value='71171a4e-dca1-44cb-8375-fafd896c6923', verb='patch',
#     level_min=4, level_max=4)
# additional_changes.append(patch_award)

## Retrieve the ExperimentSetReplicate Items to release along with all their linked Items

In [None]:
# Identifiers of the sets to release, e.g. ['4DNACCCC', '4DNACCCCC']
sets_in_scope = ['4DNES19URNAH']

# Alternative: leave sets_in_scope empty and give a search query instead, e.g.
#search_url  = '/search/?award.project=4DN&experiments_in_set.experiment_type=dilution+Hi-C&experimentset_type=replicate&lab.display_title=Bing+Ren%2C+UCSD&status=pre-release&type=ExperimentSetReplicate'
# It must always be defined, otherwise the elif below raises NameError.
search_url = ''

time1 = time.time()
if sets_in_scope:
    set_to_release = [ff_utils.get_metadata(i, my_auth)['uuid'] for i in sets_in_scope]
elif search_url:
    set_to_release = [i['uuid'] for i in ff_utils.search_metadata(search_url, my_auth)]
else:
    raise ValueError('Nothing to release: fill in either sets_in_scope or search_url')

if not set_to_release:
    raise ValueError('No items found for the given sets_in_scope / search_url')

# Reset first so that a failed expansion cannot leave results of a previous run behind
store = {}
item_uuids = []
store, uuids = ff_utils.expand_es_metadata(
    set_to_release, my_auth, store_frame='embedded', add_pc_wfr=True,
    ignore_field=['experiment_relation', 'biosample_relation', 'references', 'experiment_type'])

print(len(store.get('experiment_set_replicate', [])), 'exp sets for status change')
print(len(uuids), 'items collected')
time2 = time.time()
print(round((time2-time1), 1), 'sec for collection')

## Do Some QC checks on several of the expected and most important item types

### NOTE: if check_wfrs is True, QC will be done on workflows - in some cases you may not care to check this when pre-releasing, but generally it's a good idea to leave it True
### When delete_problematic is set to True then problematic workflow_runs will have their status changed to 'deleted' 

- this cleans up failed duplicate runs and errored runs

In [None]:
# TODO
# Check audits

# Please Modify the following accordingly
# do you want to check for duplicate/problematic runs on files?
# it will take some time
check_wfrs = True
# if any are found do you want to remove them?
delete_problematic = False


def _format_name(a_file):
    """Return the short file format name (e.g. 'fastq') of a file item.

    The value is normalised so that both the bare name ('fastq') and the
    path form ('/file-formats/fastq/') give 'fastq'. The status cells of this
    notebook compare the same field against bare names, so checks written
    against only the path form may never match.
    """
    file_format = a_file.get('file_format') or ''
    if isinstance(file_format, dict):
        file_format = file_format.get('file_format') or ''
    return file_format.strip('/').split('/')[-1]


# create stash of wfrs to pass to delete_wfrs
stash = store.get('workflow_run_sbg', []) + store.get('workflow_run_awsem', [])


# check expsets
print('EXPSET CHECK')
exp_sets = store.get('experiment_set_replicate', [])
for a_set in exp_sets:
    if not a_set.get('completed_processes'):
        # description is optional: do not crash the whole QC on a set without one
        print(a_set['accession'], 'missing processing tag', (a_set.get('description') or '')[:50])

# check exps
print('\nEXP CHECK')
# check for experiment numbers
exp_names = [i for i in store if i.startswith('experiment') and not i.startswith('experiment_set')]
all_exps_on_sets = [a for i in exp_sets for a in i.get('experiments_in_set', [])]
all_exps = [a['uuid'] for i in exp_names for a in store[i]]
if len(all_exps_on_sets) != len(all_exps):
    print('Number of experiments is not same as experiments associated with sets')
    print('# of exps: {}. # of exps on sets: {}'.format(len(all_exps), len(all_exps_on_sets)))

hela_exps = []  # a list of exp['uuid'] with biosource from HeLa individual
hela_exps_unsure = []
for exp_type in exp_names:
    for exp in store[exp_type]:
        biosources = exp['biosample']['biosource']
        biosource_is_hela = sum(
            1 for bs in biosources
            if bs.get('individual') and bs['individual'].get('display_title') == '4DNINEL8T2GK')  # the HeLa individual
        if biosource_is_hela == len(biosources):  # all biosources are HeLa: exp is HeLa
            hela_exps.append(exp['uuid'])
        elif biosource_is_hela > 0:  # some but not all biosources are HeLa: unsure
            hela_exps_unsure.append(exp['uuid'])
if hela_exps_unsure and change_level >= 9:
    release_hela = input('Experiments with multiple Biosources found, some of which are HeLa. ' +
                         'Sequence files associated with these experiments will be restricted. ' +
                         'Do you want to release them, instead? (yes/no)')
    if release_hela == 'no':
        hela_exps.extend(hela_exps_unsure)
    elif release_hela != 'yes':
        raise ValueError('Invalid response')

print('\nFILE FASTQ CHECK')
# .get(): a release without fastq files is legitimate and must not raise KeyError
for a_file in store.get('file_fastq', []):
    if not a_file.get('quality_metric'):
        print(a_file['accession'], 'missing fastqc')
    if not a_file.get('content_md5sum'):
        print(a_file['accession'], 'missing content md5 sum')
    if not a_file.get('md5sum'):
        print(a_file['accession'], 'md5 was not calculated during upload, missing md5sum')
    if check_wfrs:
        delete_wfrs(a_file, my_auth, delete=delete_problematic, stash=stash)

# check processed files
print('\nFILE PROCESSED CHECK')
for a_file in store.get('file_processed', []):
    format_name = _format_name(a_file)
    qc_info = a_file.get('quality_metric')
    # track_and_facet_info may be absent; treat that as "no experiment type"
    assay = (a_file.get('track_and_facet_info') or {}).get('experiment_type')

    # in select cases fastq can be a processed file
    if format_name == 'fastq' and not qc_info:
        print(a_file['accession'], 'missing Fastqc for FastQ processed file')
    elif format_name == 'pairs' and not qc_info:
        print(a_file['accession'], 'missing Pairsqc')
    elif format_name == 'bam' and not qc_info:
        print(a_file['accession'], 'missing BAMqc')
    elif format_name == 'bed' and assay in ['ChIP-seq', 'ATAC-seq', 'CUT&RUN']:
        if not qc_info:
            print(a_file['accession'], 'missing QualityMetric')
    elif format_name == 'tsv' and assay == 'RNA-seq':
        if not qc_info:
            print(a_file['accession'], 'missing RNAseq QualityMetric info')
        elif len(qc_info) < 2:
            print(a_file['accession'], 'missing some RNAseq QualityMetric info')

    if not a_file.get('source_experiments'):
        print(a_file['accession'], 'user submitted or produced by sbg runs')
    if check_wfrs:
        delete_wfrs(a_file, my_auth, delete=delete_problematic, stash=stash)

# check wfrs
print('\nWFR CHECK')
# default to [] so a store without awsem runs does not raise TypeError
awsem_runs = store.get('workflow_run_awsem', [])
# list all wf types found (sorted for a reproducible report)
print('  Following run types are found:')
for wf in sorted({i['display_title'].split(' run')[0] for i in awsem_runs}):
    print('    ' + wf)
for wfr in awsem_runs:
    if wfr['run_status'] != 'complete':
        print('problematic wfr', wfr['uuid'], wfr['run_status'])

# check for weird status
print('\nREPORT NUMBERS AND CHECK STATUS')
for i in store:
    print(i, len(store[i]))
    weird = [[i, x['uuid'], x['status']] for x in store[i] if STATUS_LEVEL.get(x['status']) == 0]
    if weird:
        for case in weird:
            print(case)
        print()

## This cell checks each item against the requested status change and lets you know what will happen

#### print_each = True will provide info on every Item that will be changed - it is useful to check this output to get an idea of what will happen in the next cell but if you are confident and want shorter output you can set this to False

In [None]:
# Check status
# Dry run: reports, per item type, what the patching cell would do. Nothing is patched here.
print_each = True

for a_type in store:
    total = len(store[a_type])
    change = 0
    matching = 0
    unusual = 0
    skipping = 0
    for raw_data in store[a_type]:
        item_status = raw_data.get('status')
        item_level = STATUS_LEVEL.get(item_status)
        patch_data = {}
        will_change = False

        # A status missing from STATUS_LEVEL cannot be ordered: report it and move on
        if item_level is None:
            unusual += 1
            if print_each:
                print(('SKIP UNKNOWN STATUS   ', a_type, raw_data['uuid'], item_status))
            continue

        # msg is always a tuple so that extra details can be appended below
        if item_level > change_level:
            skipping += 1
            msg = ('{} {} ITEM HAS STATUS {} HIGHER THAN {} - SKIPPING'.format(a_type, raw_data['uuid'], raw_data['status'], change_status),)
        elif item_level == change_level:
            matching += 1
            msg = ('MATCHING ACCESS STATUS', a_type, raw_data['uuid'], raw_data['status'])
        elif item_level == 0:
            unusual += 1
            msg = ('SKIP UNUSUAL STATUS   ', a_type, raw_data['uuid'], raw_data['status'])
        else:
            change += 1
            will_change = True
            msg = ('        CHANGE        ', a_type, raw_data['uuid'], raw_data['status'])

            # Special case: HeLa sequences (FASTQ and BAM files) are not released but restricted
            if change_level >= 9 and hela_exps and a_type in ['file_fastq', 'file_processed'] and raw_data['file_format']['file_format'] in ['fastq', 'bam']:
                is_hela = file_in_exp(raw_data, hela_exps)
                if is_hela:
                    msg = ('       RESTRICT       ', a_type, raw_data['uuid'], raw_data['status'])
                elif is_hela is None:
                    # the patching cell skips such an item entirely, so do the same here
                    change -= 1
                    skipping += 1
                    print('\nERROR! SKIPPING {} {} Impossible to determine whether is HeLa'.format(a_type, raw_data['uuid']))
                    continue

        if additional_changes:
            patch_data = change_additional_fields(patch_data, raw_data, a_type, item_level, change_level, additional_changes)
            if patch_data:
                # values may be lists (e.g. viewable_by), so format instead of concatenating
                msg += tuple('{}: {}'.format(k, v) for k, v in patch_data.items())

        if print_each:
            # extra_files are only patched together with a status change of the file itself
            if will_change and a_type.startswith('file_') and 'extra_files' in raw_data:
                # report how many extra_files will be patched
                cnt, extras = check_extra_files(raw_data.get('extra_files'))
                msg += ('\n\textra_file patch - update {} of {}'.format(cnt, len(extras)),)
            print(msg)

    print('{:<25} Out of {t}, {r} skipped, {m} matching, {u} unusual, and {c} needs change'.format(a_type, t=total, r=skipping, m=matching, u=unusual, c=change))
    
       

## And finally do the patching if all looks good

- don't forget wrangler review - you'll be asked
- set action = True to effect the patch otherwise it will just report what will happen
- generally print_each=False is fine in this cell but can be changed if you like

In [None]:
# If you want to patch the status, change action to True
# With action = False this cell only reports what would be patched.
action = False
print_each = False

reviewed = input('Did another wrangler review this release? (y/n):')
if reviewed != 'y':
    raise KeyError('A key step is missing!')

for a_type in store:
    total = len(store[a_type])
    change = 0
    matching = 0
    unusual = 0
    skipping = 0
    for raw_data in store[a_type]:
        item_status = raw_data.get('status')
        item_level = STATUS_LEVEL.get(item_status)
        patch_data = {}

        # A status missing from STATUS_LEVEL cannot be ordered: never patch such an item
        if item_level is None:
            unusual += 1
            if print_each:
                print(('SKIP UNKNOWN STATUS   ', a_type, raw_data['uuid'], item_status))
            continue

        # Status change (msg is always a tuple so that details can be appended below)
        if item_level > change_level:
            skipping += 1
            msg = ('{} {} ITEM HAS STATUS {} HIGHER THAN {} - SKIPPING'.format(a_type, raw_data['uuid'], raw_data['status'], change_status),)
        elif item_level == change_level:
            matching += 1
            msg = ('MATCHING ACCESS STATUS', a_type, raw_data['uuid'], raw_data['status'])
        elif item_level == 0:
            unusual += 1
            msg = ('SKIP UNUSUAL STATUS   ', a_type, raw_data['uuid'], raw_data['status'])
        else:
            # Normal case
            change += 1
            msg = ('        CHANGE        ', a_type, raw_data['uuid'], raw_data['status'])
            patch_data['status'] = change_status

            # Special case: publication
            if change_status == 'released' and a_type in ['publication']:
                patch_data['status'] = 'current'

            # Special case: HeLa sequences (FASTQ and BAM files) are not released but restricted
            if change_level >= 9 and hela_exps and a_type in ['file_fastq', 'file_processed'] and raw_data['file_format']['file_format'] in ['fastq', 'bam']:
                is_hela = file_in_exp(raw_data, hela_exps)
                if is_hela:
                    msg = ('       RESTRICT       ', a_type, raw_data['uuid'], raw_data['status'])
                    patch_data['status'] = 'restricted'
                elif is_hela is None:
                    # cannot decide between released and restricted: leave the item untouched
                    change -= 1
                    skipping += 1
                    print('\nERROR! SKIPPING {} {} Impossible to determine whether is HeLa'.format(a_type, raw_data['uuid']))
                    continue

            # Special handling of extra_files
            if a_type.startswith('file_') and 'extra_files' in raw_data:
                # only include extra_files in the patch when at least one of them changes
                cnt, extras = check_extra_files(raw_data.get('extra_files'))
                if cnt:
                    patch_data['extra_files'] = extras

        # Additional changes
        if additional_changes:
            patch_data = change_additional_fields(patch_data, raw_data, a_type, item_level, change_level, additional_changes)
            # values may be lists (e.g. viewable_by), so format instead of concatenating
            msg += tuple('{}: {}'.format(k, v) for k, v in patch_data.items()
                         if k not in ['status', 'extra_files'])

        # Do the patch
        if patch_data and action:
            ff_utils.patch_metadata(patch_data, obj_id=raw_data['uuid'], key=my_auth)

        if print_each:
            print(msg)
    print('{:<25} Out of {t}, {r} skipped, {m} matching, {u} unusual, and {c} UPDATED with status'.format(a_type, t=total, r=skipping, m=matching, u=unusual, c=change))