In [1]:
import encode_utils.stream as es
import encode_utils.grab as grab

In [559]:
def find_pair(experiments, filter_dict):
    filters = []
    for k, v in filter_dict.items():
        filters.append(es.filter_field_by_comparison(field=k, value=v[0], comparison=v[1]))
    yield from es.match(experiments, *filters)

def has_pair(parsed_data, get_data=False):
    for d in parsed_data:
        matches = list(find_pair(parsed_data, {'biosample_term_name': (d['biosample_term_name'], 'equals'),
                                               'biosample_term_id': (d['biosample_term_id'], 'equals'),
                                               'biosample_type': (d['biosample_type'], 'equals'),
                                               'biosample_summary': (d['biosample_summary'], 'equals'),
                                               'target': (d['target'], 'equals'),
                                               'lab': (d['lab'], 'equals'),
                                               'assay_title': (d['assay_title'], 'equals')}))
        if len(matches) == 2:
            has_pair = True
        else:
            has_pair = False
        if get_data:
            if has_pair:
                yield matches
        else:
            yield d['accession'], has_pair
def get_unique_pairs(pairs):           
    unique_set = set()
    for pair in pairs:
        pair_tup_one = (pair[0]['accession'], pair[1]['accession'])
        pair_tup_two = (pair[1]['accession'], pair[0]['accession'])
        if pair_tup_one in unique_set or pair_tup_two in unique_set:
            continue
        else:
            unique_set.add(pair_tup_one)
    return unique_set
    
def parse_experiments(experiments):
    for e in experiments:
        yield {'accession': e.get('accession', e.get('uuid')),
               'biosample_summary': e.get('biosample_summary', ''),
               'biosample_term_name': e.get('biosample_term_name', ''),
               'biosample_term_id': e.get('biosample_term_id', ''), 
               'biosample_type': e.get('biosample_type', ''),
               'target': e.get('target', {}).get('name', ''),
               'lab': e.get('lab', {}).get('name', ''),
               'assay_title': e.get('assay_title', ''),
               'alternate_accessions': e.get('alternate_accessions', [])}
        
def get_pair_data(pair):
    data = grab.quick_grab_data(['{}/{}?{}&frame=embedded'.format(grab.base_url,
                                                                  p,
                                                                  grab.json_only) for p in pair])
    return data

def get_merge_info(e):
    return {'reps': [(r.get('uuid'),
                      r.get('experiment'),
                      r.get('biological_replicate_number'),
                      r.get('technical_replicate_number')) for r in e.get('replicates', {})],
            'original_files': e.get('original_files', {}),
            'dbxrefs': e.get('dbxrefs', []),
            'aliases': e.get('aliases', []),
            'documents': e.get('documents', []),
            'alternate_accessions': e.get('alternate_accessions', [])}

def determine_base_exp(pair_a, pair_b):
    """
    Return True if pair_a should be base experiment else False.
    """
    bio_reps_a = [b[2] for b in get_merge_info(pair_a)['reps']]
    bio_reps_b = [b[2] for b in get_merge_info(pair_b)['reps']]
    if min(bio_reps_a) == min(bio_reps_b):
        # Take one with more tech_reps if bio_rep same.
        if len(bio_reps_b) > len(bio_reps_a):
            return False
        return True
    # Take one with lower bio_rep_number in general.
    if min(bio_reps_a) <= min(bio_reps_b):
        return True
    return False

def rep_patch(base_exp, merge_exp):
    updated_replicates = []
    base_exp_id = base_exp['@id']
    next_bio_rep = max([r[2] for r in get_merge_info(base_exp)['reps']]) + 1
    merge_exp_reps_sorted = sorted(get_merge_info(merge_exp)['reps'], key=lambda x: x[2])
    for j, bio_rep in enumerate(sorted(set([r[2] for r in merge_exp_reps_sorted]))):
        for i, y in enumerate(sorted([r for r in merge_exp_reps_sorted if r[2] == bio_rep], key=lambda x: x[3])):
            d = ({'uuid': y[0],
                 'experiment': base_exp_id,
                 'biological_replicate_number': next_bio_rep + j,
                 'technical_replicate_number': i + 1},
                 'REPLICATE')
            updated_replicates.append(d)
    return updated_replicates

def original_files_patch(base_exp, merg_exp):
    updated_original_files = []
    base_exp_id = base_exp['@id']
    for file in get_merge_info(merg_exp)['original_files']:
        d = ({'accession': file,
             'dataset': base_exp_id},
             'ORIGINAL FILE')
        updated_original_files.append(d)
    return updated_original_files

def parse_pair(pair):
    if determine_base_exp(pair[0], pair[1]):
        base_exp, merge_exp = pair[0], pair[1]
    else:
        base_exp, merge_exp = pair[1], pair[0]
    return base_exp, merge_exp

def values_from_both(field):
    return list(set([*get_merge_info(base_exp)[field],
                     *get_merge_info(merge_exp)[field]]))

def patch_item(url, data, auth, show_output=False):
    r = requests.patch(url,
                       auth=grab.auth,
                       json=data)
    print('PATCHING: {}'.format(item), data)
    #print(json.dumps(data, indent=4, sort_keys=True), '\n')
    if show_output or r.status_code != 200:
        print(json.dumps(r.json(), indent=4, sort_keys=True))
    return r.status_code

def parse_patch_set(patch):
    data = patch[0].copy()
    item = data.get('accession', data.get('uuid'))
    [data.pop(field, None) for field in ['accession', 'uuid']]
    return item, data

def make_patch(patch_set, base_url):
    for p in patch_set:
        item, data = parse_patch_set(p)
        url = grab.urljoin(base_url, item)
        patch_item(url, data, grab.auth) 

In [556]:
# Step 0: Get embedded data.
url = 'https://www.encodeproject.org/search/?type=Experiment'\
      '&replicates.library.biosample.donor.uuid=53b21c14-4f83-40c1-bc07-8621fdc70ce2'\
      '&replicates.library.biosample.donor.uuid=6007378b-528a-4dbf-8086-bc3502abc6f5'\
      '&limit=all&status!=revoked&frame=embedded&format=json&limit=all'
# url = 'https://www.encodeproject.org/search/?type=Experiment'\
#       '&replicates.library.biosample.donor.accession=ENCDO981EWY'\
#       '&replicates.library.biosample.donor.accession=ENCDO030VWZ'\
#       '&frame=embedded&format=json&limit=all'
assert 'embedded' in url
data = grab.quick_grab_data([url])
len(data)

24

In [557]:
# Step I: Match on
#     - biosample_term_name
#     - biosample_term_id
#     - biosample_type
#     - target.name
#     - lab.name

parsed_data = list(parse_experiments(data))
pairs = list(has_pair(parsed_data, get_data=True))
unique_pairs = get_unique_pairs(pairs)

In [558]:
len(unique_pairs)

10

In [560]:
list(has_pair(parsed_data))

[('ENCSR534RNS', True),
 ('ENCSR584XHX', False),
 ('ENCSR511NSE', True),
 ('ENCSR911WPA', True),
 ('ENCSR575ICR', True),
 ('ENCSR690UYB', True),
 ('ENCSR684NQL', True),
 ('ENCSR064VLH', True),
 ('ENCSR181NUJ', False),
 ('ENCSR850JSG', True),
 ('ENCSR528QDT', True),
 ('ENCSR764XUL', True),
 ('ENCSR517EEQ', True),
 ('ENCSR717BNA', True),
 ('ENCSR696SOJ', True),
 ('ENCSR969JYY', True),
 ('ENCSR736YYJ', True),
 ('ENCSR800JXT', True),
 ('ENCSR820ABR', True),
 ('ENCSR676CFI', True),
 ('ENCSR470JHE', False),
 ('ENCSR494OXB', True),
 ('ENCSR359JHU', True),
 ('ENCSR374BVM', False)]

In [561]:
for d in parsed_data:
    matches = list(find_pair(parsed_data, {'biosample_term_name': (d['biosample_term_name'], 'equals'),
                                           'biosample_term_id': (d['biosample_term_id'], 'equals'),
                                           'biosample_type': (d['biosample_type'], 'equals'),
                                           'biosample_summary': (d['biosample_summary'], 'equals'),
                                           'target': (d['target'], 'equals'),
                                           'lab': (d['lab'], 'equals'),
                                           'assay_title': (d['assay_title'], 'equals')}))
    if len(matches) != 2:
        print('\n\n')
        print('SEARCH')
        for k, v in sorted(d.items()):
            print(k, v)
        print()
        print('MATCHES:', len(matches))
        for m in matches:
            for k, v in sorted(m.items()):
                print(k, v)
            print()




SEARCH
accession ENCSR584XHX
alternate_accessions []
assay_title microRNA-seq
biosample_summary cerebellum male embryo (20 weeks)
biosample_term_id UBERON:0002037
biosample_term_name cerebellum
biosample_type tissue
lab joseph-costello
target 

MATCHES: 1
accession ENCSR584XHX
alternate_accessions []
assay_title microRNA-seq
biosample_summary cerebellum male embryo (20 weeks)
biosample_term_id UBERON:0002037
biosample_term_name cerebellum
biosample_type tissue
lab joseph-costello
target 




SEARCH
accession ENCSR181NUJ
alternate_accessions []
assay_title MRE-seq
biosample_summary germinal matrix male embryo (20 weeks)
biosample_term_id NTR:0001407
biosample_term_name germinal matrix
biosample_type tissue
lab joseph-costello
target 

MATCHES: 1
accession ENCSR181NUJ
alternate_accessions []
assay_title MRE-seq
biosample_summary germinal matrix male embryo (20 weeks)
biosample_term_id NTR:0001407
biosample_term_name germinal matrix
biosample_type tissue
lab joseph-costello
target 





In [554]:
base_url = 'https://encd-3608-remove-proposed-9653f4422-keenan.demo.encodedcc.org'
update = False

# Must update:
# rep number in EXP_B replicates
# EXP_B.status to replaced
# EXP_A.alternate_accession.append(EXP_B)
# original_files.dataset in EXP_B to point to EXP_A
# dbxrefs in EXP_A to be list(set(EXP_A.dbxrefs, EXP_B.dbxrefs))
# aliases in EXP_A to be list(set(EXP_A.aliases, EXP_B.aliases))
# remove aliases from EXP_B.
# check for unique documents in EXP_B
# add submitter_comment to EXP_A explaining the merge

for z, pair in enumerate(unique_pairs):
    # {'accession': 'UUID', 'field': 'value'} to update.
    patch_set = []
    a, b = pair[0], pair[1]
    pair = get_pair_data([a, b])
    base_exp, merge_exp = parse_pair(pair)
    # Calculate new replicate numbers and point to base experiment.
    patch_set.extend(rep_patch(base_exp, merge_exp))
    # Replace EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'status': 'replaced'},
                      'REPLACE'))
    # Add EXP_B to EXP_A alternate_accessions.
    patch_set.append(({'accession': base_exp['accession'],
                       'alternate_accessions': [a for a in
                                                grab.chain(get_merge_info(base_exp)['alternate_accessions'],
                                                      [merge_exp['accession']])]},
                      'ALTERNATE ACCESSIONS'))
    # Update dataset in orignal files of EXP_B.
    patch_set.extend(original_files_patch(base_exp, merge_exp))
    # Update dbxrefs in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'dbxrefs': values_from_both('dbxrefs')},
                       'UPDATE DBXREFS'))
    # Remove aliases from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'aliases': []},
                      'CLEAR ALIASES'))
    # Update aliases in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'aliases': values_from_both('aliases')},
                      'UPDATE ALIASES'))
    # Update documents in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'documents': values_from_both('documents')},
                      'UPDATE DOCUMENTS'))
    comment = 'Experiment {} merged into {}.'.format(merge_exp['uuid'],
                                                     base_exp['accession'])
    patch_set.append(({'accession': base_exp['accession'],
                       'submitter_comment': comment},
                      'ADD COMMENT'))
    
    print('*PAIR {}*\n-----------'.format(z + 1))
    print('Base:', base_exp['accession'],
          '({})\n'.format(base_exp['uuid']),
          json.dumps(get_merge_info(base_exp), indent=4, sort_keys=True),
          '\n\nMerge:', merge_exp['accession'],
          '({})\n'.format(merge_exp['uuid']),
          json.dumps(get_merge_info(merge_exp), indent=4, sort_keys=True), '\n')
    for i, x in enumerate(patch_set):
        print(x[1])
        for k, v in sorted(x[0].items()):
            print('{}:'.format(k), v)
        print()
    if update:
        print('MAKE PATCH')
        make_patch(patch_set, base_url)
    print()
    

*PAIR 1*
-----------
Base: ENCSR575ICR (b1453892-1831-4bb7-b45b-b3d6fb6f84a8)
 {
    "aliases": [
        "roadmap-epigenomics:H3K4me3 Brain-Germinal Matrix_Jul-20-2011_25805"
    ],
    "alternate_accessions": [],
    "dbxrefs": [
        "GEO:GSM806940"
    ],
    "documents": [],
    "original_files": [
        "/files/ENCFF881PIP/",
        "/files/ENCFF230KFJ/",
        "/files/ENCFF514QDL/",
        "/files/ENCFF507SDF/",
        "/files/ENCFF065UYM/",
        "/files/SRR2172596/"
    ],
    "reps": [
        [
            "81bc941b-759e-4dba-a1c1-a04cf55ace68",
            "/experiments/ENCSR575ICR/",
            1,
            1
        ]
    ]
} 

Merge: ENCSR528QDT (b10a7e77-4d81-49c9-aa90-102ecc0aab47)
 {
    "aliases": [
        "roadmap-epigenomics:H3K4me3 Germinal Matrix_Apr-07-2011_98586"
    ],
    "alternate_accessions": [],
    "dbxrefs": [
        "GEO:GSM706999"
    ],
    "documents": [],
    "original_files": [
        "/files/SRR2172571/"
    ],
    "reps": [
   