In [112]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import asyncio
import aiohttp
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
from datetime import datetime
from dateutil.parser import parse
from collections import defaultdict
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [113]:
# Copied from pyencoded-tools/encodedcc.py to avoid dependency.
class ENC_Key:
    """Credentials and server URL for one named entry of a keypairs mapping.

    Parameters
    ----------
    keyfile : str or dict
        Path to a JSON file containing the keypairs, or an already-loaded
        mapping (anything that is not an existing file path is used as-is).
    keyname : str
        Name of the entry to select; the entry must provide the
        ``'key'``, ``'secret'`` and ``'server'`` fields.

    Attributes
    ----------
    authid, authpw : taken from the entry's ``'key'`` and ``'secret'``.
    server : the entry's ``'server'``, always ending with a slash.

    Raises
    ------
    KeyError
        If ``keyname`` or one of the required fields is missing.
    """

    def __init__(self, keyfile, keyname):
        if os.path.isfile(str(keyfile)):
            # The context manager closes the handle even when reading or
            # JSON decoding raises.
            with open(keyfile, 'r') as keys_f:
                keys = json.load(keys_f)
        else:
            keys = keyfile
        key_dict = keys[keyname]
        self.authid = key_dict['key']
        self.authpw = key_dict['secret']
        self.server = key_dict['server']
        # Normalise so callers can append a path without checking.
        if not self.server.endswith("/"):
            self.server += "/"

            
class ENC_Connection(object):
    """Hold the JSON request headers, server URL and auth pair taken from a key object."""

    def __init__(self, key):
        # Both requests and responses are negotiated as JSON.
        self.headers = {
            'content-type': 'application/json',
            'accept': 'application/json',
        }
        self.server = key.server
        # (id, password) pair built from the key's credentials.
        credentials = (key.authid, key.authpw)
        self.auth = credentials

In [114]:
# Define key if private data desired.
# Loads the 'prod' entry from ~/keypairs.json (see ENC_Key for the expected fields).
key = ENC_Key(os.path.expanduser("~/keypairs.json"), 'prod')
# (id, secret) pair passed as ``auth=`` to requests.get in get_data.
auth = (key.authid, key.authpw)
# Prefix used when building item URLs from an accession or an '@id' path.
base_url = 'https://www.encodeproject.org'
# Query-string suffixes appended to request URLs.
json_all = 'limit=all&format=json'
json_only = 'format=json'

In [142]:
# Parsed experiment records keyed by experiment '@id'; filled and read by get_experiment_data.
experiment_data_cache = {}

In [129]:
# Parsed file records keyed by file '@id'; filled and read by search_original_files.
original_file_data_cache = {}

In [60]:
# NOTE(review): create_session is defined in a later cell of this notebook, so running
# the notebook top-to-bottom on a fresh kernel raises NameError here; that cell must be
# executed first (or this cell moved below it).
# Event loop used by quick_grab_data to run the batched requests to completion.
loop = asyncio.get_event_loop()
# Shared HTTP session handed to async_get_data by quick_grab_data.
session = create_session()
# Credentials passed as ``auth=`` to session.get in async_get_data.
request_auth = aiohttp.BasicAuth(key.authid, key.authpw)

In [156]:
# Statuses that disqualify an original file in the strict filter of search_original_files.
bad_statuses = ['archived', 'revoked']
# Only original files in one of these formats are considered by search_original_files.
filter_formats = ['fastq', 'csfasta', 'csqual']
# Files whose status is listed here are dropped by main(); empty keeps every status.
filter_file_statuses = []

def create_session():
    """Build an aiohttp client session on a TCP connector with limit=100 and a 10 s keep-alive."""
    return aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
    )

async def async_get_data(url, session):
    """Asynchronously GET ``url`` and return its decoded JSON payload.

    Parameters
    ----------
    url : str
        Address to request.
    session :
        Client session whose ``get`` is used; the request is authenticated
        with the module-level ``request_auth``.

    Returns
    -------
    The value under ``'@graph'`` when the payload is a mapping containing
    that key, otherwise the whole decoded payload (same contract as
    ``get_data``).

    Raises
    ------
    Exception
        With ``(url, response_text)`` as arguments when the status is not 200.
    """
    # ``async with`` releases the connection back to the pool on every path.
    async with session.get(url, auth=request_auth) as r:
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if r.status != 200:
            raise Exception(url, await r.text())
        # ``r.json()`` is a coroutine: it must be awaited *before* subscripting.
        payload = await r.json()
    if isinstance(payload, dict) and '@graph' in payload:
        return payload['@graph']
    return payload
    
def quick_grab_data(urls):
    """Request every URL in ``urls`` concurrently and return the collected results as a list.

    Uses the module-level ``session`` for the requests and blocks on the
    module-level ``loop`` until all of them have finished.
    """
    pending = asyncio.gather(*(async_get_data(url, session) for url in urls))
    return list(loop.run_until_complete(pending))

def get_data(url):
    """GET ``url`` synchronously and return its decoded JSON payload.

    The request is authenticated with the module-level ``auth`` pair.

    Returns
    -------
    The value under ``'@graph'`` when the payload is a mapping containing
    that key, otherwise the whole decoded payload.

    Raises
    ------
    Exception
        With ``(url, response_text)`` as arguments when the status code is
        not 200.
    """
    r = requests.get(url, auth=auth)
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if r.status_code != 200:
        raise Exception(url, r.text)
    # Decode once; a decoding error propagates to the caller as before.
    payload = r.json()
    if isinstance(payload, dict) and '@graph' in payload:
        return payload['@graph']
    return payload

def save_data(data, filename):
    """Serialise ``data`` as JSON into ``filename``, overwriting any existing file."""
    with open(filename, 'w') as handle:
        json.dump(data, handle)
        
def open_data(filename):
    """Read ``filename`` and return the JSON document it contains."""
    with open(filename, 'r') as handle:
        return json.load(handle)

def parse_replicates(file, rep_type, rep_num):
    """Return the set of replicate numbers that apply to ``file``.

    Parameters
    ----------
    file : dict
        File record; only ``file[rep_type]`` is consulted.
    rep_type : str
        Key holding the record's own replicate list
        (the callers pass ``'biological_replicates'`` or ``'technical_replicates'``).
    rep_num :
        Replicate number already known for the file, or NaN when unknown.

    Returns
    -------
    set
        ``{rep_num}`` when ``rep_num`` is not NaN. Otherwise the integers
        parsed from ``file[rep_type]`` (for an entry containing ``'_'`` the
        part after the first underscore is used), or ``{np.nan}`` when that
        list is missing or empty.
    """
    # Test NaN by value: an identity check (``is np.nan``) misses every NaN
    # that is not the numpy singleton, e.g. ``float('nan')``.
    is_missing = isinstance(rep_num, float) and np.isnan(rep_num)
    if not is_missing:
        return {rep_num}
    listed = file.get(rep_type, [])
    if not listed:
        # Keep returning the np.nan singleton so two "unknown" sets still
        # compare equal (set equality short-circuits on identity).
        return {np.nan}
    return {int(str(num).split('_')[1]) if '_' in str(num) else int(num)
            for num in listed}
    
def get_replacement(replaced_accession):
    """Fetch the JSON record served at ``base_url/<replaced_accession>/``.

    NOTE(review): presumably the portal resolves a replaced accession to the
    record that replaced it -- confirm against the server's behaviour.
    """
    url = '{}/{}/?{}'.format(base_url, replaced_accession, json_only)
    return get_data(url)

def extract_file_data(file):
    """Reduce a file record to the fields used by the derived_from comparison.

    ``accession``, ``@id`` and ``status`` always come from the record passed
    in. When that status is ``'replaced'`` every other field is read from the
    record returned by ``get_replacement``. Entries of ``derived_from`` found
    in the module-level ``excluded`` collection are dropped and the remainder
    is sorted.
    """
    # Capture the identity fields before ``file`` may be swapped below.
    original_accession = file['accession']
    original_id = file['@id']
    original_status = file['status']
    if original_status == 'replaced':
        # Use replacement data.
        file = get_replacement(original_accession)

    # Prefer the number on the embedded replicate; NaN makes parse_replicates
    # fall back to the record's own replicate list.
    bio_reps = parse_replicates(
        file,
        'biological_replicates',
        file.get('replicate', {}).get('biological_replicate_number', np.nan),
    )
    tech_reps = parse_replicates(
        file,
        'technical_replicates',
        file.get('replicate', {}).get('technical_replicate_number', np.nan),
    )

    summary = {'accession': original_accession,
               '@id': original_id,
               'status': original_status}
    summary['date_created'] = parse(file['date_created'])
    summary['dataset'] = file['dataset']
    summary['file_format'] = file['file_format']
    summary['biological_replicate_number'] = bio_reps
    summary['technical_replicate_number'] = tech_reps
    kept_parents = [f for f in file.get('derived_from', []) if f not in excluded]
    kept_parents.sort()
    summary['derived_from'] = kept_parents
    return summary

def extract_experiment_data(experiment):
    """Pick the accession, @id, file lists and lab title out of an experiment record."""
    copied_fields = ('accession', '@id', 'original_files', 'files')
    summary = {field: experiment[field] for field in copied_fields}
    # Only the lab's title is kept, not the whole embedded lab object.
    summary['lab'] = experiment['lab']['title']
    return summary

def parse_data(items, extract_method, **kwargs):
    """Apply ``extract_method(item, **kwargs)`` to each item and return the results in order."""
    return [extract_method(item, **kwargs) for item in items]

def get_experiment_data(files_data):
    """Return one parsed experiment record per entry of ``files_data``.

    Experiments not yet in ``experiment_data_cache`` are fetched with
    ``quick_grab_data``, parsed with ``extract_experiment_data`` and cached
    under their ``'@id'``.

    Parameters
    ----------
    files_data : list of dict
        Parsed file records; ``file['dataset']`` is used both to build the
        request URL and as the cache key.

    Returns
    -------
    list of dict
        Same length and order as ``files_data``. Each entry is a separate
        shallow copy of the cached record.
    """
    # Request each uncached dataset only once, even when many files in the
    # batch belong to the same experiment.
    missing = []
    queued = set()
    for file in files_data:
        dataset = file['dataset']
        if dataset not in experiment_data_cache and dataset not in queued:
            queued.add(dataset)
            missing.append(dataset)
    fetched = quick_grab_data(['{}{}?{}'.format(base_url, dataset, json_only)
                               for dataset in missing])
    parsed = parse_data(fetched, extract_experiment_data)
    experiment_data_cache.update({e['@id']: e for e in parsed})
    # search_original_files writes per-file results into the record it is
    # given; copying keeps those writes out of the cache and stops files of
    # the same experiment from overwriting each other's results.
    return [dict(experiment_data_cache[file['dataset']]) for file in files_data]

def search_original_files(experiment, file_bio_rep_num, file_tech_rep_num, file_created, child_file_id):
    """Find the experiment's original files that qualify as parents of one file.

    Original files not yet in ``original_file_data_cache`` are fetched,
    parsed with ``extract_file_data`` and cached under their ``'@id'``. The
    child file itself is never considered.

    Two filters are applied to the experiment's original files:

    * weak: same biological replicate set and a format in ``filter_formats``;
    * strict: the weak conditions plus a status not in ``bad_statuses``, the
      same technical replicate set, and ``date_created`` no later than
      ``file_created``.

    Parameters
    ----------
    experiment : dict
        Parsed experiment record; must contain ``'original_files'``.
    file_bio_rep_num, file_tech_rep_num : set
        Replicate sets of the child file.
    file_created :
        Creation date of the child file; compared with ``<=``.
    child_file_id : str
        ``'@id'`` of the child file.

    Returns
    -------
    dict
        The same ``experiment`` object, updated in place with ``'new_files'``
        (strictly filtered @ids), ``'matching_filters'`` (whether both
        filters kept the same number of files), ``'filtered_data'`` and
        ``'weakly_filtered_data'`` (sorted ``(accession, status)`` pairs).
    """
    candidate_ids = [file_id for file_id in experiment['original_files']
                     if file_id != child_file_id]
    # NOTE(review): '@id' values seen in this notebook end with '/', so this
    # template yields '//' before the query string -- confirm the server
    # accepts that before changing it.
    fetched = quick_grab_data(['{}{}/?{}'.format(base_url, file_id, json_all)
                               for file_id in candidate_ids
                               if file_id not in original_file_data_cache])
    fetched = parse_data(fetched, extract_file_data)
    original_file_data_cache.update({r['@id']: r for r in fetched})
    all_file_data = [original_file_data_cache[file_id] for file_id in candidate_ids]

    weakly_filtered_data = [f for f in all_file_data
                            if ((f['biological_replicate_number'] == file_bio_rep_num)
                                and (f['file_format'] in filter_formats))]
    # The strict filter only adds conditions to the weak one, so it can be
    # applied to the weak result without changing membership or order.
    filtered_data = [f for f in weakly_filtered_data
                     if ((f['status'] not in bad_statuses)
                         and (f['technical_replicate_number'] == file_tech_rep_num)
                         and (f['date_created'] <= file_created))]

    # No try/except here: catching KeyError and carrying on left the result
    # unbound and surfaced as a NameError one line later.
    experiment['new_files'] = [f['@id'] for f in filtered_data]
    experiment['matching_filters'] = len(weakly_filtered_data) == len(filtered_data)
    experiment['filtered_data'] = sorted([(f['accession'], f['status']) for f in filtered_data])
    experiment['weakly_filtered_data'] = sorted([(f['accession'], f['status']) for f in weakly_filtered_data])
    return experiment
    
def find_new_derived_from(file, experiment):
    """Calculate a derived_from list for ``file`` from its experiment's original files.

    Delegates the search to ``search_original_files`` and returns a dict with
    the file's accession, the sorted calculated ``derived_from`` @ids, and the
    ``matching_filters`` / ``filtered_data`` / ``weakly_filtered_data``
    diagnostics produced by that search.
    """
    child_id = file['@id']
    bio_reps = file['biological_replicate_number']
    tech_reps = file['technical_replicate_number']
    created = file['date_created']
    searched = search_original_files(experiment, bio_reps, tech_reps, created, child_id)
    candidates = sorted(searched['new_files'])
    return {
        'accession': file['accession'],
        'derived_from': candidates or [],
        'matching_filters': searched['matching_filters'],
        'filtered_data': searched['filtered_data'],
        'weakly_filtered_data': searched['weakly_filtered_data'],
    }

def check_for_matching_derived_from(files, experiments):
    """Compare each file's recorded derived_from with the calculated one.

    ``files`` and ``experiments`` must be parallel sequences of equal length.

    Returns ``(data_mask, new_derived_from_data)``. ``data_mask[i]`` is True
    when file ``i`` has a recorded derived_from equal (as a set) to the
    calculated one, or has none recorded and a non-empty one was calculated.
    Every False entry is reported with ``print``.
    ``new_derived_from_data[i]`` is the dict returned by
    ``find_new_derived_from`` for file ``i``.
    """
    assert len(files) == len(experiments)
    data_mask = []
    new_derived_from_data = []
    for file, experiment in zip(files, experiments):
        # Generate possible derived_from list from original_files:
        calculated = find_new_derived_from(file, experiment)
        new_derived_from_data.append(calculated)
        if file['derived_from']:
            # File already has derived_from: it must agree with the calculation.
            is_match = set(file['derived_from']) == set(calculated['derived_from'])
            if not is_match:
                print('Mismatch: ',
                      file['accession'],
                      file['status'],
                      '\nActual:',
                      file['derived_from'],
                      '\nCalculated:',
                      calculated['derived_from'], '\n')
        else:
            # Nothing recorded: success means the calculation found something.
            is_match = bool(calculated['derived_from'])
            if not is_match:
                print('No derived_from found: ', file['accession'], file['status'])
        data_mask.append(is_match)
    assert len(data_mask) == len(files) == len(experiments)
    return data_mask, new_derived_from_data
            
def main(data):
    """Run the derived_from check on raw file records.

    Parses ``data`` with ``extract_file_data``, drops files whose status is in
    ``filter_file_statuses``, looks up their experiments and compares recorded
    against calculated derived_from.

    Returns a 4-tuple: the parsed files (numpy array), their experiments
    (numpy array), the per-file match mask (list of bool) and the calculated
    derived_from records (numpy array).
    """
    parsed_files = parse_data(data, extract_file_data)
    files_data = [f for f in parsed_files if f['status'] not in filter_file_statuses]
    experiments_data = get_experiment_data(files_data)
    checked = check_for_matching_derived_from(files_data, experiments_data)
    derived_from_match, new_derived_from_data = checked
    return (np.array(files_data),
            np.array(experiments_data),
            derived_from_match,
            np.array(new_derived_from_data))

## Collect BAM files with biological_replicate_number

In [104]:
# Get derived_from .tar files to filter.
# Collect the '@id' of every File with output_category=reference and of every File
# with output_type "blacklisted regions".
references = [d['@id'] for d in get_data('https://www.encodeproject.org/search/'
                                         '?type=File&output_category=reference&limit=all&format=json')]
blacklists = [d['@id'] for d in get_data('https://www.encodeproject.org/search/'
                                         '?type=File&output_type=blacklisted+regions&limit=all&format=json')]
# extract_file_data removes these @ids from each file's derived_from list.
excluded = references + blacklists

In [105]:
# Both searches select BAM files that have replicate.biological_replicate_number set
# (%2A is a URL-encoded '*'); they differ only in the derived_from condition.
# derived_from=* : files that have a derived_from value.
url_with_derived_from = 'https://www.encodeproject.org/search/?type=File&replicate.biological_replicate_number=%2A&file_format=bam&derived_from=%2A&frame=embedded&limit=all&format=json'
# derived_from!=* (%21 is a URL-encoded '!'): files without a derived_from value.
url_no_derived_from = 'https://www.encodeproject.org/search/?type=File&replicate.biological_replicate_number=%2A&file_format=bam&derived_from%21=%2A&frame=embedded&limit=all&format=json'

### Only those files with derived_from field

In [106]:
derived_from = get_data(url_with_derived_from)

In [107]:
len(derived_from)

17092

In [187]:
derived_from_rows = pd.DataFrame([(df['accession'], df['status'], df['derived_from']) for df in derived_from]).rename(columns={0: 'accession', 1: 'status', 2: 'derived_from'})

In [192]:
derived_from_rows.to_csv('BAM_replicate_field_with_derived_from_08_16_2017.tsv', index=False, sep='\t')

In [532]:
save_data(derived_from, 'derived_from_with_replicate_08_16_2017.json')

In [None]:
derived_from = open_data('derived_from_with_replicate_08_16_2017.json')

### Only those files without derived_from field

In [11]:
no_derived_from = get_data(url_no_derived_from)

In [12]:
len(no_derived_from)

4422

In [17]:
save_data(no_derived_from, 'no_derived_from_with_replicate_08_16_2017.json')

In [None]:
no_derived_from = open_data('no_derived_from_with_replicate_08_16_2017.json')

In [197]:
no_derived_from_rows = pd.DataFrame([(df['accession'], df['status']) for df in no_derived_from]).rename(columns={0: 'accession', 1: 'status'})
no_derived_from_rows

Unnamed: 0,accession,status
0,ENCFF001QWY,archived
1,ENCFF001QFI,replaced
2,ENCFF001NVL,released
3,ENCFF000YWW,released
4,ENCFF000WCB,released
5,ENCFF000ZAF,archived
6,ENCFF000WJH,released
7,ENCFF000ZDX,archived
8,ENCFF001KZB,released
9,ENCFF001JZP,replaced


In [198]:
no_derived_from_rows.to_csv('BAM_replicate_field_with_no_derived_from_08_16_2017.tsv', index=False, sep='\t')

## Check that derived_from is consistent with the filtered original_files list

In [155]:
# Run the check on the first 100 files that already have derived_from.
df_files, df_experiments, df_derived_from_match, df_new_derived_from_data = main(derived_from[:100])
#print(df_derived_from_match)
# Share of files whose mask entry is False, i.e. (1 - matches / total) * 100.
print('Calculated derived_from different from actual: {}%'.format(round((((1 - (sum(df_derived_from_match)) / len(df_derived_from_match))) * 100), 1)))
# Files for which the weak and the strict filter kept a different number of originals.
df_no_match_filters = [d for d in df_new_derived_from_data if not d['matching_filters']]
print('Mismatching weak versus regular filter: {}%'.format(round((len(df_no_match_filters) / len(df_new_derived_from_data)) * 100, 1)))
len(df_derived_from_match)

Mismatch:  ENCFF002DNZ archived 
Actual: ['/files/ENCFF002DOH/'] 
Calculated: [] 

Mismatch:  ENCFF000LBQ released 
Actual: ['/files/ENCFF000LAJ/', '/files/ENCFF000LAM/'] 
Calculated: ['/files/ENCFF000LAJ/', '/files/ENCFF000LAM/', '/files/ENCFF000LAN/', '/files/ENCFF000LAO/'] 

Mismatch:  ENCFF792CRN in progress 
Actual: ['/files/ENCFF209DSC/', '/files/ENCFF669WDN/'] 
Calculated: ['/files/ENCFF209DSC/', '/files/ENCFF494HWB/', '/files/ENCFF553WWW/', '/files/ENCFF669WDN/', '/files/ENCFF716AJV/', '/files/ENCFF735REN/', '/files/ENCFF942JPK/', '/files/ENCFF975OSR/'] 

Mismatch:  ENCFF874XDR in progress 
Actual: ['/files/ENCFF070KEN/', '/files/ENCFF779NJB/'] 
Calculated: ['/files/ENCFF036OKM/', '/files/ENCFF070KEN/', '/files/ENCFF117RDF/', '/files/ENCFF564ANK/', '/files/ENCFF619ZMW/', '/files/ENCFF779NJB/', '/files/ENCFF930VFF/', '/files/ENCFF957HAC/'] 

Calculated derived_from different from actual: 4.0%
Mismatching weak versus regular filter: 4.0%


100

In [139]:
a, b, c, d = main([get_data('https://www.encodeproject.org/files/5601bcd3-624c-461b-a323-1448f6d165c6/')])
c, d

([True],
 array([ {'weakly_filtered_data': [('ENCFF000SJD', 'released'), ('ENCFF000SJF', 'released'), ('ENCFF000SJL', 'released')], 'derived_from': ['/files/ENCFF000SJD/'], 'filtered_data': [('ENCFF000SJD', 'released')], 'accession': 'ENCFF000SIJ', 'matching_filters': False}], dtype=object))

In [796]:
# for x in no_match_filters:
#     print()
#     for k, v in sorted(x.items()):
#          print(k, v)

## Get replacement accession and calculate derived_from

In [141]:
# Get all replaced BAM files with no derived_from.
replaced_no_derived_from = get_data('https://www.encodeproject.org/'
                                    'search/?type=File&replicate.biological_replicate_number=%2A'
                                    '&file_format=bam&derived_from%21=%2A&status=replaced&limit=all&frame=embedded&format=json')

In [165]:
len(replaced_no_derived_from)

202

In [142]:
a, b, c, d = main(replaced_no_derived_from)

No derived_from found:  ENCFF001NRC replaced
No derived_from found:  ENCFF000ZVG replaced
No derived_from found:  ENCFF001NRA replaced
No derived_from found:  ENCFF001DYQ replaced
Mismatch:  ENCFF001QGL replaced 
Actual: ['/files/ENCFF001OZH/', '/files/ENCFF001OZI/', '/files/ENCFF001OZJ/', '/files/ENCFF001OZK/', '/files/ENCFF001OZL/', '/files/ENCFF001OZM/', '/files/ENCFF001OZN/', '/files/ENCFF001OZO/', '/files/ENCFF001OZP/', '/files/ENCFF001OZQ/', '/files/ENCFF001OZR/', '/files/ENCFF001OZS/', '/files/ENCFF001OZT/', '/files/ENCFF001OZU/', '/files/ENCFF001OZV/', '/files/ENCFF001OZW/', '/files/ENCFF001OZX/', '/files/ENCFF001OZY/', '/files/ENCFF001OZZ/', '/files/ENCFF001PAA/', '/files/ENCFF001PAB/', '/files/ENCFF001PAC/', '/files/ENCFF001PAD/', '/files/ENCFF001PAE/', '/files/ENCFF001PAF/', '/files/ENCFF001PAG/', '/files/ENCFF001PAH/', '/files/ENCFF001PAI/', '/files/ENCFF001PAJ/', '/files/ENCFF001PAK/', '/files/ENCFF001PAL/'] 
Calculated: [] 

No derived_from found:  ENCFF001KKV replaced
No

In [144]:
# Invert the match mask ``c`` and use it to select from ``d`` (the calculated
# derived_from records returned by main) the files whose check came back False.
no_derived_found_replaced = d[[not b for b in c]]

In [156]:
no_derived_found_replaced

array([ {'weakly_filtered_data': [('ENCFF001NRF', 'revoked')], 'derived_from': [], 'filtered_data': [], 'accession': 'ENCFF001NRC', 'matching_filters': False},
       {'weakly_filtered_data': [], 'derived_from': [], 'filtered_data': [], 'accession': 'ENCFF000ZVG', 'matching_filters': True},
       {'weakly_filtered_data': [('ENCFF001NRE', 'revoked')], 'derived_from': [], 'filtered_data': [], 'accession': 'ENCFF001NRA', 'matching_filters': False},
       {'weakly_filtered_data': [], 'derived_from': [], 'filtered_data': [], 'accession': 'ENCFF001DYQ', 'matching_filters': True},
       {'weakly_filtered_data': [('ENCFF001OZH', 'released'), ('ENCFF001OZI', 'released'), ('ENCFF001OZJ', 'released'), ('ENCFF001OZK', 'released'), ('ENCFF001OZL', 'released'), ('ENCFF001OZM', 'released'), ('ENCFF001OZN', 'released'), ('ENCFF001OZO', 'released'), ('ENCFF001OZP', 'released'), ('ENCFF001OZQ', 'released'), ('ENCFF001OZR', 'released'), ('ENCFF001OZS', 'released'), ('ENCFF001OZT', 'released'), ('ENCFF

In [172]:
print(*['https://encodeproject.org/{}'.format(r['accession']) for r in no_derived_found_replaced if (r['weakly_filtered_data'])], sep='\n')

https://encodeproject.org/ENCFF001NRC
https://encodeproject.org/ENCFF001NRA
https://encodeproject.org/ENCFF001QGL
https://encodeproject.org/ENCFF001KKV
https://encodeproject.org/ENCFF001NCJ
https://encodeproject.org/ENCFF001LAD


In [171]:
print(*['https://encodeproject.org/{}'.format(r['accession']) for r in no_derived_found_replaced if not r['weakly_filtered_data']], sep='\n')

https://encodeproject.org/ENCFF000ZVG
https://encodeproject.org/ENCFF001DYQ
https://encodeproject.org/ENCFF000ZVN
https://encodeproject.org/ENCFF000ZVR
https://encodeproject.org/ENCFF001QFM
https://encodeproject.org/ENCFF000ZVX
https://encodeproject.org/ENCFF000ZVM


In [174]:
d[[b for b in c]]

array([ {'weakly_filtered_data': [('ENCFF001QDZ', 'released')], 'derived_from': ['/files/ENCFF001QDZ/'], 'filtered_data': [('ENCFF001QDZ', 'released')], 'accession': 'ENCFF001QFI', 'matching_filters': True},
       {'weakly_filtered_data': [('ENCFF001LJN', 'released')], 'derived_from': ['/files/ENCFF001LJN/'], 'filtered_data': [('ENCFF001LJN', 'released')], 'accession': 'ENCFF001JZP', 'matching_filters': True},
       {'weakly_filtered_data': [('ENCFF001NYS', 'released')], 'derived_from': ['/files/ENCFF001NYS/'], 'filtered_data': [('ENCFF001NYS', 'released')], 'accession': 'ENCFF001NYN', 'matching_filters': True},
       {'weakly_filtered_data': [('ENCFF001HWK', 'released')], 'derived_from': ['/files/ENCFF001HWK/'], 'filtered_data': [('ENCFF001HWK', 'released')], 'accession': 'ENCFF001GAZ', 'matching_filters': True},
       {'weakly_filtered_data': [('ENCFF001HVV', 'released')], 'derived_from': ['/files/ENCFF001HVV/'], 'filtered_data': [('ENCFF001HVV', 'released')], 'accession': 'ENCFF

In [79]:
rndf_files, rndf_experiments, rndf_derived_from_match, rndf_new_derived_from_data = main(replaced_no_derived_from)
sum(rndf_derived_from_match), rndf_new_derived_from_data

No derived_from found:  ENCFF001LLA replaced
No derived_from found:  ENCFF001DYQ replaced
No derived_from found:  ENCFF001NRC replaced
No derived_from found:  ENCFF000SIJ replaced
No derived_from found:  ENCFF001NRA replaced
No derived_from found:  ENCFF000OCR replaced
No derived_from found:  ENCFF001QFM replaced
No derived_from found:  ENCFF000OSC replaced
No derived_from found:  ENCFF001KKV replaced
No derived_from found:  ENCFF000ZVN replaced
No derived_from found:  ENCFF000ZVR replaced
No derived_from found:  ENCFF000SIG replaced
No derived_from found:  ENCFF001QGL replaced
No derived_from found:  ENCFF000ZVM replaced
No derived_from found:  ENCFF000ZNQ replaced
No derived_from found:  ENCFF000ZVG replaced
No derived_from found:  ENCFF000ZLF replaced
No derived_from found:  ENCFF000ZTB replaced
No derived_from found:  ENCFF000ZJM replaced
No derived_from found:  ENCFF000ZJN replaced
No derived_from found:  ENCFF000ZVX replaced
No derived_from found:  ENCFF000ZLH replaced
No derived

(177,
 array([ {'weakly_filtered_data': [('ENCFF001QDZ', 'released')], 'derived_from': ['/files/ENCFF001QDZ/'], 'filtered_data': [('ENCFF001QDZ', 'released')], 'accession': 'ENCFF001QFI', 'matching_filters': True},
        {'weakly_filtered_data': [('ENCFF001JZZ', 'released')], 'derived_from': ['/files/ENCFF001JZZ/'], 'filtered_data': [('ENCFF001JZZ', 'released')], 'accession': 'ENCFF001JZP', 'matching_filters': True},
        {'weakly_filtered_data': [('ENCFF001NYS', 'released')], 'derived_from': ['/files/ENCFF001NYS/'], 'filtered_data': [('ENCFF001NYS', 'released')], 'accession': 'ENCFF001NYN', 'matching_filters': True},
        {'weakly_filtered_data': [('ENCFF001HWK', 'released')], 'derived_from': ['/files/ENCFF001HWK/'], 'filtered_data': [('ENCFF001HWK', 'released')], 'accession': 'ENCFF001GAZ', 'matching_filters': True},
        {'weakly_filtered_data': [('ENCFF001HVV', 'released')], 'derived_from': ['/files/ENCFF001HVV/'], 'filtered_data': [('ENCFF001HVV', 'released')], 'accessi

In [99]:
# Pair each replaced accession with the record fetched for it. ``get_replacement`` is
# the helper defined in this notebook; ``get_replacement_accession`` is not defined
# anywhere in it and raised NameError on a fresh kernel.
replacement_info = [(r['accession'], get_replacement(r['accession'])) for r in replaced_no_derived_from]

In [102]:
rp_files, rp_experiments, rp_derived_from_match, rp_new_derived_from_data = main([r[1] for r in replacement_info])
rp_derived_from_match, rp_new_derived_from_data

Mismatch:  ENCFF001QDF archived 
Actual: ['/files/ENCFF001QDZ/'] 
Calculated: [] 

Mismatch:  ENCFF001HVW archived 
Actual: ['/files/ENCFF001HWK/'] 
Calculated: [] 

Mismatch:  ENCFF001HVM archived 
Actual: ['/files/ENCFF001HVV/'] 
Calculated: [] 

Mismatch:  ENCFF000TEY archived 
Actual: ['/files/ENCFF000TFM/'] 
Calculated: [] 

Mismatch:  ENCFF001HUE archived 
Actual: ['/files/ENCFF001HUV/'] 
Calculated: [] 

Mismatch:  ENCFF001HKI archived 
Actual: ['/files/ENCFF001HKT/'] 
Calculated: [] 

Mismatch:  ENCFF000UUE archived 
Actual: ['/files/ENCFF000UUW/'] 
Calculated: [] 

Mismatch:  ENCFF001HUB archived 
Actual: ['/files/ENCFF001HUM/'] 
Calculated: [] 

Mismatch:  ENCFF001HWN archived 
Actual: ['/files/ENCFF001HWY/'] 
Calculated: [] 

No derived_from found:  ENCFF001NQN revoked
Mismatch:  ENCFF001HUO archived 
Actual: ['/files/ENCFF001HVB/'] 
Calculated: [] 

Mismatch:  ENCFF001HUZ archived 
Actual: ['/files/ENCFF001HVP/'] 
Calculated: [] 

Mismatch:  ENCFF001HOW archived 
Actual: ['

([False,
  True,
  True,
  False,
  False,
  True,
  True,
  False,
  False,
  True,
  False,
  False,
  False,
  False,
  True,
  True,
  True,
  False,
  True,
  True,
  False,
  False,
  False,
  False,
  True,
  True,
  True,
  False,
  False,
  False,
  False,
  False,
  True,
  True,
  True,
  True,
  False,
  False,
  False,
  True,
  False,
  True,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  True,
  False,
  True,
  True,
  True,
  True,
  False,
  True,
  True,
  False,
  False,
  True,
  True,
  True,
  True,
  True,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  True,
  True,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  True,
  True,
  True,
  True,
  True,
  False,
  True,
  False,
  True,
  False,
  False,
  True,
  True,
  True,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  False,
  True,
  True,
  Tr

([True],
 array([ {'derived_from': ['/files/ENCFF806NGJ/', '/files/ENCFF604XAU/', '/files/ENCFF287JEK/', '/files/ENCFF837PPR/'], 'accession': 'ENCFF578WIL'}], dtype=object))

## Generate new derived_from for files missing the field

In [None]:
ndf_files, ndf_experiments, ndf_derived_from_match, ndf_new_derived_from_data = main(no_derived_from[:10])
#ndf_derived_from_match
#[(d['accession'], d['derived_from']) for d in ndf_new_derived_from_data]

In [855]:
ndf_new_derived_from_data[[not d for d in ndf_derived_from_match]]

array([], dtype=object)