# Download encode data

In [None]:
# Imports
import gzip
import os
import shutil
import subprocess

import numpy as np

## Download metadata and file list for histone modification data

Choose the data that you want to download. As an example, I'll provide the steps I used to get the respective data for the developing mouse brain.

1. Go to encode (https://www.encodeproject.org)
2. Select `Mouse` button
3. Select `biosample type`: tissue
4. This will take you to the experiment matrix page (https://www.encodeproject.org/matrix/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&biosample_ontology.classification=tissue)
5. Remove `Transcription` tag
6. Navigate to the `Biosample term name` and choose: `forebrain, hindbrain & midbrain`
7. Navigate to `Assay title` & remove `Control ChIP seq`
8. Finally, we only want to deal with one data type, so navigate to: `Available file types` and choose: `bed narrowPeak`
    
You should now have 201 results (or more; this count was taken on 14/10/2019).

In order to automatically assign informative names (rather than ENCODE's default ENXXX) we want to download both the files and the corresponding metadata for the files.

You can get the metadata for a filtered data matrix by changing the term `matrix` to `report` in the URL, i.e.

https://www.encodeproject.org/matrix/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&biosample_ontology.classification=tissue&assay_slims%21=Transcription&biosample_ontology.term_name=forebrain&biosample_ontology.term_name=hindbrain&biosample_ontology.term_name=midbrain&biosample_ontology.term_name=midbrain&assay_title%21=Control+ChIP-seq

becomes:

https://www.encodeproject.org/report/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&biosample_ontology.classification=tissue&assay_slims%21=Transcription&biosample_ontology.term_name=forebrain&biosample_ontology.term_name=hindbrain&biosample_ontology.term_name=midbrain&biosample_ontology.term_name=midbrain&assay_title%21=Control+ChIP-seq


Note after the `?` is the query I was running. Now you can click download once you are on the report page.

To get the list of files, go back to the `matrix` link (i.e. the normal one you open when searching encode) and click download.

Rename these files to `encode-file-list_{file-explanation}_{date-of-download}` and `encode-experiment-report_{file-explanation}_{date-of-download}`, and place them in the `input` folder.

Lastly, you need to get the metadata file associated with each of the files that you may download.

This is done by copying the header line of the downloaded files.txt file; it will look something like this:
```
 "https://www.encodeproject.org/metadata/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&biosample_ontology.classification=tissue&target.label%21=CTCF&target.label%21=POLR2A&target.label%21=EP300&target.label%21=GATA4&target.label%21=H3K79me2&biosample_ontology.term_name=intestine&biosample_ontology.term_name=liver&biosample_ontology.term_name=heart&biosample_ontology.term_name=forebrain&biosample_ontology.term_name=midbrain&biosample_ontology.term_name=hindbrain&biosample_ontology.term_name=limb&biosample_ontology.term_name=embryonic+facial+prominence&biosample_ontology.term_name=neural+tube&biosample_ontology.term_name=lung&biosample_ontology.term_name=kidney&biosample_ontology.term_name=stomach&assay_title=Histone+ChIP-seq&files.file_type=bed+narrowPeak"
```

Simply go to terminal and paste:

```
curl "the_header_of_the_file" > encode-experiment-meta_{file-explanation}_{date-of-download}.tsv

```

## Setup folders and filenames

Here we just setup the folders and filenames for the data



In [None]:
# Directory structure
# NOTE(review): absolute, user-specific path — edit this to point at your own input folder.
input_dir = '/Users/ariane/Documents/encode/'
# Root folder that later cells extend with `downloads/`, `processed/` and `sorted_bed/`.
# It was referenced by those cells but never defined, so they failed with NameError on a
# fresh kernel.
# NOTE(review): defaulting to `input_dir` is an assumption (the download cell already
# writes its reduced URL list there) — confirm and adjust to your folder layout.
data_dir = input_dir
date_download = '20201007'

file_explanations = ['mm10']
meta_file_str = 'encode-experiment-meta_'
file_list_str = 'encode-file-list_'

# Collecting the files we'll be using to download the data & assigning the generic names:
# one metadata table (.tsv) and one file list (.txt) per entry in `file_explanations`.
file_metas = [f'{input_dir}{meta_file_str}{c}_{date_download}.tsv' for c in file_explanations]
file_lists = [f'{input_dir}{file_list_str}{c}_{date_download}.txt' for c in file_explanations]

print(file_metas)
print(file_lists)

## Setup the file parsing

Here we have the functions for parsing the meta data file and also the file list.
We use `assign_header` to get the columns of interest from the metadata file; this can be edited and will need to be updated if ENCODE change their internal protocols.

In [None]:

# This section aims to pull down the data from ENCODE based on the meta data in the file:
#
#     metadata-enc3-mm10-histone-tissue-embryo-365-19022019
#
# Module-level placeholders, one per metadata column of interest. Each starts as None;
# the trailing comment gives the header text of the column the name stands for.
# `assign_header` returns the actual positions for a given header row.
(
    ACCESSION,    # File accession
    OUTPUT_TYPE,  # Output type
    BIOSAMPLE,    # Biosample term name
    TARGET,       # Experiment target
    REPLICATES,   # Biological replicate(s)
    SIZE,         # Size
    URL,          # File download URL
    ASSEMBLY,     # Assembly
    STATUS,       # File Status
    FORMAT,       # File format
    ASSAY,        # Assay
) = (None,) * 11


def assign_header(hdr_line):
    """Locate the columns of interest in a metadata header row.

    Args:
        hdr_line: list of header cell strings (the header line split on tabs).
            Surrounding whitespace, including the newline left on the last
            cell, is ignored when matching.

    Returns:
        An 11-tuple of column positions in the fixed order
        (ACCESSION, OUTPUT_TYPE, BIOSAMPLE, TARGET, REPLICATES, SIZE, URL,
        ASSEMBLY, STATUS, FORMAT, ASSAY). A position is None when no matching
        column exists, so callers can detect a missing column instead of
        hitting an UnboundLocalError here.
    """
    field_order = ('ACCESSION', 'OUTPUT_TYPE', 'BIOSAMPLE', 'TARGET', 'REPLICATES', 'SIZE',
                   'URL', 'ASSEMBLY', 'STATUS', 'FORMAT', 'ASSAY')
    # Header text that identifies a field exactly.
    exact_names = {
        'File accession': 'ACCESSION',
        'Accession': 'ACCESSION',
        'Output type': 'OUTPUT_TYPE',
        'Biosample term name': 'BIOSAMPLE',
        'Experiment target': 'TARGET',
        'Target of assay': 'TARGET',
        'Biological replicate(s)': 'REPLICATES',
        'Size': 'SIZE',
        'File download URL': 'URL',
        'Assembly': 'ASSEMBLY',
        'File Status': 'STATUS',
        'File format': 'FORMAT',
        'Assay': 'ASSAY',
    }
    # Substring fallbacks, used only when no exact header matched. Giving the exact
    # name priority stops e.g. 'Technical replicate(s)' from overriding
    # 'Biological replicate(s)' just because it appears later in the row.
    fuzzy_names = (('replicate', 'REPLICATES'), ('assembly', 'ASSEMBLY'))

    exact_hits = {}
    fuzzy_hits = {}
    for position, raw_name in enumerate(hdr_line):
        name = raw_name.strip()
        if name in exact_names:
            exact_hits[exact_names[name]] = position  # last matching column wins
        for needle, field in fuzzy_names:
            if needle in name:
                fuzzy_hits[field] = position

    return tuple(exact_hits.get(field, fuzzy_hits.get(field)) for field in field_order)
            
def get_bed_files(metadata_file='metadata-enc3-mm10-histone-tissue-embryo-365-19022019.tsv', reduced_file='reduced-enc3-mm10-histone-tissue-embryo-365-19022019.txt', data_dir='../data/',
                  file_format='bed narrowPeak', assembly='mm10',
                  output_types=('replicated peaks', 'optimal IDR thresholded peaks'),
                  metadata_dir=None):
    """Write the download URLs of the metadata rows that pass the filter to a file.

    A row is kept when its file format equals `file_format`, its assembly equals
    `assembly`, its file status is 'released' and its output type is one of
    `output_types`. One URL is written per line.

    Args:
        metadata_file: name of the tab-separated metadata table (first line is the header).
        reduced_file: name of the URL list to create inside `data_dir` (overwritten).
        data_dir: folder the URL list is written to.
        file_format: required value of the file-format column. For the bigWig signal
            tracks (the ATAC-seq variant of this filter) pass `file_format='bigWig'`
            together with `output_types=('signal p-value',)`.
        assembly: required value of the assembly column.
        output_types: collection of accepted output-type values.
        metadata_dir: folder containing `metadata_file`; defaults to the notebook-level
            `input_dir`.

    Returns:
        The number of URLs written (also printed).

    Raises:
        ValueError: if the header lacks one of the columns the filter needs.
    """
    if metadata_dir is None:
        metadata_dir = input_dir
    # 'bed narrowPeak' -> 'BED', 'bigWig' -> 'BIGWIG': label for the progress lines.
    log_label = file_format.split(' ')[0].upper()

    cnt_files = 0
    # The metadata is opened first (read-only) so a missing table does not leave a
    # freshly truncated URL list behind.
    with open(os.path.join(metadata_dir, metadata_file), 'r') as metadata, \
            open(os.path.join(data_dir, reduced_file), 'w') as output:
        header_seen = False
        min_cols = 0
        for str_line in metadata:
            # Strip the line terminator so the last column compares / writes cleanly.
            line = str_line.rstrip('\r\n').split('\t')
            if not header_seen:
                header_seen = True
                (_, out_type_col, _, _, _, _, url_col, assembly_col,
                 status_col, format_col, _) = assign_header(line)
                required = {
                    'File format': format_col,
                    'Assembly': assembly_col,
                    'File Status': status_col,
                    'Output type': out_type_col,
                    'File download URL': url_col,
                }
                missing = [name for name, col in required.items() if col is None]
                if missing:
                    raise ValueError('Metadata header is missing column(s): ' + ', '.join(missing))
                min_cols = max(required.values()) + 1
                continue

            if len(line) < min_cols:
                continue  # blank or truncated row

            if (line[format_col] == file_format
                    and line[assembly_col] == assembly
                    and line[status_col] == 'released'
                    and line[out_type_col] in output_types):
                output.write(line[url_col] + '\n')
                print(log_label, line[0])
                cnt_files += 1

    print(cnt_files)
    return cnt_files


def unzip_rename_beds(data_dir='../data/reduced/', metadata_file='experiment-metadata-enc3-mm10-histone-tissue-embryo-365-19022019.tsv'):
    """Decompress downloaded `.gz` files into informatively named `.bed` files.

    The metadata table (read from the notebook-level `input_dir`) is indexed by file
    accession. Each `<accession>.<...>.gz` in `data_dir` is decompressed to
    `<biosample>_<assay>_<target>_<accession>.bed` in the same folder, with spaces
    replaced by '-' and a target shorter than two characters replaced by 'NA'.
    Files whose accession is not in the metadata are reported and skipped.

    Args:
        data_dir: folder holding the downloaded files; output is written here too.
        metadata_file: name of the tab-separated metadata table inside `input_dir`.

    Raises:
        ValueError: if the header lacks a column needed to build the names.
    """
    # Want the metadata to choose the correct filename, make a dict on accession
    meta_dict = dict()
    accession_col = biosample_col = target_col = assay_col = None
    min_cols = 0
    with open(os.path.join(input_dir, metadata_file), 'r') as metadata:
        header_seen = False
        for str_line in metadata:
            # Strip the line terminator so a last-column value never carries '\n'
            # into a filename.
            line = str_line.rstrip('\r\n').split('\t')
            if not header_seen:
                header_seen = True
                (accession_col, _, biosample_col, target_col, _, _, _, _, _, _,
                 assay_col) = assign_header(line)
                required = {
                    'File accession': accession_col,
                    'Biosample term name': biosample_col,
                    'Experiment target': target_col,
                    'Assay': assay_col,
                }
                missing = [name for name, col in required.items() if col is None]
                if missing:
                    raise ValueError('Metadata header is missing column(s): ' + ', '.join(missing))
                min_cols = max(required.values()) + 1
            elif len(line) >= min_cols:  # ignore blank / truncated rows
                meta_dict[line[accession_col]] = line

    def _informative_name(accession):
        """Build '<biosample>_<assay>_<target>_<accession>.bed' for a known accession."""
        row = meta_dict[accession]
        target = row[target_col]
        if len(target) < 2:
            target = 'NA'
        name = row[biosample_col] + '_' + row[assay_col] + '_' + target + '_' + accession + '.bed'
        return name.replace(' ', '-')

    for filename in sorted(os.listdir(data_dir)):
        is_gz = filename[-2:] == 'gz'
        is_bigwig = filename.split('.')[-1] == 'bigWig'
        if not (is_gz or is_bigwig):
            continue

        accession = filename.split('.')[0]
        if accession not in meta_dict:
            # Unrelated file, or metadata that does not cover this download.
            print(f'Skipping {filename}: accession {accession} not found in {metadata_file}')
            continue

        new_filename = _informative_name(accession)
        print(new_filename)
        if is_gz:
            with gzip.open(os.path.join(data_dir, filename), 'rb') as f_in, \
                    open(os.path.join(data_dir, new_filename), 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        # NOTE(review): bigWig files are only reported, never renamed or converted, and
        # the reported name carries a '.bed' suffix as in the original code — confirm
        # the intended handling.
def read_bed(filename):
    """Count, print and return the number of lines in a file.

    Args:
        filename: path of the file to read.

    Returns:
        The number of lines in the file.
    """
    # Read-only mode: counting lines must not require write permission on the file.
    with open(filename, 'r') as filein:
        count = sum(1 for _ in filein)
    print(count)
    return count

def add_metadata(data_dir='../data/processed/', experiments_file='experiment-metadata-enc3-mm10-histone-tissue-embryo-365-19022019.tsv'):
    """Rename the `.bed` files in `data_dir` using experiment-level metadata.

    Downloaded the metadata after the initial naming. Here we are adding extra
    information based on the experiment, i.e. the age of the sample (not attached to
    the actual file; this needed to be downloaded using the report function).

    The experiment report maps each experiment to its file accessions. Every file in
    `data_dir` ending in 'bed' has its accession taken from the text between the last
    '_' and the first following '.', and is renamed to
    `<biosample>_<age>_<life stage>_<target>_<assay name>_<accession>.bed`
    (spaces replaced by '-'; a target shorter than two characters becomes 'DNase').
    Files whose accession is not in the report are reported and left untouched.

    Args:
        data_dir: folder containing the `.bed` files to rename.
        experiments_file: path of the tab-separated experiment report. Its first two
            lines are skipped (presumably a provenance line and the column header —
            TODO confirm against the downloaded report).
    """
    # Fixed column positions in the experiment report.
    # NOTE(review): these depend on the column layout of the downloaded report — verify.
    FILES = 12          #/files/ENCFF861WQV/,/files/ENCFF566NFW/
    ACCESSION = 1       # Accession in experiment file
    BIOSAMPLE = 7       # Biosample
    LIFE_STAGE = 19     # Age Life stage
    AGE = 20            # Biosample age
    TARGET = 4          # Target Label
    ASSAY_NAME = 2      # Assay name
    min_cols = max(FILES, ACCESSION, BIOSAMPLE, LIFE_STAGE, AGE, TARGET, ASSAY_NAME) + 1

    file_accession_to_experiment = dict()
    experiments_dict = dict()
    with open(experiments_file, 'r') as experiments:
        for line_number, raw_line in enumerate(experiments):
            if line_number < 2:
                continue
            # Strip the terminator so the last column (e.g. the age) stays clean.
            line = raw_line.rstrip('\r\n').split('\t')
            if len(line) < min_cols:
                continue  # blank or truncated row

            # '/files/ENCFF861WQV/,/files/ENCFF566NFW/' -> ['ENCFF861WQV', 'ENCFF566NFW']
            for file_accession in line[FILES].replace('files', '').replace('/', '').split(','):
                file_accession = file_accession.strip()
                if file_accession:
                    file_accession_to_experiment[file_accession] = line[ACCESSION]

            target = line[TARGET]
            if len(target) < 2:
                target = 'DNase'

            new_label = line[BIOSAMPLE] + '_' + line[AGE] + '_' + line[LIFE_STAGE] + '_' + target + '_' + line[ASSAY_NAME] + '_'
            experiments_dict[line[ACCESSION]] = new_label.replace(' ', '-')

    for filename in sorted(os.listdir(data_dir)):
        if filename[-3:] != 'bed':
            continue
        accession = filename.split('_')[-1].split('.')[0]
        experiment = file_accession_to_experiment.get(accession)
        if experiment is None:
            # Do not abort the remaining renames because of one unknown file.
            print(f'Skipping {filename}: accession {accession} not found in {experiments_file}')
            continue
        new_filename = experiments_dict[experiment] + accession + '.bed'
        print(filename, '-->', new_filename)
        os.rename(os.path.join(data_dir, filename), os.path.join(data_dir, new_filename))

## Download the files

This section will take a long time depending on how many files you are downloading; here we download all the ENCODE files that have two replicates and are bed files.

In [None]:
# First we only want the files that are pooled (i.e. we don't want each individual sample and to have to do the processing)
for meta_path in file_metas:
    meta_name = meta_path.split('/')[-1]
    reduced_name = 'reduced-list_' + meta_name
    # Writes the filtered download URLs to input_dir + reduced_name.
    get_bed_files(meta_name, reduced_name, input_dir)

    # Print (not run) the command that downloads every URL in that list. It is printed
    # once per metadata file and points at the location the list was just written to.
    print("xargs -L 1 curl -O -L < " + input_dir + reduced_name)
        

## Unzip the bed files

Here we unzip the bed files and change their name so that it is informative i.e. contains the tissue and assay type as well as mark.

In [None]:
# Unzip the beds and rename them
# One pass per metadata table; only the table's file name (not its folder) is passed on.
for meta_path in file_metas:
    meta_name = meta_path.rsplit('/', 1)[-1]
    unzip_rename_beds('{}downloads/'.format(data_dir), meta_name)
    

## Add metadata

I was interested in the age of the sample, so here we add in the age.

This is done using the report file downloaded in the first section.

In [None]:
# Add any extra metadata about the file (i.e. the age of the sample)
# The experiment report lives in the input folder; the files to rename live in the
# `processed/` folder under data_dir.
meta_data_file_str = f'{input_dir}encode-experiment-report_mm10_20201007.tsv'
add_metadata('{}processed/'.format(data_dir), meta_data_file_str)


## Sort bed files

Here just for convention we sort the bed files.

In [None]:
# Sort the bed files
# Prefix of the folder containing `bedtools2/` ('' = relative to the working directory).
bed_tools_dir = ''
sort_bed_bin = f'{bed_tools_dir}bedtools2/bin/./sortBed'

processed_dir = f'{data_dir}processed/'
sorted_dir = f'{data_dir}sorted_bed/'
# The output folder must exist before anything is written into it.
os.makedirs(sorted_dir, exist_ok=True)

files = sorted(os.listdir(processed_dir))

for f in files:
    if not f.endswith('.bed'):
        continue  # only bed files are sorted; skips stray entries such as hidden files
    # Arguments are passed as a list (no shell), so file names containing spaces or
    # shell metacharacters are handled safely; stdout is sent straight to the output file.
    with open(sorted_dir + f, 'w') as sorted_out:
        result = subprocess.run([sort_bed_bin, '-i', processed_dir + f], stdout=sorted_out)
    if result.returncode != 0:
        print(f'sortBed failed for {f} (exit code {result.returncode})')
    

## EXTRA: Download for ATAC seq data

Choose the data that you want to download. As an example, I'll provide the steps I used to get the respective data for the developing mouse brain.

1. Go to encode (https://www.encodeproject.org)
2. Select `Mouse` button
3. Select `biosample type`: tissue
4. This will take you to the experiment matrix page (https://www.encodeproject.org/matrix/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&biosample_ontology.classification=tissue)
5. Select `ATAC-seq` in Assay title
6. Navigate to the `Biosample term name` and choose: `forebrain, hindbrain & midbrain`
7. Navigate to `Assay title` & remove `Control ChIP seq`
8. Finally, we only want to deal with one data type, so navigate to: `Available file types` and choose: `bigWig`
    
You should now have 16 results (or more; this count was taken on 05/11/2019).

In order to automatically assign informative names (rather than ENCODE's default ENXXX) we want to download both the files and the corresponding metadata for the files.

First click the download button (and then the download in the popup).

1. Open the downloaded file, the first line contains the metadata file.
2. Download this file, (either curl or download this from the browser)

Rename these files to `encode-file-list_{file-explanation}_{date-of-download}` and `encode-experiment-report_{file-explanation}_{date-of-download}`, and place them in the `input` folder.

My download links:

Filtered data matrix: 
https://www.encodeproject.org/search/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&biosample_ontology.classification=tissue&biosample_ontology.term_name=forebrain&biosample_ontology.term_name=midbrain&biosample_ontology.term_name=hindbrain&assay_title=ATAC-seq&files.file_type=bigWig
Tsv download for metadata:
https://www.encodeproject.org/metadata/type%3DExperiment%26status%3Dreleased%26replicates.library.biosample.donor.organism.scientific_name%3DMus%2Bmusculus%26biosample_ontology.classification%3Dtissue%26biosample_ontology.term_name%3Dforebrain%26biosample_ontology.term_name%3Dmidbrain%26biosample_ontology.term_name%3Dhindbrain%26assay_title%3DATAC-seq%26files.file_type%3DbigWig/metadata.tsv


In [None]:
# Directory structure
# This cell re-points the shared configuration names at the ATAC-seq inputs.
input_dir = '../data/input/'
date_download = '20191105'

file_explanations = ['mouse-brain_ATAC']
meta_file_str = 'encode-experiment-metadata_'
file_list_str = 'encode-experiment-report_'

# Collecting the files we'll be using to download the data & assigning the generic names:
# one `.tsv` metadata path and one `.txt` list path per explanation.
file_metas, file_lists = [], []
for c in file_explanations:
    file_metas.append(f'{input_dir}{meta_file_str}{c}_{date_download}.tsv')
    file_lists.append(f'{input_dir}{file_list_str}{c}_{date_download}.txt')

print(file_metas)
print(file_lists)


In [None]:
# For the ATAC files, we actually don't have the replicates merged so we are just going to download all the files.
# Example URL: https://www.encodeproject.org/files/ENCFF339WWB/@@download/ENCFF339WWB.bigWig
# NOTE(review): as called here, get_bed_files keeps only `bed narrowPeak` rows, so a
# bigWig-only metadata table yields an empty URL list — confirm the intended filter.

for meta_path in file_metas:
    meta_name = meta_path.split('/')[-1]
    reduced_name = 'reduced-list_' + meta_name
    get_bed_files(meta_name, reduced_name, input_dir)

    # Run the command to download the files from the files list. It is run once per
    # metadata file, feeding the URL list to xargs on stdin
    # (equivalent to `xargs -L 1 curl -O -L < list`, without going through a shell).
    print(reduced_name)
    with open(input_dir + reduced_name, 'r') as url_list:
        result = subprocess.run(['xargs', '-L', '1', 'curl', '-O', '-L'], stdin=url_list)
    if result.returncode != 0:
        print(f'Download command exited with code {result.returncode} for {reduced_name}')

In [None]:
# Unzip the beds and rename them
# Pass each metadata table (the file the accessions are looked up in), mirroring the
# histone section. The previous hard-coded 'reduced-list_...' name referred to a URL
# list rather than a metadata table and ignored the loop entirely.

for meta_path in file_metas:
    unzip_rename_beds('../data/input/', meta_path.split('/')[-1])