# Retrieving HG19 epigenomic files
The following notebook shows how the epigenomic files metadata are retrieved.

In [1]:
from glob import glob
import pandas as pd
import compress_json
from encodeproject import biosamples, accessions, biosample, download_urls

We specify that we are only interested in the [GRCh38](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/) assembly, are currently in status [released](https://www.encodeproject.org/help/getting-started/status-terms/#FileStatuses), have replication type [isogenic](https://www.encodeproject.org/data-standards/terms/) (there is a biological replication) and the file format is [bigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html#:~:text=The%20bigWig%20format%20is%20useful,in%20an%20indexed%20binary%20format.&text=Wiggle%20data%20must%20be%20continuous%20and%20consist%20of%20equally%20sized%20elements.).

In [2]:
parameters = dict(
    assembly="hg19",
    replication_type="isogenic",
    file_format="bigWig",
    status="released",
    use_multiprocessing=True
)

We will append all the dataset while we obtain them to the following list.

In [3]:
all_datasets = []

### Retrieving CHIP-seq

In [4]:
samples = biosamples(
    accessions=accessions(compress_json.load("hg38_encode_queries/chipseq.json")),
    min_biological_replicates=2,
    output_type="fold change over control",
    **parameters
)
all_datasets.append(samples)
samples

HBox(children=(FloatProgress(value=0.0, description='Retrieving biosamples', layout=Layout(flex='2'), max=1579…



Unnamed: 0,organism,target,term_id,cell_line,institute_name,title,accession,status,assay_title,assay_term_name,...,output_category,output_type,read_length,read_length_units,run_type,schema_version,encode_version,biological_replicates,technical_replicates,url
0,human,H3K4me2,EFO:0001203,MCF-7,Broad Institute,"Bradley Bernstein, Broad",ENCFF998NCA,released,Histone ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 1_2, 2_1, 2_2, 2_3]",https://encode-public.s3.amazonaws.com/2016/11...
1,human,SIX5,EFO:0003042,H1,HudsonAlpha Institute for Biotechnology,"Richard Myers, HAIB",ENCFF095XBB,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/12...
2,human,HCFC1,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF633TLX,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/12...
3,human,MYC,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF790FHL,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/12...
4,human,ZNF274,EFO:0002067,K562,University of Southern California,"Peggy Farnham, USC",ENCFF296ZAW,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/02...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,human,POLR2AphosphoS5,EFO:0002784,GM12878,HudsonAlpha Institute for Biotechnology,"Richard Myers, HAIB",ENCFF002UPS,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/06...
1107,human,EP300,EFO:0002784,GM12878,Stanford University,"Michael Snyder, Stanford",ENCFF820BXH,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/07...
1108,human,CTCF,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF933ZLL,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/04...
1109,human,POLR2A,EFO:0002784,GM12878,HudsonAlpha Institute for Biotechnology,"Richard Myers, HAIB",ENCFF368HBX,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/01...


### Retrieving DNASE-seq

In [5]:
samples = biosamples(
    accessions=accessions(compress_json.load("hg38_encode_queries/dnaseseq.json")),
    organism=None,
    **parameters
)
samples["organism"] = "human"

all_datasets.append(samples)
samples

HBox(children=(FloatProgress(value=0.0, description='Retrieving biosamples', layout=Layout(flex='2'), max=5.0,…



Unnamed: 0,organism,target,term_id,cell_line,institute_name,title,accession,status,assay_title,assay_term_name,...,output_category,output_type,read_length,read_length_units,run_type,schema_version,encode_version,biological_replicates,technical_replicates,url
0,human,Unknown,EFO:0001203,MCF-7,University of Washington,"John Stamatoyannopoulos, UW",ENCFF615FRD,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/09...
1,human,Unknown,EFO:0001203,MCF-7,University of Washington,"John Stamatoyannopoulos, UW",ENCFF922TLC,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_1],https://encode-public.s3.amazonaws.com/2017/09...
2,human,Unknown,EFO:0001086,A549,University of Washington,"John Stamatoyannopoulos, UW",ENCFF180FXV,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/12...
3,human,Unknown,EFO:0001086,A549,University of Washington,"John Stamatoyannopoulos, UW",ENCFF723TWJ,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_7],https://encode-public.s3.amazonaws.com/2017/12...
4,human,Unknown,EFO:0002784,GM12878,University of Washington,"John Stamatoyannopoulos, UW",ENCFF901GZH,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/09...
5,human,Unknown,EFO:0002784,GM12878,University of Washington,"John Stamatoyannopoulos, UW",ENCFF264NMW,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_1],https://encode-public.s3.amazonaws.com/2017/09...


### Retrieving FAIRE-seq

### Retrieving DNAME

In [6]:
samples = biosamples(
    accessions=accessions(compress_json.load("hg38_encode_queries/dname.json")),
    organism=None,
    min_biological_replicates=0,
    output_type=None,
    **parameters
)
samples["organism"] = "human"

all_datasets.append(samples)
samples

HBox(children=(FloatProgress(value=0.0, description='Retrieving biosamples', layout=Layout(flex='2'), max=41.0…



Unnamed: 0,organism,target,term_id,cell_line,institute_name,title,accession,status,assay_title,assay_term_name,...,output_category,output_type,read_length,read_length_units,run_type,schema_version,encode_version,biological_replicates,technical_replicates,url


## Combining all datasets

In [7]:
combined = pd.concat(all_datasets)
combined

Unnamed: 0,organism,target,term_id,cell_line,institute_name,title,accession,status,assay_title,assay_term_name,...,output_category,output_type,read_length,read_length_units,run_type,schema_version,encode_version,biological_replicates,technical_replicates,url
0,human,H3K4me2,EFO:0001203,MCF-7,Broad Institute,"Bradley Bernstein, Broad",ENCFF998NCA,released,Histone ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 1_2, 2_1, 2_2, 2_3]",https://encode-public.s3.amazonaws.com/2016/11...
1,human,SIX5,EFO:0003042,H1,HudsonAlpha Institute for Biotechnology,"Richard Myers, HAIB",ENCFF095XBB,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/12...
2,human,HCFC1,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF633TLX,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/12...
3,human,MYC,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF790FHL,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/12...
4,human,ZNF274,EFO:0002067,K562,University of Southern California,"Peggy Farnham, USC",ENCFF296ZAW,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/02...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,human,Unknown,EFO:0001203,MCF-7,University of Washington,"John Stamatoyannopoulos, UW",ENCFF922TLC,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_1],https://encode-public.s3.amazonaws.com/2017/09...
2,human,Unknown,EFO:0001086,A549,University of Washington,"John Stamatoyannopoulos, UW",ENCFF180FXV,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/12...
3,human,Unknown,EFO:0001086,A549,University of Washington,"John Stamatoyannopoulos, UW",ENCFF723TWJ,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_7],https://encode-public.s3.amazonaws.com/2017/12...
4,human,Unknown,EFO:0002784,GM12878,University of Washington,"John Stamatoyannopoulos, UW",ENCFF901GZH,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/09...


In [8]:
combined

Unnamed: 0,organism,target,term_id,cell_line,institute_name,title,accession,status,assay_title,assay_term_name,...,output_category,output_type,read_length,read_length_units,run_type,schema_version,encode_version,biological_replicates,technical_replicates,url
0,human,H3K4me2,EFO:0001203,MCF-7,Broad Institute,"Bradley Bernstein, Broad",ENCFF998NCA,released,Histone ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 1_2, 2_1, 2_2, 2_3]",https://encode-public.s3.amazonaws.com/2016/11...
1,human,SIX5,EFO:0003042,H1,HudsonAlpha Institute for Biotechnology,"Richard Myers, HAIB",ENCFF095XBB,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/12...
2,human,HCFC1,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF633TLX,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/12...
3,human,MYC,EFO:0002067,K562,Stanford University,"Michael Snyder, Stanford",ENCFF790FHL,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/12...
4,human,ZNF274,EFO:0002067,K562,University of Southern California,"Peggy Farnham, USC",ENCFF296ZAW,released,TF ChIP-seq,ChIP-seq,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/02...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,human,Unknown,EFO:0001203,MCF-7,University of Washington,"John Stamatoyannopoulos, UW",ENCFF922TLC,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_1],https://encode-public.s3.amazonaws.com/2017/09...
2,human,Unknown,EFO:0001086,A549,University of Washington,"John Stamatoyannopoulos, UW",ENCFF180FXV,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/12...
3,human,Unknown,EFO:0001086,A549,University of Washington,"John Stamatoyannopoulos, UW",ENCFF723TWJ,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[1],[1_7],https://encode-public.s3.amazonaws.com/2017/12...
4,human,Unknown,EFO:0002784,GM12878,University of Washington,"John Stamatoyannopoulos, UW",ENCFF901GZH,released,DNase-seq,DNase-seq,...,signal,read-depth normalized signal,,,,26,3.0,[2],[2_1],https://encode-public.s3.amazonaws.com/2017/09...


### Keeping only latest encode version of each file

In [9]:
combined["string_biological_replicates"] = combined["biological_replicates"].astype(str)
filtered_combined = combined.sort_values("encode_version").groupby([
    "target",
    "cell_line",
    "assay_title",
    "institute_name",
    "string_biological_replicates"
]).last().reset_index()

filtered_combined.to_csv("epigenomic_dataset/epigenomes_metadata/hg19.csv", index=False)

In [10]:
filtered_combined

Unnamed: 0,target,cell_line,assay_title,institute_name,string_biological_replicates,organism,term_id,title,accession,status,...,output_category,output_type,read_length,read_length_units,run_type,schema_version,encode_version,biological_replicates,technical_replicates,url
0,ADNP,K562,TF ChIP-seq,University of Chicago,"[1, 2]",human,EFO:0002067,"Kevin White, UChicago",ENCFF946EOR,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/04...
1,AEBP2,HEK293,TF ChIP-seq,Stanford University,"[1, 2]",human,EFO:0001182,"Michael Snyder, Stanford",ENCFF649VIZ,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/09...
2,AFF1,K562,TF ChIP-seq,Stanford University,"[1, 2]",human,EFO:0002067,"Michael Snyder, Stanford",ENCFF870PDS,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/05...
3,AGO1,K562,TF ChIP-seq,University of California at San Diego,"[1, 2]",human,EFO:0002067,"Xiang-Dong Fu, UCSD",ENCFF054VBS,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/02...
4,AGO2,HepG2,TF ChIP-seq,University of California at San Diego,"[1, 2]",human,EFO:0001187,"Xiang-Dong Fu, UCSD",ENCFF010SVM,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/02...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,ZSCAN5A,HEK293,TF ChIP-seq,Stanford University,"[1, 2]",human,EFO:0001182,"Michael Snyder, Stanford",ENCFF446MUO,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/09...
1037,ZSCAN5C,HEK293,TF ChIP-seq,Stanford University,"[1, 2]",human,EFO:0001182,"Michael Snyder, Stanford",ENCFF881ROP,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2017/03...
1038,ZSCAN9,HepG2,TF ChIP-seq,HudsonAlpha Institute for Biotechnology,"[1, 2]",human,EFO:0001187,"Richard Myers, HAIB",ENCFF572RWA,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2018/11...
1039,ZXDB,HEK293,TF ChIP-seq,Stanford University,"[1, 2]",human,EFO:0001182,"Michael Snyder, Stanford",ENCFF342SQC,released,...,signal,fold change over control,,,,26,3.0,"[1, 2]","[1_1, 2_1]",https://encode-public.s3.amazonaws.com/2016/09...
