In [1]:
import pandas
from pathlib import Path
from pprint import pprint
import re
import sys
from collections import Counter, namedtuple

In [2]:
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)
from encoded_client.encoded import ENCODED

In [3]:
server = ENCODED("www.encodeproject.org")

In [4]:
query = server.search_jsonld(type="Experiment", assay_title="scRNA-seq", limit="all", **{"lab.title":  "Ali Mortazavi, UCI"})
print(len(query["@graph"]))

140


In [5]:
query["@graph"][0]["replicates"][0]["library"]["biosample"]

{'organism': {'scientific_name': 'Mus musculus'},
 'age_display': '4 days',
 'life_stage': 'unknown',
 'accession': 'ENCBS937WPV',
 'subcellular_fraction_term_name': 'nucleus'}

In [6]:
metadata = []
prefix = "ali-mortazavi:"
for row in query["@graph"]:
    experiment = server.get_json(row["@id"])
    experiment_aliases = experiment["aliases"]
    assert len(experiment_aliases) == 1
    experiment_aliases = experiment_aliases[0][len(prefix):]

    read_count = 0
    for f in experiment["files"]:
        if f["file_type"] == "fastq":
            read_count = f["read_count"]
            break

    for replicate in experiment["replicates"]:
        #replicate = server.get_json(replicate_stub["@id"])
        library = replicate["library"]
        library_id = library["@id"]
        library_accession = library["accession"]
        library_aliases = library["aliases"]
        assert len(library_aliases) == 1
        library_aliases = library_aliases[0][len(prefix):]
        biosample = library["biosample"]
        biosample_accession = biosample["accession"]
        biosample_id = biosample["@id"]
        biosample_aliases = biosample["aliases"]
        subcellular_fraction = biosample.get("subcellular_fraction_term_name", "")
        assert len(biosample_aliases) == 1
        biosample_aliases = biosample_aliases[0][len(prefix):]

        metadata.append({
            "experiment": row["accession"],
            "experiment_aliases": experiment_aliases,
            "library": library_accession,
            "library_alias": library_aliases,
            "read_count": read_count,
            "biosample": biosample_accession,
            "biosample_alias": biosample_aliases,
            "subcellular_fraction": subcellular_fraction,
            "biosample_description": biosample["description"],
        })

metadata = pandas.DataFrame(metadata)
metadata

Unnamed: 0,experiment,experiment_aliases,library,library_alias,read_count,biosample,biosample_alias,subcellular_fraction,biosample_description
0,ENCSR193PLQ,exp_sr_P4_female_adrenal,ENCLB098ERF,library_sr_P4_female_adrenal_1,7575705,ENCBS937WPV,biosample_P4_female_adrenal_1,nucleus,B6Cast F1 P4 female adrenal 7
1,ENCSR193PLQ,exp_sr_P4_female_adrenal,ENCLB082IOS,library_sr_P4_female_adrenal_2,7575705,ENCBS083TAX,biosample_P4_female_adrenal_2,nucleus,B6Cast F1 P4 female adrenal 8
2,ENCSR962DRE,exp_sr_P25_female_adrenal,ENCLB523MJQ,library_sr_P25_female_adrenal_1,15650091,ENCBS575HZP,biosample_P25_female_adrenal_1,nucleus,B6Cast F1 P25 female adrenal 7
3,ENCSR962DRE,exp_sr_P25_female_adrenal,ENCLB697JVD,library_sr_P25_female_adrenal_2,15650091,ENCBS875QSI,biosample_P25_female_adrenal_2,nucleus,B6Cast F1 P25 female adrenal 8
4,ENCSR331DYV,exp_sr_P2mo_male_adrenal,ENCLB469XPD,library_sr_P2mo_male_adrenal_1,8817023,ENCBS033LDD,biosample_P2mo_male_adrenal_1,nucleus,B6Cast F1 P2mo male adrenal 7
...,...,...,...,...,...,...,...,...,...
269,ENCSR477EIC,exp_sr_P2mo_female_gastrocnemius_shallow,ENCLB983QCI,library_sr_P2mo_female_gastrocnemius_2_shallow,13411124,ENCBS124VMY,biosample_P2mo_female_gastrocnemius_2,nucleus,B6Cast F1 P2mo female gastrocnemius 8
270,ENCSR062ICV,exp_sr_P4_female_cortex_deep,ENCLB401ZUR,library_sr_P4_female_cortex_1_deep,2799625,ENCBS075VEC,biosample_P4_female_cortex_1,nucleus,B6Cast F1 P4 female cortex 7
271,ENCSR062ICV,exp_sr_P4_female_cortex_deep,ENCLB440ABL,library_sr_P4_female_cortex_2_deep,2799625,ENCBS431VFB,biosample_P4_female_cortex_2,nucleus,B6Cast F1 P4 female cortex 8
272,ENCSR877UAI,exp_sr_P4_male_adrenal_deep,ENCLB343SXS,library_sr_P4_male_adrenal_2_deep,4701514,ENCBS314KWD,biosample_P4_male_adrenal_2,nucleus,B6Cast F1 P4 male adrenal 8


In [7]:
metadata = metadata.sort_values("experiment_aliases")

In [13]:
c2c12 = metadata[metadata["experiment_aliases"].apply(lambda x: "c2c12" in x)]
c2c12

Unnamed: 0,experiment,experiment_aliases,library,library_alias,read_count,biosample,biosample_alias,subcellular_fraction,biosample_description
266,ENCSR388EIN,sr_1k_c2c12_sc_mb,ENCLB952MZJ,library_sr_1k_c2c12_sc_mb,8527474,ENCBS521YWL,biosample_c2c12_sc_mb,,C2C12 single-cell myoblasts
263,ENCSR877NAX,sr_1k_c2c12_sn_mb,ENCLB514RKT,library_sr_1k_c2c12_sn_mb,11081223,ENCBS431NOZ,biosample_c2c12_sn_mb,nucleus,C2C12 single-nucleus myoblasts
104,ENCSR572SMD,sr_1k_c2c12_sn_mt,ENCLB273ZWE,library_sr_1k_c2c12_sn_mt,11335807,ENCBS978ZNQ,biosample_c2c12_sn_mt,nucleus,C2C12 single-nucleus myotubes
144,ENCSR503QZJ,sr_9k_c2c12_sc_mb,ENCLB129LMS,library_sr_9k_c2c12_sc_mb,38274516,ENCBS521YWL,biosample_c2c12_sc_mb,,C2C12 single-cell myoblasts
267,ENCSR452OYH,sr_9k_c2c12_sn_mb,ENCLB294FBZ,library_sr_9k_c2c12_sn_mb,44213518,ENCBS431NOZ,biosample_c2c12_sn_mb,nucleus,C2C12 single-nucleus myoblasts
111,ENCSR109DAN,sr_9k_c2c12_sn_mt,ENCLB527JDU,library_sr_9k_c2c12_sn_mt,23120573,ENCBS978ZNQ,biosample_c2c12_sn_mt,nucleus,C2C12 single-nucleus myotubes


In [9]:
[(x["accession"], x["read_count"]) for x in experiment["files"] if x["file_type"] == "fastq"]

[('ENCFF496PNG', 4701514),
 ('ENCFF795FHL', 2330829),
 ('ENCFF077TDR', 4701514),
 ('ENCFF389CIY', 2330829)]

In [14]:
metadata.to_excel("uci-sc-experiment-decoder.xlsx")
c2c12.to_excel("c2c12/c2c12-encode-portal.xlsx")

In [11]:
experiment["assay_term_name"]

'single-cell RNA sequencing assay'

In [12]:
server.get_json("ENCSR877NAX")["assay_term_name"]

'single-cell RNA sequencing assay'