# Introduction

We discussed what to do with the pipeline configuration files, and it was suggested that I submit them as attachments.

The attachments may also need to be added to the Analysis objects that Jennifer added.

The DCC doesn't support yaml, so I'll probably need to submit them as plain text.

In [1]:
from datetime import datetime
from pathlib import Path
import pandas
import os
import sys
from urllib.parse import quote_plus

In [2]:
# Put the local encoded_client checkout on sys.path (only once, so re-running
# the cell does not add duplicates) so that the import below can resolve.
# NOTE(review): the checkout location is hard-coded to ~/proj/encoded_client.
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)

from encoded_client.encoded import ENCODED, DCCValidator, Document, HTTPError

In [3]:
# Build one record per library directory found at
# production/<batch>/<library_id>/.  Each record describes the config file that
# should be attached for that library; "uuid" and "md5sum" are placeholders
# that are left as None here.
library_configs = []
production = Path("production")
for lab in production.iterdir():
    # Only directories can contain libraries.  A stray file sitting directly
    # in production/ would otherwise make lab.iterdir() raise
    # NotADirectoryError and abort the whole scan.
    if not lab.is_dir():
        continue
    for library in lab.iterdir():
        if library.is_dir():
            # the directory name is the library accession
            library_id = library.name
            library_configs.append({
                "uuid": None,
                "local_filename": library / "config.yaml",
                # the remote name gets a .txt suffix because the file is
                # submitted as text/plain
                "remote_filename": str(library / "config.yaml.txt"),
                "mime_type": "text/plain",
                "document type": "workflow metadata",
                "md5sum": None,
                "library_id": library_id
            })
library_configs = pandas.DataFrame(library_configs)
print("Total", library_configs.shape[0])
library_configs

Total 616


Unnamed: 0,uuid,local_filename,remote_filename,mime_type,document type,md5sum,library_id
0,,production/stanford/ENCLB527WWJ/config.yaml,production/stanford/ENCLB527WWJ/config.yaml.txt,text/plain,workflow metadata,,ENCLB527WWJ
1,,production/stanford/ENCLB619VVF/config.yaml,production/stanford/ENCLB619VVF/config.yaml.txt,text/plain,workflow metadata,,ENCLB619VVF
2,,production/stanford/ENCLB002DZK/config.yaml,production/stanford/ENCLB002DZK/config.yaml.txt,text/plain,workflow metadata,,ENCLB002DZK
3,,production/stanford/ENCLB814PUF/config.yaml,production/stanford/ENCLB814PUF/config.yaml.txt,text/plain,workflow metadata,,ENCLB814PUF
4,,production/stanford/ENCLB280ZGL/config.yaml,production/stanford/ENCLB280ZGL/config.yaml.txt,text/plain,workflow metadata,,ENCLB280ZGL
...,...,...,...,...,...,...,...
611,,production/uci/ENCLB595FHR/config.yaml,production/uci/ENCLB595FHR/config.yaml.txt,text/plain,workflow metadata,,ENCLB595FHR
612,,production/uci/ENCLB947PWM/config.yaml,production/uci/ENCLB947PWM/config.yaml.txt,text/plain,workflow metadata,,ENCLB947PWM
613,,production/uci/ENCLB106VMV/config.yaml,production/uci/ENCLB106VMV/config.yaml.txt,text/plain,workflow metadata,,ENCLB106VMV
614,,production/uci/ENCLB327QWK/config.yaml,production/uci/ENCLB327QWK/config.yaml.txt,text/plain,workflow metadata,,ENCLB327QWK


In [4]:
# Choose which portal every later cell talks to.  The commented-out line
# points at a different host (presumably the test portal -- confirm) and can be
# swapped in instead of the production host.
#server = ENCODED("test.encodedcc.org")
server = ENCODED("www.encodeproject.org")
# submit_document() reads this `validator` as a global when it creates documents.
validator = DCCValidator(server)

In [5]:
def get_experiment_by_library_id(server, library_id):
    """Return the JSON of the first Experiment found for a library accession.

    A free-text search for ``library_id`` is run on ``server``; the ``@id`` of
    the first hit whose ``@type`` contains ``'Experiment'`` is then fetched
    with ``server.get_json`` and returned.  ``None`` is returned when no hit is
    an Experiment.
    """
    search_hits = server.search_jsonld(searchTerm=library_id)["@graph"]
    experiment_path = next(
        (hit["@id"] for hit in search_hits if 'Experiment' in hit["@type"]),
        None,
    )
    if experiment_path is None:
        return None
    return server.get_json(experiment_path)

def get_analysis_by_library_id(server, library_id):
    """Return the default analysis of the experiment that owns a library.

    The experiment is looked up with ``get_experiment_by_library_id``.  Its
    ``default_analysis`` value is then matched against the ``@id`` of each
    entry in the experiment's ``analyses`` list, and the matching entry is
    returned.

    Returns ``None`` when no experiment is found, when the experiment has no
    ``default_analysis``, or when no entry in ``analyses`` matches it.
    """
    experiment = get_experiment_by_library_id(server, library_id)
    if experiment is None:
        return None

    default_id = experiment.get("default_analysis")
    if default_id is None:
        return None

    return next(
        (candidate for candidate in experiment["analyses"] if candidate["@id"] == default_id),
        None,
    )

# Smoke test against one known library.  The helper returns None when nothing
# is found, in which case the ["@id"] subscript raises TypeError.
get_analysis_by_library_id(server, "ENCLB280ZGL")["@id"]

'/analyses/ENCAN471PLI/'

In [6]:
def get_posted_documents(server):
    """Return a DataFrame describing the lab's documents found on ``server``.

    One row is produced per entry of the search result's ``@graph``, with the
    columns ``date_created`` (parsed to a timezone-aware datetime), ``@id``,
    ``description``, ``document_type``, ``mime_type``, ``remote_filename`` and
    ``href``.  The last three come from the document's ``attachment``.

    The column set is fixed explicitly, so an empty search result still yields
    a frame with those columns (and zero rows).  Without this, downstream
    lookups such as ``posted["remote_filename"]`` raise KeyError on a
    column-less frame.

    Raises KeyError if a returned document lacks one of the fields read here
    (for example ``attachment`` or ``description``).
    """
    columns = [
        "date_created",
        "@id",
        "description",
        "document_type",
        "mime_type",
        "remote_filename",
        "href",
    ]
    # NOTE(review): no explicit result limit is requested; confirm the portal
    # returns every matching document rather than only a first page.
    query = "/search/?searchTerm=@Documents&lab.title=Barbara%20Wold,%20Caltech"
    graph = server.get_json(query)

    posted_documents = []
    for row in graph["@graph"]:
        attachment = row["attachment"]
        posted_documents.append({
            "date_created": datetime.strptime(row["date_created"], "%Y-%m-%dT%H:%M:%S.%f%z"),
            "@id": row["@id"],
            "description": row["description"],
            "document_type": row["document_type"],
            "mime_type": attachment["type"],
            # the attachment's download name is what later filename lookups
            # compare against
            "remote_filename": attachment["download"],
            "href": attachment["href"]
        })
    return pandas.DataFrame(posted_documents, columns=columns)

# Snapshot of the documents currently on the server; later cells look names up
# in this frame.
posted_documents = get_posted_documents(server)
posted_documents.head()

Unnamed: 0,date_created,@id,description,document_type,mime_type,remote_filename,href
0,2014-06-27 17:43:53.305108+00:00,/documents/e84de88e-4c31-4b2a-b6ac-96feb764b2d0/,mRNA Selection Protocol,extraction protocol,application/pdf,dynabeads_mrna_purification_man.pdf,@@download/attachment/dynabeads_mrna_purificat...
1,2017-09-25 18:17:18.814210+00:00,/documents/92251b87-30e2-4a32-a3e3-c4b3c7856329/,This document describes the Fluidigm C1 cell i...,other,application/pdf,C1_mRNA-Seq_pr_100-7168L1 (2).pdf,@@download/attachment/C1_mRNA-Seq_pr_100-7168L...
2,2022-06-13 22:51:25.495498+00:00,/documents/5441e3af-f996-4a51-b2a2-982e3427a6bc/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fuci%2FENCLB217YGZ%2Fconfig.yaml.txt,@@download/attachment/production%252Fuci%252FE...
3,2022-06-13 22:51:34.446087+00:00,/documents/89831fba-ae30-4af5-a619-3eadaba73f72/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fstanford%2FENCLB123ZDT%2Fconfig.y...,@@download/attachment/production%252Fstanford%...
4,2022-06-13 22:52:43.578942+00:00,/documents/484d361b-8d11-433d-8398-44f58eb0f700/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fstanford%2FENCLB526RVS%2Fconfig.y...,@@download/attachment/production%252Fstanford%...


In [7]:
#posted_documents.set_index("remote_filename").loc['production/stanford/ENCLB527WWJ/config.yaml.txt']

In [13]:
# Spot-check: print every posted attachment name that mentions this library.
for name in posted_documents["remote_filename"]:
    if "ENCLB527WWJ" in name:
        print(name)

In [14]:
def get_document_from_posted(posted, filename):
    """Look up an already-posted document by its unescaped attachment name.

    Parameters
    ----------
    posted : pandas.DataFrame
        Table of posted documents, as built by ``get_posted_documents``.
        Its ``remote_filename`` column holds URL-escaped names.
    filename : str
        Unescaped attachment name.  It is escaped with ``quote_plus`` before
        comparing, since the posted names are stored escaped.

    Returns
    -------
    pandas.Series or None
        The first matching row, or ``None`` when nothing matches.  ``None`` is
        also returned when ``posted`` has no ``remote_filename`` column, which
        is what an empty search result looks like.
    """
    escaped = quote_plus(filename)
    print("searching {}".format(escaped))

    # A frame built from zero records has no columns at all; treat that as
    # "nothing posted" instead of failing with KeyError.
    if "remote_filename" not in posted.columns:
        return None

    # Boolean-mask indexing always yields a DataFrame, so no other result type
    # needs handling.
    candidates = posted[posted["remote_filename"] == escaped]
    if candidates.empty:
        return None

    # Positional access returns a single row even if index labels repeat,
    # where a label-based lookup would return a DataFrame.
    return candidates.iloc[0]
    
# Manual checks of the lookup: one name that is in the posted list, and one
# that should not exist (expected to print None).
#print(get_document_from_posted(posted_documents, 'production/stanford/ENCLB527WWJ/config.yaml.txt'))
print(get_document_from_posted(posted_documents, 'production/stanford/ENCLB526RVS/config.yaml.txt'))
print(get_document_from_posted(posted_documents, "hope_not_real.txt"))


searching production%2Fstanford%2FENCLB526RVS%2Fconfig.yaml.txt
date_created                        2022-06-13 22:52:43.578942+00:00
@id                 /documents/484d361b-8d11-433d-8398-44f58eb0f700/
description                Configuration file for scRNA-seq pipeline
document_type                                      workflow metadata
mime_type                                                 text/plain
remote_filename    production%2Fstanford%2FENCLB526RVS%2Fconfig.y...
href               @@download/attachment/production%252Fstanford%...
Name: 4, dtype: object
searching hope_not_real.txt
None


In [9]:
# NOTE(review): this name has no ".txt" suffix, unlike the remote filenames
# built for library_configs -- confirm the miss is not just due to that.
print(get_document_from_posted(posted_documents, 'production/stanford_heart_20220621/ENCLB138XBO/config.yaml'))


None


In [10]:
# Show the full table of posted documents fetched earlier.
posted_documents

Unnamed: 0,date_created,@id,description,document_type,mime_type,remote_filename,href
0,2014-06-27 17:43:53.305108+00:00,/documents/e84de88e-4c31-4b2a-b6ac-96feb764b2d0/,mRNA Selection Protocol,extraction protocol,application/pdf,dynabeads_mrna_purification_man.pdf,@@download/attachment/dynabeads_mrna_purificat...
1,2017-09-25 18:17:18.814210+00:00,/documents/92251b87-30e2-4a32-a3e3-c4b3c7856329/,This document describes the Fluidigm C1 cell i...,other,application/pdf,C1_mRNA-Seq_pr_100-7168L1 (2).pdf,@@download/attachment/C1_mRNA-Seq_pr_100-7168L...
2,2022-06-13 22:51:25.495498+00:00,/documents/5441e3af-f996-4a51-b2a2-982e3427a6bc/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fuci%2FENCLB217YGZ%2Fconfig.yaml.txt,@@download/attachment/production%252Fuci%252FE...
3,2022-06-13 22:51:34.446087+00:00,/documents/89831fba-ae30-4af5-a619-3eadaba73f72/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fstanford%2FENCLB123ZDT%2Fconfig.y...,@@download/attachment/production%252Fstanford%...
4,2022-06-13 22:52:43.578942+00:00,/documents/484d361b-8d11-433d-8398-44f58eb0f700/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fstanford%2FENCLB526RVS%2Fconfig.y...,@@download/attachment/production%252Fstanford%...
5,2022-06-13 22:51:37.623874+00:00,/documents/8d6dd36c-210d-4d6d-ba26-c8582db6123e/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fstanford%2FENCLB063XDV%2Fconfig.y...,@@download/attachment/production%252Fstanford%...
6,2022-06-13 22:54:29.683734+00:00,/documents/f1971706-69cd-4e8a-8b3e-801aa6c73ebe/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fuci%2FENCLB870FRH%2Fconfig.yaml.txt,@@download/attachment/production%252Fuci%252FE...
7,2022-06-13 22:54:12.522463+00:00,/documents/b56fe10c-42d3-48ba-b49a-936086b66350/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fuci%2FENCLB057CVT%2Fconfig.yaml.txt,@@download/attachment/production%252Fuci%252FE...
8,2022-06-13 22:51:39.678880+00:00,/documents/279a8145-bf10-4e4a-b792-86905d1b20bf/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fstanford%2FENCLB025GOX%2Fconfig.y...,@@download/attachment/production%252Fstanford%...
9,2022-06-13 22:55:42.400302+00:00,/documents/fd8559fa-7535-4b44-ad3e-03f29d5ebc40/,Configuration file for scRNA-seq pipeline,workflow metadata,text/plain,production%2Fuci%2FENCLB215OBG%2Fconfig.yaml.txt,@@download/attachment/production%252Fuci%252FE...


In [15]:
def submit_document(server, row, library_configs, dry_run=True, posted=None):
    """Find or create the workflow-metadata document for one config file.

    The function looks for an already-posted document with the same remote
    filename.  If there is none it creates one (only when ``dry_run`` is
    False).  It then reports which analysis does not list the document yet.
    It does not patch the analysis itself.

    Parameters
    ----------
    server
        Connection used for all lookups and, when not a dry run, for creating
        the document.
    row
        One row of the config table; ``local_filename`` and
        ``remote_filename`` are read from it.
    library_configs : pandas.DataFrame
        Table with ``remote_filename`` and ``library_id`` columns, used to map
        the document's filename back to its library accession.
    dry_run : bool
        When True nothing is created, and ``document_id`` is the placeholder
        ``"would create"`` if no posted document matches.
    posted : pandas.DataFrame, optional
        Result of ``get_posted_documents(server)``.  Pass it when submitting
        many rows so the list is not downloaded again for each one.  It is
        fetched when omitted.

    Returns
    -------
    dict
        Keys ``create_document_log``, ``library_id``, ``document_id``,
        ``analysis_id`` and ``filename``.  ``analysis_id`` is filled in only
        when the library has a default analysis that does not list the
        document yet; otherwise it stays None.

    Notes
    -----
    Reads the module-level ``validator`` when creating a document.
    Raises AssertionError if the filename already contains ``%`` or if
    document creation did not report success.
    """
    result = {
        "create_document_log": None,
        "library_id": None,
        "document_id": None,
        "analysis_id": None,
        "filename": None
    }

    library_configs = library_configs.set_index("remote_filename")
    workflow_doc = Document(
        row.local_filename,
        "workflow metadata",
        "Configuration file for scRNA-seq pipeline",
        filename=row.remote_filename,
        server=server,
    )
    result["filename"] = workflow_doc.filename

    workflow_remote_filename = workflow_doc.filename
    library_id = library_configs.loc[workflow_remote_filename]["library_id"]

    # try to block double escaping: the lookup below escapes the name itself
    assert "%" not in workflow_remote_filename
    if posted is None:
        posted = get_posted_documents(server)
    posted_document = get_document_from_posted(posted, workflow_remote_filename)

    # 1) find the document, or create it when it has not been posted yet
    if posted_document is None:
        if not dry_run:
            result["create_document_log"] = workflow_doc.create_if_needed(server, workflow_doc.uuid, validator)
            if result["create_document_log"]["status"] == "success":
                workflow_id = result["create_document_log"]["@graph"][0]["@id"]
            else:
                workflow_id = None
        else:
            workflow_id = "would create"
    else:
        workflow_id = posted_document["@id"]

    # 2) report the analysis object the document still has to be attached to
    assert workflow_id is not None, "document creation failed for {}".format(workflow_remote_filename)
    assert library_id is not None
    result["library_id"] = library_id
    result["document_id"] = workflow_id

    analysis = get_analysis_by_library_id(server, library_id)
    if analysis is None:
        # No experiment or no default analysis: there is nothing to attach to.
        print("no default analysis found for {}".format(library_id))
    elif workflow_id not in analysis.get("documents", []):
        # An analysis may carry no "documents" key at all; treat that as an
        # empty list.  The actual attachment is deliberately not done here; it
        # would be server.patch_json(analysis["@id"], {"documents": [...]}).
        result["analysis_id"] = analysis["@id"]

    return result

# Dry-run a single row (index label 556) as a sanity check before looping over
# the whole table.
result = submit_document(server, library_configs.loc[556], library_configs, dry_run=True)
result

searching production%2Fuci%2FENCLB419YTE%2Fconfig.yaml.txt


{'create_document_log': None,
 'library_id': 'ENCLB419YTE',
 'document_id': 'would create',
 'analysis_id': '/analyses/ENCAN597RYW/',
 'filename': 'production/uci/ENCLB419YTE/config.yaml.txt'}

In [None]:
# Dry-run every config.  Each call downloads the posted-document list again
# inside submit_document(), so this loop makes several requests per row.
results = []
for i, row in library_configs.iterrows():
    results.append(submit_document(server, row, library_configs, dry_run=True))
# Uncomment to stop after the first few rows while testing:
#    if i > 5:
#        break

In [None]:
# Save the per-library outcome as a TSV named after the server it was checked
# against; the creation log column is left out of the file.
report_columns = ["library_id", "document_id", "analysis_id", "filename"]
posted_results = pandas.DataFrame(results)
report_name = "document_analysis_{}.tsv".format(server.server)
posted_results[report_columns].to_csv(report_name, sep="\t", index=False)
posted_results

In [None]:
# Keep only the configs whose batch directory (second path component of
# local_filename) is the 2022-08-10 Stanford heart batch.
latest_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda lib: Path(lib).parts[1] == "stanford_heart_20220810"
    )
]


In [None]:
# Disabled on purpose: change the guard to True to dry-run the
# stanford_heart_20220810 batch again.
if False:
    latest_results = []
    for i, row in latest_configs.iterrows():
        outcome = submit_document(server, row, library_configs, dry_run=True)
        latest_results.append(outcome)

    latest_results = pandas.DataFrame(latest_results)
    report_columns = ["library_id", "document_id", "analysis_id", "filename"]
    target_name = Path("document_analysis_{}_20220810.tsv".format(server.server))
    # never overwrite a report left by an earlier run
    if not target_name.exists():
        latest_results[report_columns].to_csv(target_name, sep="\t", index=False)
    # NOTE: inside the `if` suite this expression is evaluated but not displayed
    latest_results.head()

In [None]:
# Keep only the configs whose batch directory (second path component of
# local_filename) is liz_reprocess.
liz_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda lib: Path(lib).parts[1] == "liz_reprocess"
    )
]
liz_configs


In [None]:
# Disabled on purpose: change the guard to True to dry-run the liz_reprocess
# batch again.
if False:
    liz_results = []
    for i, row in liz_configs.iterrows():
        outcome = submit_document(server, row, liz_configs, dry_run=True)
        liz_results.append(outcome)

    liz_results = pandas.DataFrame(liz_results)
    report_columns = ["library_id", "document_id", "analysis_id", "filename"]
    target_name = Path("document_analysis_{}_liz_resubmit.tsv".format(server.server))
    # never overwrite a report left by an earlier run
    if not target_name.exists():
        liz_results[report_columns].to_csv(target_name, sep="\t", index=False)
    print(liz_results.head())

In [None]:
# Hand-entered table pairing experiment accessions with a library accession and
# the alias of the experiment's analysis.
# NOTE(review): the ENCSR980OCK row has an experiment accession in the
# library_id column, and ENCLB398IAZ appears for two experiments -- confirm.
analysis_rows = [
    ("ENCSR398YBK", "ENCLB398IAZ", "barbara-wold:ENCSR398YBK_analysis"),
    ("ENCSR231FNL", "ENCLB398IAZ", "barbara-wold:ENCSR231FNL_analysis"),
    ("ENCSR176WWW", "ENCLB872TNB", "barbara-wold:ENCSR176WWW_analysis"),
    ("ENCSR980OCK", "ENCSR980OCK", "barbara-wold:ENCSR980OCK_analysis"),
    ("ENCSR814LMX", "ENCLB366ZFV", "barbara-wold:ENCSR814LMX_analysis"),
    ("ENCSR067BOK", "ENCLB849AUZ", "barbara-wold:ENCSR067BOK_analysis"),
]
analyses = pandas.DataFrame(
    analysis_rows,
    columns=["experiment_id", "library_id", "alias"],
)

analyses

In [None]:
# NOTE(review): latest_results is only assigned inside the disabled `if 0:`
# cell above, so on a fresh kernel this line raises NameError.
latest_results

In [None]:
# Show the full table of local config files again for reference.
library_configs

In [None]:
# Report query for Analysis objects of single-cell RNA-seq files, for pipeline
# ENCPL257SYI, that have no documents attached (documents!=*).
missing_docs_query = "https://www.encodeproject.org/report/?type=Analysis&files.assay_term_name=single-cell+RNA+sequencing+assay&documents!=*&pipelines=%2Fpipelines%2FENCPL257SYI%2F&field=%40id&field=status&field=pipeline_award_rfas&field=assembly&field=genome_annotation&field=datasets"

# One record per (analysis, replicate library); only the first dataset of each
# analysis is followed.
experiments_missing_docs = []
for row in server.get_json(missing_docs_query)["@graph"]:
    experiment = server.get_json(row["datasets"][0])
    experiments_missing_docs.extend(
        {
            "experiment": experiment["accession"],
            "library": replicate["library"]["accession"],
            "analyses": row["@id"],
            "date_created": experiment["date_created"],
            "summary": experiment["simple_biosample_summary"],
        }
        for replicate in experiment["replicates"]
    )

experiments_missing_docs = pandas.DataFrame(experiments_missing_docs)
experiments_missing_docs

In [None]:
# Which of the local config files belong to a library whose analysis still has
# no documents?
libraries_missing_docs = set(experiments_missing_docs["library"])
library_configs[
    library_configs["library_id"].isin(libraries_missing_docs)
]

In [None]:
# List the batch directories under production/.  This relies on IPython's
# automagic `ls`; it is not plain Python.
ls production

# Sept 22 heart

In [None]:
# Keep only the configs whose batch directory (second path component of
# local_filename) is stanford_heart_20220822.
stanford_sept22_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda lib: Path(lib).parts[1] == "stanford_heart_20220822"
    )
]
stanford_sept22_configs


In [None]:
# Disabled on purpose: change the guard to True to dry-run the
# stanford_heart_20220822 batch again.
if False:
    stanford_sept22_results = []
    for i, row in stanford_sept22_configs.iterrows():
        outcome = submit_document(server, row, stanford_sept22_configs, dry_run=True)
        stanford_sept22_results.append(outcome)

    stanford_sept22_results = pandas.DataFrame(stanford_sept22_results)
    report_columns = ["library_id", "document_id", "analysis_id", "filename"]
    target_name = Path("document_analysis_{}_stanford_20220822.tsv".format(server.server))
    # never overwrite a report left by an earlier run
    if not target_name.exists():
        stanford_sept22_results[report_columns].to_csv(target_name, sep="\t", index=False)
    print(stanford_sept22_results.head())
    print(stanford_sept22_results.shape[0])

# Snyder Liver 2023 01 10

In [18]:
# Keep only the configs whose batch directory (second path component of
# local_filename) is snyder_liver_20230110.
liver_jan10_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda lib: Path(lib).parts[1] == "snyder_liver_20230110"
    )
]
liver_jan10_configs


Unnamed: 0,uuid,local_filename,remote_filename,mime_type,document type,md5sum,library_id
129,,production/snyder_liver_20230110/ENCLB540OBF/c...,production/snyder_liver_20230110/ENCLB540OBF/c...,text/plain,workflow metadata,,ENCLB540OBF
130,,production/snyder_liver_20230110/ENCLB823WAF/c...,production/snyder_liver_20230110/ENCLB823WAF/c...,text/plain,workflow metadata,,ENCLB823WAF
131,,production/snyder_liver_20230110/ENCLB566WWJ/c...,production/snyder_liver_20230110/ENCLB566WWJ/c...,text/plain,workflow metadata,,ENCLB566WWJ
132,,production/snyder_liver_20230110/ENCLB054SXC/c...,production/snyder_liver_20230110/ENCLB054SXC/c...,text/plain,workflow metadata,,ENCLB054SXC
133,,production/snyder_liver_20230110/ENCLB401DAZ/c...,production/snyder_liver_20230110/ENCLB401DAZ/c...,text/plain,workflow metadata,,ENCLB401DAZ
134,,production/snyder_liver_20230110/ENCLB851FFF/c...,production/snyder_liver_20230110/ENCLB851FFF/c...,text/plain,workflow metadata,,ENCLB851FFF
135,,production/snyder_liver_20230110/ENCLB401UMP/c...,production/snyder_liver_20230110/ENCLB401UMP/c...,text/plain,workflow metadata,,ENCLB401UMP
136,,production/snyder_liver_20230110/ENCLB713GHH/c...,production/snyder_liver_20230110/ENCLB713GHH/c...,text/plain,workflow metadata,,ENCLB713GHH


In [21]:
# Disabled on purpose.  WARNING: unlike the other batch cells this one passes
# dry_run=False, so enabling the guard creates documents on `server`.
if False:
    liver_jan10_results = []
    for i, row in liver_jan10_configs.iterrows():
        outcome = submit_document(server, row, liver_jan10_configs, dry_run=False)
        liver_jan10_results.append(outcome)

    liver_jan10_results = pandas.DataFrame(liver_jan10_results)
    report_columns = ["library_id", "document_id", "analysis_id", "filename"]
    target_name = Path("document_analysis_{}_snyder_liver_20230110.tsv".format(server.server))
    # never overwrite a report left by an earlier run
    if not target_name.exists():
        liver_jan10_results[report_columns].to_csv(target_name, sep="\t", index=False)
    print(liver_jan10_results.head())
    print(liver_jan10_results.shape[0])

searching production%2Fsnyder_liver_20230110%2FENCLB540OBF%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB823WAF%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB566WWJ%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB054SXC%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB401DAZ%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB851FFF%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB401UMP%2Fconfig.yaml.txt
searching production%2Fsnyder_liver_20230110%2FENCLB713GHH%2Fconfig.yaml.txt
                                 create_document_log   library_id  \
0  {'status': 'success', '@type': ['result'], '@g...  ENCLB540OBF   
1  {'status': 'success', '@type': ['result'], '@g...  ENCLB823WAF   
2  {'status': 'success', '@type': ['result'], '@g...  ENCLB566WWJ   
3  {'status': 'success', '@type': ['result'], '@g...  ENCLB054SXC   
4  {'status': 'success', '@type': ['res