# Introduction

We discussed what to do with the pipeline configuration files, and it was suggested that I submit them as attachments.

The attachments may also need to be added to the Analysis objects that Jennifer added.

The DCC doesn't support yaml, so I'll probably need to submit them as plain text.

In [None]:
from datetime import datetime
from pathlib import Path
import pandas
import os
import sys
from urllib.parse import quote_plus

In [None]:
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)

from encoded_client.encoded import ENCODED, DCCValidator, Document, HTTPError

In [None]:
# Build one record per library directory, expecting the layout
# production/<lab>/<library>/config.yaml.
library_configs = []
production = Path("production")
for lab in production.iterdir():
    # A plain file sitting directly under production/ would make
    # lab.iterdir() raise NotADirectoryError, so only descend into directories.
    if not lab.is_dir():
        continue
    for library in lab.iterdir():
        if library.is_dir():
            library_id = library.name
            library_configs.append({
                # uuid and md5sum are placeholders; nothing in this cell fills them in.
                "uuid": None,
                "local_filename": library / "config.yaml",
                # The upload name gets a .txt suffix because the configs are
                # submitted as plain text rather than yaml.
                "remote_filename": str(library / "config.yaml.txt"),
                "mime_type": "text/plain",
                # NOTE(review): this key contains a space, unlike the
                # "document_type" column built from posted documents -- confirm.
                "document type": "workflow metadata",
                "md5sum": None,
                "library_id": library_id
            })
library_configs = pandas.DataFrame(library_configs)
print("Total", library_configs.shape[0])
library_configs

In [None]:
# Choose the host to talk to. Swap the two assignments below to point at
# test.encodedcc.org (presumably the test instance -- confirm) instead of
# www.encodeproject.org.
#server = ENCODED("test.encodedcc.org")
server = ENCODED("www.encodeproject.org")
# Validator built from the selected server; submit_document() passes it to
# Document.create_if_needed() when it is not a dry run.
validator = DCCValidator(server)

In [None]:
def get_experiment_by_library_id(server, library_id):
    """Fetch the first Experiment that a search for ``library_id`` turns up.

    Runs ``server.search_jsonld(searchTerm=library_id)`` and walks the
    ``@graph`` rows in order. The first row whose ``@type`` contains
    ``'Experiment'`` is re-fetched with ``server.get_json`` using its ``@id``
    and that JSON is returned. Returns ``None`` when no row qualifies.
    """
    search_result = server.search_jsonld(searchTerm=library_id)
    # Lazy generator: rows after the first Experiment hit are never inspected.
    experiment_hits = (
        hit for hit in search_result["@graph"] if 'Experiment' in hit["@type"]
    )
    first_hit = next(experiment_hits, None)
    if first_hit is None:
        return None
    return server.get_json(first_hit["@id"])

def get_analysis_by_library_id(server, library_id):
    """Return the default analysis of the experiment found for ``library_id``.

    The experiment is looked up with ``get_experiment_by_library_id``. Its
    ``default_analysis`` value is compared against the ``@id`` of each entry
    in ``experiment["analyses"]`` and the first matching entry is returned.

    Returns ``None`` when no experiment is found, when the experiment has no
    ``default_analysis``, or when no listed analysis matches it.
    """
    experiment = get_experiment_by_library_id(server, library_id)
    default_id = None if experiment is None else experiment.get("default_analysis")
    if default_id is None:
        return None

    matching = (
        candidate
        for candidate in experiment["analyses"]
        if candidate["@id"] == default_id
    )
    return next(matching, None)

get_analysis_by_library_id(server, "ENCLB280ZGL")["@id"]

In [None]:
def get_posted_documents(server):
    """Return a DataFrame describing the documents found by the lab search.

    Issues a fixed search query through ``server.get_json`` and turns each
    ``@graph`` row that carries an ``attachment`` into one DataFrame row.

    Columns: ``date_created`` (parsed datetime), ``@id``, ``description``
    (``None`` when the row has none), ``document_type``, ``mime_type``
    (attachment ``type``), ``remote_filename`` (attachment ``download``) and
    ``href`` (attachment ``href``).

    The columns are always present, even when the search returns nothing, so
    callers can filter on ``remote_filename`` without special-casing an empty
    result.
    """
    columns = [
        "date_created",
        "@id",
        "description",
        "document_type",
        "mime_type",
        "remote_filename",
        "href",
    ]
    query = "/search/?searchTerm=@Documents&lab.title=Barbara%20Wold,%20Caltech"
    graph = server.get_json(query)

    posted_documents = []
    for row in graph["@graph"]:
        attachment = row.get("attachment")
        if attachment is None:
            # Without an attachment there is no filename to match against.
            continue
        posted_documents.append({
            "date_created": datetime.strptime(row["date_created"], "%Y-%m-%dT%H:%M:%S.%f%z"),
            "@id": row["@id"],
            "description": row.get("description"),
            "document_type": row["document_type"],
            "mime_type": attachment["type"],
            "remote_filename": attachment["download"],
            "href": attachment["href"]
        })
    # Passing columns= keeps the schema stable for an empty result.
    return pandas.DataFrame(posted_documents, columns=columns)

posted_documents = get_posted_documents(server)
posted_documents.head()

In [None]:
#posted_documents.set_index("remote_filename").loc['production/stanford/ENCLB527WWJ/config.yaml.txt']

In [None]:
def get_document_from_posted(posted, filename):
    """Return the first posted-document row whose remote filename matches.

    ``filename`` is escaped with ``quote_plus`` before comparison, because the
    ``remote_filename`` column is compared in its URL-escaped form.

    Parameters:
        posted: DataFrame of posted documents with a ``remote_filename`` column.
        filename: unescaped remote filename to look for.

    Returns:
        The first matching row as a ``pandas.Series``, or ``None`` when there
        is no match (including when ``posted`` is empty and has no columns).
    """
    if "remote_filename" not in posted.columns:
        # A frame built from zero records has no columns at all.
        return None

    escaped = quote_plus(filename)
    # Boolean-mask indexing always yields a DataFrame, so no other type needs
    # handling here.
    candidates = posted[posted["remote_filename"] == escaped]
    if candidates.empty:
        return None
    # Positional access returns a single row even if index labels repeat.
    return candidates.iloc[0]
    
print(get_document_from_posted(posted_documents, 'production/stanford/ENCLB527WWJ/config.yaml.txt'))
print(get_document_from_posted(posted_documents, "hope_not_real.txt"))


In [None]:
print(get_document_from_posted(posted_documents, 'production/stanford_heart_20220621/ENCLB138XBO/config.yaml'))


In [None]:
def submit_document(server, row, library_configs, dry_run=True):
    """Find or create the config document for one library and report on it.

    Parameters:
        server: client used for the document search and the analysis lookup.
        row: one record of ``library_configs``; its ``local_filename`` and
            ``remote_filename`` attributes are read.
        library_configs: DataFrame with ``remote_filename`` and ``library_id``
            columns, used to map the document's filename to its library.
        dry_run: when true (the default) nothing is created on the server and
            a missing document is reported with the id ``"would create"``.

    Returns:
        A dict with the keys ``create_document_log`` (the return value of
        ``create_if_needed``, only set when a document was created),
        ``library_id``, ``document_id``, ``filename`` and ``analysis_id``.
        ``analysis_id`` is the ``@id`` of the library's default analysis when
        that analysis does not list the document yet. It stays ``None`` when
        the document is already listed or when no default analysis is found.

    Raises:
        AssertionError: if the remote filename contains ``%`` or if creating
            the document did not report success.

    Note: document creation uses the module-level ``validator``. Attaching
    the document to the analysis is not performed here; only the target
    analysis id is reported.
    """
    result = {
        "create_document_log": None,
        "library_id": None,
        "document_id": None,
        "analysis_id": None,
        "filename": None
    }

    configs_by_filename = library_configs.set_index("remote_filename")
    workflow_doc = Document(
        row.local_filename,
        "workflow metadata",
        "Configuration file for scRNA-seq pipeline",
        filename=row.remote_filename,
        server=server,
    )
    workflow_remote_filename = workflow_doc.filename
    result["filename"] = workflow_remote_filename
    library_id = configs_by_filename.loc[workflow_remote_filename]["library_id"]

    # try to block double escaping: the lookup below escapes the name itself
    assert "%" not in workflow_remote_filename
    posted = get_posted_documents(server)
    posted_document = get_document_from_posted(posted, workflow_remote_filename)

    # 1) find the already-posted document, or create it
    if posted_document is not None:
        workflow_id = posted_document["@id"]
    elif dry_run:
        workflow_id = "would create"
    else:
        result["create_document_log"] = workflow_doc.create_if_needed(server, workflow_doc.uuid, validator)
        if result["create_document_log"]["status"] == "success":
            workflow_id = result["create_document_log"]["@graph"][0]["@id"]
        else:
            workflow_id = None

    # 2) attach document to analysis object
    assert workflow_id is not None
    assert library_id is not None
    result["library_id"] = library_id
    result["document_id"] = workflow_id

    analysis = get_analysis_by_library_id(server, library_id)
    if analysis is None:
        # No experiment or no default analysis for this library: report what
        # is known so a batch run over many libraries can continue.
        return result

    # An analysis without a "documents" entry is treated as having none.
    if workflow_id not in analysis.get("documents", []):
        # The actual patch is left disabled; only the target is recorded.
        #print("adding {} to {}".format(workflow_id, analysis["@id"]))
        #documents = analysis["documents"]
        #documents.append(workflow_id)
        #result = server.patch_json(analysis["@id"], {"documents": documents})
        #responses.append(result)
        result["analysis_id"] = analysis["@id"]

    return result

result = submit_document(server, library_configs.loc[556], library_configs, dry_run=True)
result

In [None]:
# Dry-run submit_document() for every library config and collect the result
# dicts. Each call fetches the posted-document list again, so this makes at
# least one server request per row.
results = []
for i, row in library_configs.iterrows():
    results.append(submit_document(server, row, library_configs, dry_run=True))
# Debugging aid: re-enable to stop after the first few rows.
#    if i > 5:
#        break

In [None]:
# Tabulate the per-library result dicts and save the id columns as a
# tab-separated file whose name embeds server.server (presumably the host
# name -- confirm). An existing file of that name is overwritten.
posted_results = pandas.DataFrame(results)
posted_results[["library_id", "document_id", "analysis_id", "filename"]].to_csv(
    "document_analysis_{}.tsv".format(server.server),
    sep="\t",
    index=False
)
posted_results

In [None]:
# Keep only the configs whose second path component (the lab directory in
# production/<lab>/<library>/config.yaml) is stanford_heart_20220810.
latest_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda config_path: Path(config_path).parts[1] == "stanford_heart_20220810"
    )
]


In [None]:
# Disabled cell (``if 0``): flip the condition to re-run the dry-run
# submission for the stanford_heart_20220810 configs. While it is disabled,
# ``latest_results`` is never defined.
if 0:
    latest_results = []
    for i, row in latest_configs.iterrows():
        latest_results.append(submit_document(server, row, library_configs, dry_run=True))
    #    if i > 5:
    #        break

    latest_results = pandas.DataFrame(latest_results)
    target_name = Path("document_analysis_{}_20220810.tsv".format(server.server))
    # Never overwrite a report from an earlier run.
    if not target_name.exists():
        latest_results[["library_id", "document_id", "analysis_id", "filename"]].to_csv(
            target_name,
            sep="\t",
            index=False
        )
    # A bare expression nested inside ``if`` is not displayed by the
    # notebook, so print the preview explicitly.
    print(latest_results.head())

In [None]:
# Keep only the configs whose second path component (the lab directory in
# production/<lab>/<library>/config.yaml) is liz_reprocess.
liz_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda config_path: Path(config_path).parts[1] == "liz_reprocess"
    )
]
liz_configs


In [None]:
# Disabled cell (``if 0``): flip the condition to re-run the dry-run
# submission for the liz_reprocess configs.
if 0:
    liz_results = []
    for i, row in liz_configs.iterrows():
        # liz_configs doubles as the filename -> library_id lookup table.
        liz_results.append(submit_document(server, row, liz_configs, dry_run=True))

    liz_results = pandas.DataFrame(liz_results)
    target_name = Path("document_analysis_{}_liz_resubmit.tsv".format(server.server))
    # Only write the report when it does not exist yet, so an earlier report
    # is never overwritten.
    if not target_name.exists():
        liz_results[["library_id", "document_id", "analysis_id", "filename"]].to_csv(
            target_name,
            sep="\t",
            index=False
        )
    print(liz_results.head())

In [None]:
# Hand-entered (experiment, library, analysis alias) triples.
# NOTE(review): the ENCSR980OCK row carries an ENCSR accession in the
# library_id column, and ENCLB398IAZ is listed for two different
# experiments -- confirm both against the portal.
analyses = pandas.DataFrame(
    [
        ("ENCSR398YBK", "ENCLB398IAZ", "barbara-wold:ENCSR398YBK_analysis"),
        ("ENCSR231FNL", "ENCLB398IAZ", "barbara-wold:ENCSR231FNL_analysis"),
        ("ENCSR176WWW", "ENCLB872TNB", "barbara-wold:ENCSR176WWW_analysis"),
        ("ENCSR980OCK", "ENCSR980OCK", "barbara-wold:ENCSR980OCK_analysis"),
        ("ENCSR814LMX", "ENCLB366ZFV", "barbara-wold:ENCSR814LMX_analysis"),
        ("ENCSR067BOK", "ENCLB849AUZ", "barbara-wold:ENCSR067BOK_analysis"),
    ],
    columns=["experiment_id", "library_id", "alias"],
)

analyses

In [None]:
latest_results

In [None]:
library_configs

In [None]:
# Report query over Analysis objects. The ``documents!=*`` term presumably
# selects analyses that have no documents attached -- confirm against the
# portal's search syntax.
missing_docs_query = "https://www.encodeproject.org/report/?type=Analysis&files.assay_term_name=single-cell+RNA+sequencing+assay&documents!=*&pipelines=%2Fpipelines%2FENCPL257SYI%2F&field=%40id&field=status&field=pipeline_award_rfas&field=assembly&field=genome_annotation&field=datasets"

# One output record per (analysis, replicate library) pair.
experiments_missing_docs = []
for row in server.get_json(missing_docs_query)["@graph"]:
    # Only the first dataset listed on the analysis is followed.
    experiment_id = row["datasets"][0]
    # One extra request per analysis to get the experiment's replicates.
    experiment = server.get_json(experiment_id)
    for replicate in experiment["replicates"]:
        library = replicate["library"]
        experiments_missing_docs.append({
            "experiment": experiment["accession"],
            "library": library["accession"],
            "analyses": row["@id"],
            "date_created": experiment["date_created"],
            "summary": experiment["simple_biosample_summary"],
            
        })

experiments_missing_docs = pandas.DataFrame(experiments_missing_docs)
experiments_missing_docs 

In [None]:
# Show the local configs for libraries that appear in the missing-documents
# report.
libraries_missing_docs = set(experiments_missing_docs["library"])
library_configs[library_configs["library_id"].isin(libraries_missing_docs)]

In [None]:
ls production

# Sept 22 heart

In [None]:
# Keep only the configs whose second path component (the lab directory in
# production/<lab>/<library>/config.yaml) is stanford_heart_20220822.
stanford_sept22_configs = library_configs[
    library_configs["local_filename"].apply(
        lambda config_path: Path(config_path).parts[1] == "stanford_heart_20220822"
    )
]
stanford_sept22_configs


In [None]:
# Disabled cell (``if 0``): flip the condition to re-run the dry-run
# submission for the stanford_heart_20220822 configs.
if 0:
    stanford_sept22_results = []
    for i, row in stanford_sept22_configs.iterrows():
        # The filtered frame doubles as the filename -> library_id lookup table.
        stanford_sept22_results.append(submit_document(server, row, stanford_sept22_configs, dry_run=True))

    stanford_sept22_results = pandas.DataFrame(stanford_sept22_results)
    target_name = Path("document_analysis_{}_stanford_20220822.tsv".format(server.server))
    # Only write the report when it does not exist yet, so an earlier report
    # is never overwritten.
    if not target_name.exists():
        stanford_sept22_results[["library_id", "document_id", "analysis_id", "filename"]].to_csv(
            target_name,
            sep="\t",
            index=False
        )
    print(stanford_sept22_results.head())
    # Number of libraries processed.
    print(stanford_sept22_results.shape[0])