# Introduction

We discussed what to do witht the pipeline configuration files, and it was suggested I submit them as attachments.

They attachments may also need to be added to Analysis objects that Jennifer added.

The DCC doesn't support yaml, so I'll probably need to submit them as plain text.

In [1]:
from datetime import datetime
from pathlib import Path
import pandas
import os
import sys
from urllib.parse import quote_plus

In [2]:
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)

from encoded_client.encoded import ENCODED, DCCValidator, Document, HTTPError

In [3]:
library_configs = []
production = Path("production")
for lab in production.iterdir():
    for library in lab.iterdir():
        if library.is_dir():
            library_id = library.name
            library_configs.append({
                "uuid": None,
                "local_filename": library / "config.yaml",
                "remote_filename": str(library / "config.yaml.txt"),
                "mime_type": "text/plain",
                "document type": "workflow metadata",
                "md5sum": None,
                "library_id": library_id
            })
library_configs = pandas.DataFrame(library_configs)
print("Total", library_configs.shape[0])
library_configs

Total 557


Unnamed: 0,uuid,local_filename,remote_filename,mime_type,document type,md5sum,library_id
0,,production/stanford/ENCLB527WWJ/config.yaml,production/stanford/ENCLB527WWJ/config.yaml.txt,text/plain,workflow metadata,,ENCLB527WWJ
1,,production/stanford/ENCLB619VVF/config.yaml,production/stanford/ENCLB619VVF/config.yaml.txt,text/plain,workflow metadata,,ENCLB619VVF
2,,production/stanford/ENCLB002DZK/config.yaml,production/stanford/ENCLB002DZK/config.yaml.txt,text/plain,workflow metadata,,ENCLB002DZK
3,,production/stanford/ENCLB814PUF/config.yaml,production/stanford/ENCLB814PUF/config.yaml.txt,text/plain,workflow metadata,,ENCLB814PUF
4,,production/stanford/ENCLB280ZGL/config.yaml,production/stanford/ENCLB280ZGL/config.yaml.txt,text/plain,workflow metadata,,ENCLB280ZGL
...,...,...,...,...,...,...,...
552,,production/uci/ENCLB595FHR/config.yaml,production/uci/ENCLB595FHR/config.yaml.txt,text/plain,workflow metadata,,ENCLB595FHR
553,,production/uci/ENCLB947PWM/config.yaml,production/uci/ENCLB947PWM/config.yaml.txt,text/plain,workflow metadata,,ENCLB947PWM
554,,production/uci/ENCLB106VMV/config.yaml,production/uci/ENCLB106VMV/config.yaml.txt,text/plain,workflow metadata,,ENCLB106VMV
555,,production/uci/ENCLB327QWK/config.yaml,production/uci/ENCLB327QWK/config.yaml.txt,text/plain,workflow metadata,,ENCLB327QWK


In [4]:
#server = ENCODED("test.encodedcc.org")
server = ENCODED("www.encodeproject.org")
validator = DCCValidator(server)

In [5]:
def get_experiment_by_library_id(server, library_id):
    result = server.search_jsonld(searchTerm=library_id)
    for experiment_row in result["@graph"]:
        if 'Experiment' in experiment_row["@type"]:
            return server.get_json(experiment_row["@id"])

def get_analysis_by_library_id(server, library_id):
    experiment = get_experiment_by_library_id(server, library_id)
    if experiment is None:
        return
    
    default_analysis = experiment.get("default_analysis")
    if default_analysis is None: 
        return
    
    for analysis in experiment["analyses"]:
        if analysis["@id"] == default_analysis:
            return analysis

get_analysis_by_library_id(server, "ENCLB280ZGL")["@id"]

'/analyses/ENCAN471PLI/'

In [6]:
def get_posted_documents(server):
    posted_documents = []
    query = "/search/?searchTerm=@Documents&lab.title=Barbara%20Wold,%20Caltech"
    graph = server.get_json(query)
    for row in graph["@graph"]:
        attachment = row["attachment"]
        posted_documents.append({
            "date_created": datetime.strptime(row["date_created"], "%Y-%m-%dT%H:%M:%S.%f%z"),
            "@id": row["@id"],
            "description": row["description"], 
            "document_type": row["document_type"], 
            "mime_type": attachment["type"], 
            "remote_filename": attachment["download"],
            "href": attachment["href"]
        })
    posted_documents = pandas.DataFrame(posted_documents)
    return posted_documents

posted_documents = get_posted_documents(server)
posted_documents.head()

Unnamed: 0,date_created,@id,description,document_type,mime_type,remote_filename,href
0,2013-09-20 21:18:53.862006-08:00,/documents/5f5193b0-0c61-40c6-a286-b1f6d6f0c0da/,A copy of the Smart-seq protocol,general protocol,application/pdf,SmartSeqProtocol.pdf,@@download/attachment/SmartSeqProtocol.pdf
1,2014-12-13 00:46:45.269582+00:00,/documents/5167c161-0e64-4aaf-8722-6a6dc8cd2af0/,Revised Jackson quote August 4 2011,strain generation protocol,application/pdf,Revised Jackson_quote_August42011.pdf,@@download/attachment/Revised%20Jackson_quote_...
2,2014-12-13 00:46:45.731485+00:00,/documents/90c2b544-e312-4987-9a38-9fbd8c0790ab/,Status Report Sept 27 2011,strain generation protocol,application/pdf,Status report Sept 27 2011.pdf,@@download/attachment/Status%20report%20Sept%2...
3,2013-09-13 13:28:33.582873-08:00,/documents/c7dba054-d2e2-46d2-8834-304e37aa4846/,Cell Growth Protocol and Differentiation treat...,growth protocol,application/pdf,10ToneHalf_Wold_protocol.pdf,@@download/attachment/10ToneHalf_Wold_protocol...
4,2013-09-13 13:28:58.346451-08:00,/documents/b02dbc05-d0c8-4754-98f0-5b994d568521/,"LHCN-M2 cell culture, differentiation treatmen...",general protocol,application/pdf,LHCNM2_Wold_protocol.pdf,@@download/attachment/LHCNM2_Wold_protocol.pdf


In [7]:
#posted_documents.set_index("remote_filename").loc['production/stanford/ENCLB527WWJ/config.yaml.txt']

In [8]:
def get_document_from_posted(posted, filename):
    candidates = posted[posted["remote_filename"] == filename]
    if candidates.shape[0] == 0:
        # no matches
        return
    elif isinstance(candidates, pandas.DataFrame):
        return candidates.loc[candidates.first_valid_index()]
    elif isinstance(candidates, pandas.Series):
        return candidates
    else:
        raise RuntimeError("Unexpected internal type {}".format(type(candidates)))
    
print(get_document_from_posted(posted_documents, 'production/stanford/ENCLB527WWJ/config.yaml.txt'))
print(get_document_from_posted(posted_documents, "hope_not_real.txt"))

None
None


In [9]:
def submit_document(server, row, library_configs):
    result = {
        "create_document_log": None,
        "library_id": None,
        "document_id": None,
        "analysis_id": None,
        "filename": None
    }

    library_configs = library_configs.set_index("remote_filename")
    workflow_doc = Document(
        row.local_filename, 
        "workflow metadata", 
        "Configuration file for scRNA-seq pipeline",
        filename=row.remote_filename
    )
    result["filename"] = workflow_doc.filename
    
    workflow_remote_filename = workflow_doc.filename
    library_id = library_configs.loc[workflow_remote_filename]["library_id"]
    
    # try to block double escaping
    assert "%" not in workflow_remote_filename
    posted = get_posted_documents(server)
    posted_document = get_document_from_posted(posted, quote_plus(workflow_remote_filename))
    
    if posted_document is None:
        result["create_document_log"] = workflow_doc.create_if_needed(server, workflow_doc.uuid, validator)
        if result["create_document_log"]["status"] == "success":
            workflow_id = result["create_document_log"]["@graph"][0]["@id"]
        else:
            workflow_id = None
    else:
        workflow_id = posted_document["@id"]
        
    # 2) attach document to analysis object    
    assert workflow_id is not None
    assert library_id is not None
    result["library_id"] = library_id
    result["document_id"] = workflow_id
    
    analysis = get_analysis_by_library_id(server, library_id)
    if workflow_id not in analysis["documents"]:
        #print("adding {} to {}".format(workflow_id, analysis["@id"]))
        #documents = analysis["documents"]
        #documents.append(workflow_id)
        #result = server.patch_json(analysis["@id"], {"documents": documents})
        #responses.append(result)
        result["analysis_id"] = analysis["@id"]
        
    return result

result = submit_document(server, library_configs.loc[556], library_configs)
result

{'create_document_log': {'status': 'success',
  '@type': ['result'],
  '@graph': [{'references': [],
    'date_created': '2022-06-13T22:51:25.495498+00:00',
    'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/',
    'status': 'in progress',
    'lab': '/labs/barbara-wold/',
    'award': '/awards/U54HG006998/',
    'aliases': [],
    'attachment': {'download': 'production%2Fuci%2FENCLB217YGZ%2Fconfig.yaml.txt',
     'href': '@@download/attachment/production%252Fuci%252FENCLB217YGZ%252Fconfig.yaml.txt',
     'type': 'text/plain',
     'md5sum': 'd691a9acac806e637fcfcdfc5e782416'},
    'schema_version': '8',
    'document_type': 'workflow metadata',
    'description': 'Configuration file for scRNA-seq pipeline',
    'urls': [],
    '@id': '/documents/5441e3af-f996-4a51-b2a2-982e3427a6bc/',
    '@type': ['Document', 'Item'],
    'uuid': '5441e3af-f996-4a51-b2a2-982e3427a6bc'}]},
 'library_id': 'ENCLB217YGZ',
 'document_id': '/documents/5441e3af-f996-4a51-b2a2-982e3427a6bc/',
 

In [10]:
results = []
for i, row in library_configs.iterrows():
    results.append(submit_document(server, row, library_configs))
#    if i > 5:
#        break

In [11]:
posted_results = pandas.DataFrame(results)
posted_results[["library_id", "document_id", "analysis_id", "filename"]].to_csv(
    "document_analysis_{}.tsv".format(server.server),
    sep="\t",
    index=False
)

In [12]:
posted_results

Unnamed: 0,create_document_log,library_id,document_id,analysis_id,filename
0,"{'status': 'success', '@type': ['result'], '@g...",ENCLB527WWJ,/documents/03a6306e-0238-4ac3-bca3-a9d7ba379edb/,/analyses/ENCAN717NNE/,production/stanford/ENCLB527WWJ/config.yaml.txt
1,"{'status': 'success', '@type': ['result'], '@g...",ENCLB619VVF,/documents/59b6f4ef-5e42-439d-b38a-ce91c9469a12/,/analyses/ENCAN466NKC/,production/stanford/ENCLB619VVF/config.yaml.txt
2,"{'status': 'success', '@type': ['result'], '@g...",ENCLB002DZK,/documents/253252a7-2398-40ef-98f1-d2c27f4108fd/,/analyses/ENCAN320AVL/,production/stanford/ENCLB002DZK/config.yaml.txt
3,"{'status': 'success', '@type': ['result'], '@g...",ENCLB814PUF,/documents/02332226-b462-46e7-adb8-32a1d822bceb/,/analyses/ENCAN957BHU/,production/stanford/ENCLB814PUF/config.yaml.txt
4,"{'status': 'success', '@type': ['result'], '@g...",ENCLB280ZGL,/documents/26dad66b-65d9-4c3b-bbd2-037996d24d04/,/analyses/ENCAN471PLI/,production/stanford/ENCLB280ZGL/config.yaml.txt
...,...,...,...,...,...
552,"{'status': 'success', '@type': ['result'], '@g...",ENCLB595FHR,/documents/3c8f24a3-a38d-4b69-90c7-d5a0b27958da/,/analyses/ENCAN130URD/,production/uci/ENCLB595FHR/config.yaml.txt
553,"{'status': 'success', '@type': ['result'], '@g...",ENCLB947PWM,/documents/de66bc0a-3503-4ecd-baa4-c7f2cb024361/,/analyses/ENCAN917KYJ/,production/uci/ENCLB947PWM/config.yaml.txt
554,"{'status': 'success', '@type': ['result'], '@g...",ENCLB106VMV,/documents/9bc085de-57b6-45a3-b501-edd16a48a055/,/analyses/ENCAN464RKT/,production/uci/ENCLB106VMV/config.yaml.txt
555,"{'status': 'success', '@type': ['result'], '@g...",ENCLB327QWK,/documents/4462887a-e6ec-4889-80c0-1239fcfd5c59/,/analyses/ENCAN762MJX/,production/uci/ENCLB327QWK/config.yaml.txt
