diff --git a/.gitignore b/.gitignore
index 36d483e5..b84875d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ dist/
Submit4DN.egg-info/*
coverage.xml
+
+fields_ordered.xls
+
+fields.xls
diff --git a/README.md b/README.md
index 0087be2b..5dd27ab8 100644
--- a/README.md
+++ b/README.md
@@ -86,13 +86,13 @@ get_field_info --type Biosample --comments --outfile biosample.xls
Complete list of sheets:
~~~~
-get_field_info --type Publication --type Document --type Vendor --type Protocol --type BiosampleCellCulture --type Biosource --type Enzyme --type Construct --type TreatmentChemical --type TreatmentRnai --type Modification --type Biosample --type FileFastq --type FileSet --type IndividualHuman --type IndividualMouse --type ExperimentHiC --type ExperimentCaptureC --type Target --type GenomicRegion --type ExperimentSet --type Image --comments --outfile AllItems.xls
+get_field_info --type Publication --type Document --type Vendor --type Protocol --type BiosampleCellCulture --type Biosource --type Enzyme --type Construct --type TreatmentChemical --type TreatmentRnai --type Modification --type Biosample --type FileFastq --type FileSet --type IndividualHuman --type IndividualMouse --type ExperimentHiC --type ExperimentCaptureC --type Target --type GenomicRegion --type ExperimentSet --type ExperimentSetReplicate --type Image --comments --outfile AllItems.xls
~~~~
Complete list of sheets: (using python scripts)
~~~~
-python3 -m wranglertools.get_field_info --type Publication --type Document --type Vendor --type Protocol --type BiosampleCellCulture --type Biosource --type Enzyme --type Construct --type TreatmentChemical --type TreatmentRnai --type Modification --type Biosample --type FileFastq --type FileSet --type IndividualHuman --type IndividualMouse --type ExperimentHiC --type ExperimentCaptureC --type Target --type GenomicRegion --type ExperimentSet --type Image --comments --outfile AllItems.xls
+python3 -m wranglertools.get_field_info --type Publication --type Document --type Vendor --type Protocol --type BiosampleCellCulture --type Biosource --type Enzyme --type Construct --type TreatmentChemical --type TreatmentRnai --type Modification --type Biosample --type FileFastq --type FileSet --type IndividualHuman --type IndividualMouse --type ExperimentHiC --type ExperimentCaptureC --type Target --type GenomicRegion --type ExperimentSet --type ExperimentSetReplicate --type Image --comments --outfile AllItems.xls
~~~~
diff --git a/README.rst b/README.rst
deleted file mode 100644
index 55b8e95f..00000000
--- a/README.rst
+++ /dev/null
@@ -1,137 +0,0 @@
-Submit 4DN - Data Submiter Tools
-================================
-
-|Build Status| |Coverage Status| |Code Quality| |PyPI version|
-
-The Submit4DN package is written by the `4DN Data Coordination and
-Integration Center `__ for data submitters
-from the 4DN Network. Please `contact
-us `__ to get access to
-the system, or if you have any questions or suggestions. Detailed
-documentation on data submission can be found `at this
-link `__
-
-Installing the package
-----------------------
-
-The Submit4DN package is registered with Pypi so installation is as
-simple as:
-
-::
-
- pip3 install submit4dn
-
-Connection
-----------
-
-To be able to use the provided tools, you need to have a secure key to
-access the REST application. If you do not have a secure key, please
-contact `4DN Data
-Wranglers `__ to get an
-account and to learn how to generate a key. Place your key in a json
-file in the following format.
-
-::
-
- {
- "default": {
- "key": "TheConnectionKey",
- "secret": "very_secret_key",
- "server":"www.The4dnWebsite.com"
- }
- }
-
-The default location for the keyfile is your home directory
-``~/keypairs.json``. If you prefer to use a different file location or a
-different key name (not "default"), you can specify your key with the
-``keyfile`` and ``key`` parameters:
-
-::
-
- import_data --keyfile path/to/filename.json --key NotDefault
-
-Generating data submission forms
---------------------------------
-
-To create the data submission xls forms, you can use ``get_field_info``.
-It will accept the following parameters:
-
-::
-
- --type use for each sheet that you want to add to the excel workbook
- --descriptions adds the descriptions in the second line (by default True)
- --enums adds the enum options in the third line (by default True)
- --comments adds the comments together with enums (by default False)
- --writexls creates the xls file (by default True)
- --outfile change the default file name "fields.xls" to a specified one
- --order create an ordered and filtered version of the excel (by default True)
-
-Examples generating a single sheet:
-
-::
-
- get_field_info --type Biosample
- get_field_info --type Biosample --comments
- get_field_info --type Biosample --comments --outfile biosample.xls
-
-Complete list of sheets: ~\ :sub:`~` get\_field\_info --type Publication
---type Document --type Vendor --type Protocol --type
-BiosampleCellCulture --type Biosource --type Enzyme --type Construct
---type TreatmentChemical --type TreatmentRnai --type Modification --type
-Biosample --type FileFastq --type FileSet --type IndividualHuman --type
-IndividualMouse --type ExperimentHiC --type ExperimentCaptureC --type
-Target --type GenomicRegion --type ExperimentSet --type Image --comments
---outfile AllItems.xls ~\ :sub:`~`
-
-Data submission
----------------
-
-After you fill out the data submission forms, you can use
-``import_data`` to submit the metadata. The method can be used both to
-create new metadata items and to patch fields of existing items.
-
-::
-
- import_data filename.xls
-
-**Uploading vs Patching**
-
-If there are uuid, alias, @id, or accession fields in the xls form that
-match existing entries in the database, you will be asked if you want to
-PATCH each object. You can use the ``--patchall`` flag, if you want to
-patch ALL objects in your document and ignore that message.
-
-If no object identifiers are found in the document, you need to use
-``--update`` for POSTing to occur.
-
-Development
-===========
-
-Note if you are attempting to run the scripts in the wranglertools
-directory without installing the package then in order to get the
-correct sys.path you need to run the scripts from the parent directory
-using the following command format:
-
-::
-
- python3 -m wranglertools.get_field_info —-type Biosource
- python3 -m wranglertools.import_data filename.xls
-
-pypi page is - https://pypi.python.org/pypi/Submit4DN
-
-The proper way to create a new release is ``invoke deploy`` which will
-prompt you to update the release number, then tag the code with that
-version number and push it to github, which will trigger travis to build
-and test and if tests pass it will deploy to production version of pypi.
-Note that travis will automatically deploy the new version if you push a
-tag to git.
-
-.. |Build Status| image:: https://travis-ci.org/hms-dbmi/Submit4DN.svg?branch=master
- :target: https://travis-ci.org/hms-dbmi/Submit4DN
-.. |Coverage Status| image:: https://coveralls.io/repos/github/hms-dbmi/Submit4DN/badge.svg?branch=master
- :target: https://coveralls.io/github/hms-dbmi/Submit4DN?branch=master
-.. |Code Quality| image:: https://api.codacy.com/project/badge/Grade/a4d521b4dd9c49058304606714528538
- :target: https://www.codacy.com/app/jeremy_7/Submit4DN
-.. |PyPI version| image:: https://badge.fury.io/py/Submit4DN.svg
- :target: https://badge.fury.io/py/Submit4DN
-
diff --git a/setup.cfg b/setup.cfg
index 49589a63..5aaf1e2d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,20 +2,20 @@
max-line-length = 120
[coverage:run]
branch = True
-omit =
+omit =
*/__pycache__/*
*/Data_Files/*
*/System_Files/*
*/tests/*
-include =
- */wranglertools/*
+include =
+ */wranglertools/*
[aliases]
test=pytest
[tool:pytest]
-addopts =
+addopts =
--cov
--cov-report term
--cov-report xml
--cov-report html
- --cov-fail-under 89
+ --cov-fail-under 85
diff --git a/tests/data_files/Exp_HiC_insert.xls b/tests/data_files/Exp_HiC_insert.xls
index a21d6032..560a245a 100644
Binary files a/tests/data_files/Exp_HiC_insert.xls and b/tests/data_files/Exp_HiC_insert.xls differ
diff --git a/tests/data_files/Exp_Set_Replicate_insert.xls b/tests/data_files/Exp_Set_Replicate_insert.xls
new file mode 100644
index 00000000..4035a5eb
Binary files /dev/null and b/tests/data_files/Exp_Set_Replicate_insert.xls differ
diff --git a/tests/data_files/Exp_Set_insert.xls b/tests/data_files/Exp_Set_insert.xls
new file mode 100644
index 00000000..7e151d4e
Binary files /dev/null and b/tests/data_files/Exp_Set_insert.xls differ
diff --git a/tests/test_fdnDCIC.py b/tests/test_fdnDCIC.py
index 630f0b39..ff7967dc 100644
--- a/tests/test_fdnDCIC.py
+++ b/tests/test_fdnDCIC.py
@@ -191,15 +191,14 @@ def test_switch_fields():
[['cell_line_tier', 'cell_line', 'SOP_cell_line'], 'Biosource'],
[['start_coordinate', 'start_location', 'location_description',
'end_location', 'end_coordinate'], "GenomicRegion"],
- [['experiment_relation.relationship_type', 'experiment_sets|3', 'files', 'average_fragment_size',
- 'experiment_sets|1', 'fragment_size_range', 'documents', 'experiment_relation.experiment',
- 'experiment_sets|2', 'filesets', 'experiment_sets|0'], "Experiment"]
+ [['experiment_relation.relationship_type', 'files', 'average_fragment_size',
+ 'fragment_size_range', 'documents', 'experiment_relation.experiment',
+ 'filesets'], "Experiment"]
]
result_list = [['cell_line', 'cell_line_tier', 'SOP_cell_line'],
['location_description', 'start_location', 'end_location', 'start_coordinate', 'end_coordinate'],
['average_fragment_size', 'fragment_size_range', 'files', 'filesets',
- 'experiment_relation.relationship_type', 'experiment_relation.experiment', 'experiment_sets|0',
- 'experiment_sets|1', 'experiment_sets|2', 'experiment_sets|3', 'documents']]
+ 'experiment_relation.relationship_type', 'experiment_relation.experiment', 'documents']]
for n, (a, b) in enumerate(cases):
assert result_list[n] == fdnDCIC.switch_fields(a, b)
diff --git a/tests/test_import_data.py b/tests/test_import_data.py
index cdb5a826..a5098cd9 100644
--- a/tests/test_import_data.py
+++ b/tests/test_import_data.py
@@ -158,13 +158,76 @@ def test_get_existing_uuid(connection, mocker, returned_vendor_existing_item):
assert response == returned_vendor_existing_item.json()
+def test_combine_set_replicates():
+ post_json = {"aliases": "sample_repset", "description": "sample description"}
+ existing_data = {}
+ dict_replicates = {'sample_repset': [{'replicate_exp': 'awesome_uuid1', 'bio_rep_no': 1.0, 'tec_rep_no': 1.0},
+ {'replicate_exp': 'awesome_uuid3', 'bio_rep_no': 1.0, 'tec_rep_no': 2.0}]}
+ post_json2, dict_replicates2 = imp.combine_set(post_json, existing_data, "ExperimentSetReplicate", dict_replicates)
+
+ response = {'replicate_exps': [{'replicate_exp': 'awesome_uuid1', 'tec_rep_no': 1.0, 'bio_rep_no': 1.0},
+ {'replicate_exp': 'awesome_uuid3', 'tec_rep_no': 2.0, 'bio_rep_no': 1.0}],
+ 'description': 'sample description',
+ 'aliases': 'sample_repset'}
+ assert post_json2 == response
+ assert dict_replicates2 == {}
+
+
+def test_combine_set_expsets():
+ post_json = {"aliases": "sample_expset", "description": "sample description"}
+ existing_data = {}
+ dict_expsets = {'sample_expset': ['awesome_uuid1', 'awesome_uuid4', 'awesome_uuid5']}
+ post_json2, dict_expsets2 = imp.combine_set(post_json, existing_data, "ExperimentSet", dict_expsets)
+
+ response = {'experiments_in_set': ['awesome_uuid4', 'awesome_uuid5', 'awesome_uuid1'],
+ 'description': 'sample description',
+ 'aliases': 'sample_expset'}
+    # the order of experiments_in_set is not guaranteed, so compare it separately;
+    # sorted() on the dicts themselves would only compare their keys
+    assert sorted(post_json2.pop('experiments_in_set')) == sorted(response.pop('experiments_in_set'))
+    assert post_json2 == response
+    assert dict_expsets2 == {}
+
+
+def test_combine_set_replicates_with_existing():
+ post_json = {"aliases": "sample_repset", "description": "sample description"}
+ existing_data = {"uuid": "sampleuuid", "accession": "sample_accession",
+ 'replicate_exps': [{'replicate_exp': 'awesome_uuid', 'bio_rep_no': 1.0, 'tec_rep_no': 6.0},
+ {'replicate_exp': 'awesome_uuid2', 'bio_rep_no': 2.0, 'tec_rep_no': 1.0}]}
+ dict_replicates = {'sample_repset': [{'replicate_exp': 'awesome_uuid1', 'bio_rep_no': 1.0, 'tec_rep_no': 1.0},
+ {'replicate_exp': 'awesome_uuid3', 'bio_rep_no': 1.0, 'tec_rep_no': 2.0}]}
+ post_json2, dict_replicates2 = imp.combine_set(post_json, existing_data, "ExperimentSetReplicate", dict_replicates)
+
+ response = {'replicate_exps': [{'replicate_exp': 'awesome_uuid1', 'bio_rep_no': 1.0, 'tec_rep_no': 1.0},
+ {'replicate_exp': 'awesome_uuid3', 'bio_rep_no': 1.0, 'tec_rep_no': 2.0},
+ {'replicate_exp': 'awesome_uuid', 'bio_rep_no': 1.0, 'tec_rep_no': 6.0},
+ {'replicate_exp': 'awesome_uuid2', 'bio_rep_no': 2.0, 'tec_rep_no': 1.0}],
+ 'description': 'sample description',
+ 'aliases': 'sample_repset'}
+ assert post_json2 == response
+ assert dict_replicates2 == {}
+
+
+def test_combine_set_expsets_with_existing():
+ post_json = {"aliases": "sample_expset", "description": "sample description"}
+ existing_data = {"uuid": "sampleuuid", "accession": "sample_accession",
+ "experiments_in_set": ['awesome_uuid1', 'awesome_uuid2']}
+ dict_expsets = {'sample_expset': ['awesome_uuid1', 'awesome_uuid4', 'awesome_uuid5']}
+ post_json2, dict_expsets2 = imp.combine_set(post_json, existing_data, "ExperimentSet", dict_expsets)
+
+ response = {'experiments_in_set': ['awesome_uuid4', 'awesome_uuid5', 'awesome_uuid2', 'awesome_uuid1'],
+ 'description': 'sample description',
+ 'aliases': 'sample_expset'}
+    # experiments_in_set is rebuilt via set() and has no stable order; compare it
+    # separately, since sorted() on the dicts would only compare their keys
+    assert sorted(post_json2.pop('experiments_in_set')) == sorted(response.pop('experiments_in_set'))
+    assert post_json2 == response
+    assert dict_expsets2 == {}
+
+
@pytest.mark.file_operation
def test_excel_reader_no_update_no_patchall_new_doc_with_attachment(capsys, mocker, connection):
# test new item submission without patchall update tags and check the return message
test_insert = './tests/data_files/Document_insert.xls'
dict_load = {}
+ dict_rep = {}
+ dict_set = {}
with mocker.patch('wranglertools.import_data.get_existing', return_value={}):
- imp.excel_reader(test_insert, 'Document', False, connection, False, dict_load)
+ imp.excel_reader(test_insert, 'Document', False, connection, False, dict_load, dict_rep, dict_set)
args = imp.get_existing.call_args
attach = args[0][0]['attachment']
assert attach['href'].startswith('data:image/jpeg;base64')
@@ -175,6 +238,8 @@ def test_excel_reader_no_update_no_patchall_new_item(capsys, mocker, connection)
# test new item submission without patchall update tags and check the return message
test_insert = './tests/data_files/Vendor_insert.xls'
dict_load = {}
+ dict_rep = {}
+ dict_set = {}
message = "This looks like a new row but the update flag wasn't passed, use --update to post new data"
post_json = {'lab': 'sample-lab',
'description': 'Sample description',
@@ -183,7 +248,7 @@ def test_excel_reader_no_update_no_patchall_new_item(capsys, mocker, connection)
'url': 'https://www.sample_vendor.com/',
'aliases': ['dcic:sample_vendor']}
with mocker.patch('wranglertools.import_data.get_existing', return_value={}):
- imp.excel_reader(test_insert, 'Vendor', False, connection, False, dict_load)
+ imp.excel_reader(test_insert, 'Vendor', False, connection, False, dict_load, dict_rep, dict_set)
args = imp.get_existing.call_args
assert args[0][0] == post_json
out, err = capsys.readouterr()
@@ -195,6 +260,8 @@ def test_excel_reader_no_update_no_patchall_existing_item(capsys, mocker, connec
# test exisiting item submission without patchall update tags and check the return message
test_insert = "./tests/data_files/Vendor_insert.xls"
dict_load = {}
+ dict_rep = {}
+ dict_set = {}
message = "VENDOR: 0 out of 1 posted, 0 errors, 0 patched, 1 not patched (use --patchall to patch)."
post_json = {'lab': 'sample-lab',
'description': 'Sample description',
@@ -204,7 +271,7 @@ def test_excel_reader_no_update_no_patchall_existing_item(capsys, mocker, connec
'aliases': ['dcic:sample_vendor']}
existing_vendor = {'uuid': 'sample_uuid'}
with mocker.patch('wranglertools.import_data.get_existing', return_value=existing_vendor):
- imp.excel_reader(test_insert, 'Vendor', False, connection, False, dict_load)
+ imp.excel_reader(test_insert, 'Vendor', False, connection, False, dict_load, dict_rep, dict_set)
args = imp.get_existing.call_args
assert args[0][0] == post_json
out, err = capsys.readouterr()
@@ -216,20 +283,22 @@ def test_excel_reader_no_update_no_patchall_new_experiment_expset_combined(mocke
# check if the separated exp set fields in experiments get combined.
test_insert = './tests/data_files/Exp_HiC_insert.xls'
dict_load = {}
- post_json = {'experiment_sets': ['a', 'b', 'c', 'd'], 'aliases': ['dcic:test'], 'award': 'test-award',
- 'experiment_type': 'in situ Hi-C', 'lab': 'test-lab', 'filename': 'example.fastq.gz',
- 'biosample': 'test-biosample'}
+ dict_rep = {}
+ dict_set = {}
+ post_json = {'filename': 'example.fastq.gz', 'experiment_type': 'in situ Hi-C', 'aliases': ['dcic:test'],
+ 'award': 'test-award', 'lab': 'test-lab', 'biosample': 'test-biosample'}
with mocker.patch('wranglertools.import_data.get_existing', return_value={}):
- imp.excel_reader(test_insert, 'ExperimentHiC', False, connection, False, dict_load)
+ imp.excel_reader(test_insert, 'ExperimentHiC', False, connection, False, dict_load, dict_rep, dict_set)
args = imp.get_existing.call_args
assert args[0][0] == post_json
@pytest.mark.file_operation
def test_excel_reader_update_new_experiment_post_and_file_upload(capsys, mocker, connection):
- # check if the separated exp set fields in experiments get combined
test_insert = './tests/data_files/Exp_HiC_insert.xls'
dict_load = {}
+ dict_rep = {}
+ dict_set = {}
message0 = "calculating md5 sum for file ./tests/data_files/example.fastq.gz"
message1 = "EXPERIMENTHIC: 1 out of 1 posted, 0 errors, 0 patched."
e = {'status': 'success', '@graph': [{'uuid': 'some_uuid'}]}
@@ -239,7 +308,7 @@ def test_excel_reader_update_new_experiment_post_and_file_upload(capsys, mocker,
with mocker.patch('wranglertools.import_data.upload_file', return_value={}):
# mock posting new items
with mocker.patch('wranglertools.fdnDCIC.new_FDN', return_value=e):
- imp.excel_reader(test_insert, 'ExperimentHiC', True, connection, False, dict_load)
+ imp.excel_reader(test_insert, 'ExperimentHiC', True, connection, False, dict_load, dict_rep, dict_set)
args = imp.fdnDCIC.new_FDN.call_args
out, err = capsys.readouterr()
outlist = [i.strip() for i in out.split('\n') if i is not ""]
@@ -251,9 +320,10 @@ def test_excel_reader_update_new_experiment_post_and_file_upload(capsys, mocker,
@pytest.mark.file_operation
def test_excel_reader_patch_experiment_post_and_file_upload(capsys, mocker, connection):
- # check if the separated exp set fields in experiments get combined
test_insert = './tests/data_files/Exp_HiC_insert.xls'
dict_load = {}
+ dict_rep = {}
+ dict_set = {}
message0 = "calculating md5 sum for file ./tests/data_files/example.fastq.gz"
message1 = "EXPERIMENTHIC: 1 out of 1 posted, 0 errors, 1 patched."
existing_exp = {'uuid': 'sample_uuid'}
@@ -269,7 +339,8 @@ def test_excel_reader_patch_experiment_post_and_file_upload(capsys, mocker, conn
with mocker.patch('wranglertools.fdnDCIC.patch_FDN', return_value=e):
# mock get upload creds
with mocker.patch('wranglertools.import_data.get_upload_creds', return_value="new_creds"):
- imp.excel_reader(test_insert, 'ExperimentHiC', False, connection, True, dict_load)
+ imp.excel_reader(test_insert, 'ExperimentHiC', False, connection, True,
+ dict_load, dict_rep, dict_set)
# check for md5sum
args = imp.fdnDCIC.patch_FDN.call_args
post_json_arg = args[0][2]
@@ -285,6 +356,50 @@ def test_excel_reader_patch_experiment_post_and_file_upload(capsys, mocker, conn
assert message1 == outlist[1]
+@pytest.mark.file_operation
+def test_excel_reader_update_new_replicate_set_post(capsys, mocker, connection):
+ test_insert = './tests/data_files/Exp_Set_Replicate_insert.xls'
+ dict_load = {}
+ dict_rep = {'sample_repset': [{'replicate_exp': 'awesome_uuid', 'bio_rep_no': 1.0, 'tec_rep_no': 1.0}]}
+ dict_set = {}
+ message = "EXPERIMENTSETREPLICATE: 1 out of 1 posted, 0 errors, 0 patched."
+ e = {'status': 'success', '@graph': [{'uuid': 'sample_repset'}]}
+ final_post = {'aliases': ['sample_repset'],
+ 'replicate_exps': [{'bio_rep_no': 1.0, 'tec_rep_no': 1.0, 'replicate_exp': 'awesome_uuid'}]}
+    # mock fetching existing info, return empty dict
+ with mocker.patch('wranglertools.import_data.get_existing', return_value={}):
+        # mock posting the new item
+ with mocker.patch('wranglertools.fdnDCIC.new_FDN', return_value=e):
+ imp.excel_reader(test_insert, 'ExperimentSetReplicate', True, connection, False,
+ dict_load, dict_rep, dict_set)
+ args = imp.fdnDCIC.new_FDN.call_args
+ out, err = capsys.readouterr()
+ assert message == out.strip()
+ assert args[0][2] == final_post
+
+
+@pytest.mark.file_operation
+def test_excel_reader_update_new_experiment_set_post(capsys, mocker, connection):
+ test_insert = './tests/data_files/Exp_Set_insert.xls'
+ dict_load = {}
+ dict_rep = {}
+ dict_set = {'sample_expset': ['awesome_uuid']}
+ message = "EXPERIMENTSET: 1 out of 1 posted, 0 errors, 0 patched."
+ e = {'status': 'success', '@graph': [{'uuid': 'sample_expset'}]}
+ final_post = {'aliases': ['sample_expset'], 'experiments_in_set': ['awesome_uuid']}
+    # mock fetching existing info, return empty dict
+ with mocker.patch('wranglertools.import_data.get_existing', return_value={}):
+        # mock posting the new item
+ with mocker.patch('wranglertools.fdnDCIC.new_FDN', return_value=e):
+ imp.excel_reader(test_insert, 'ExperimentSet', True, connection, False,
+ dict_load, dict_rep, dict_set)
+ args = imp.fdnDCIC.new_FDN.call_args
+ out, err = capsys.readouterr()
+ assert message == out.strip()
+ assert args[0][2] == final_post
+
+
def test_order_sorter(capsys):
test_list = ["ExperimentHiC", "BiosampleCellCulture", "Biosource", "Document", "Modification",
"IndividualMouse", "Biosample", "Lab", "User", "Trouble"]
@@ -297,3 +412,14 @@ def test_order_sorter(capsys):
outlist = [i.strip() for i in out.split('\n') if i is not ""]
assert message0 == outlist[0]
assert message1 == outlist[1]
+
+
+@pytest.mark.file_operation
+def test_loadxl_cycle(capsys, mocker, connection):
+ patch_list = {'Experiment': [{"uuid": "some_uuid"}]}
+ e = {'status': 'success', '@graph': [{'uuid': 'some_uuid'}]}
+ message = "EXPERIMENT(phase2): 1 items patched."
+ with mocker.patch('wranglertools.fdnDCIC.patch_FDN', return_value=e):
+ imp.loadxl_cycle(patch_list, connection)
+ out, err = capsys.readouterr()
+ assert message == out.strip()
diff --git a/wranglertools/fdnDCIC.py b/wranglertools/fdnDCIC.py
index 6f509288..f3022a3d 100644
--- a/wranglertools/fdnDCIC.py
+++ b/wranglertools/fdnDCIC.py
@@ -72,9 +72,9 @@ def get_FDN(obj_id, connection, frame="object"):
if response.json():
logging.debug('GET RESPONSE JSON: %s' %
(json.dumps(response.json(), indent=4, separators=(',', ': '))))
- except:
+ except: # pragma: no cover
logging.debug('GET RESPONSE text %s' % (response.text))
- if not response.status_code == 200:
+ if not response.status_code == 200: # pragma: no cover
if response.json().get("notification"):
logging.warning('%s' % (response.json().get("notification")))
else:
@@ -90,14 +90,14 @@ def patch_FDN(obj_id, connection, patch_input):
json_payload = json.dumps(patch_input)
elif isinstance(patch_input, str):
json_payload = patch_input
- else:
+ else: # pragma: no cover
print('Datatype to PATCH is not string or dict.')
url = connection.server + obj_id
logging.debug('PATCH URL : %s' % (url))
logging.debug('PATCH data: %s' % (json_payload))
response = requests.patch(url, auth=connection.auth, data=json_payload, headers=connection.headers)
logging.debug('PATCH RESPONSE: %s' % (json.dumps(response.json(), indent=4, separators=(',', ': '))))
- if not response.status_code == 200:
+ if not response.status_code == 200: # pragma: no cover
logging.warning('PATCH failure. Response = %s' % (response.text))
return response.json()
@@ -109,7 +109,7 @@ def new_FDN(connection, collection_name, post_input):
json_payload = json.dumps(post_input)
elif isinstance(post_input, str):
json_payload = post_input
- else:
+ else: # pragma: no cover
print('Datatype to POST is not string or dict.')
url = connection.server + collection_name
logging.debug("POST URL : %s" % (url))
@@ -117,7 +117,7 @@ def new_FDN(connection, collection_name, post_input):
separators=(',', ': '))))
response = requests.post(url, auth=connection.auth, headers=connection.headers, data=json_payload)
logging.debug("POST RESPONSE: %s" % (json.dumps(response.json(), indent=4, separators=(',', ': '))))
- if not response.status_code == 201:
+ if not response.status_code == 201: # pragma: no cover
logging.warning('POST failure. Response = %s' % (response.text))
logging.debug("Return object: %s" % (json.dumps(response.json(), sort_keys=True, indent=4,
separators=(',', ': '))))
@@ -145,7 +145,8 @@ def md5(path):
"User", "Award", "Lab", "Document", "Protocol", "Publication", "Organism", "IndividualMouse", "IndividualHuman",
"Vendor", "Enzyme", "Biosource", "Construct", "TreatmentRnai", "TreatmentChemical",
"GenomicRegion", "Target", "Modification", "Image", "BiosampleCellCulture", "Biosample",
- "FileSet", "FileFastq", "FileFasta", "ExperimentSet", "ExperimentHiC", "ExperimentCaptureC"]
+ "FileSet", "FileFastq", "FileFasta", "ExperimentHiC", "ExperimentCaptureC",
+ "ExperimentSet", "ExperimentSetReplicate"]
do_not_use = [
"submitted_by", "date_created", "organism", "schema_version", "accession", "uuid", "status",
@@ -165,7 +166,8 @@ def filter_and_sort(list_names):
useful = sorted(useful)
return useful
-move_frond = ['award', '*award', 'lab', '*lab', 'description',
+move_frond = ['experiment_set', '*tec_rep_no', '*bio_rep_no', '*replicate_set',
+ 'award', '*award', 'lab', '*lab', 'description',
'title', '*title', 'name', '*name', 'aliases', '#Field Name:']
@@ -210,11 +212,7 @@ def move_to_end(list_names):
['Experiment', 'files', 'documents'],
['Experiment', 'filesets', 'documents'],
['Experiment', 'experiment_relation.relationship_type', 'documents'],
- ['Experiment', 'experiment_relation.experiment', 'documents'],
- ['Experiment', 'experiment_sets|0', 'documents'],
- ['Experiment', 'experiment_sets|1', 'documents'],
- ['Experiment', 'experiment_sets|2', 'documents'],
- ['Experiment', 'experiment_sets|3', 'documents'],
+ ['Experiment', 'experiment_relation.experiment', 'documents']
]
@@ -232,6 +230,7 @@ def switch_fields(list_names, sheet):
return list_names
# if object name is in the following list, fetch all current/released items and add to xls
+# if Experiment is ever added to this list, the experiment set related fields might cause problems
fetch_items = {
"Document": "document", "Protocol": "protocol", "Enzymes": "enzyme", "Biosource": "biosource",
"Publication": "publication", "Vendor": "vendor"}
@@ -249,9 +248,7 @@ def fetch_all_items(sheet, field_list, connection):
for field in field_list:
# required fields will have a star
field = field.strip('*')
- # in case we ever want to have experiment sets in experiment
- # this will put all exeperiment sets in the others category
- field = field.replace("|3", "")
+ # add # to skip existing items during submission
if field == "#Field Name:":
item_info.append("#")
# the attachment field returns a dictionary
@@ -268,7 +265,7 @@ def fetch_all_items(sheet, field_list, connection):
item_info.append(write_value)
all_items.append(item_info)
return all_items
- else:
+ else: # pragma: no cover
return
@@ -288,7 +285,7 @@ def order_FDN(input_xls, connection):
if sh in Sheets_read:
Sheets.append(sh)
Sheets_read.remove(sh)
- if Sheets_read:
+ if Sheets_read: # pragma: no cover
print(Sheets_read, "not in sheet_order list, please update")
Sheets.extend(Sheets_read)
for sheet in Sheets:
diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py
index e0c2d498..a7c37661 100755
--- a/wranglertools/get_field_info.py
+++ b/wranglertools/get_field_info.py
@@ -94,6 +94,13 @@ class FieldInfo(object):
comm = attr.ib(default=u'')
enum = attr.ib(default=u'')
+# additional fields for experiment sheets to capture experiment_set related information
+exp_set_addition = [FieldInfo('*replicate_set', 'Item:ExperimentSetReplicate', 'Grouping for replicate experiments'),
+ FieldInfo('*bio_rep_no', 'number', 'Biological replicate number'),
+ FieldInfo('*tec_rep_no', 'number', 'Technical replicate number'),
+ FieldInfo('experiment_set', 'array of Item:ExperimentSet', 'Grouping for non-replicate experiments')
+ ]
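+# these appear as extra columns on every experiment sheet; import_data strips them
+# back out of each row (filter_set_from_exps) and uses them to build the set items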
+
def get_field_type(field):
field_type = field.get('type', '')
@@ -176,6 +183,8 @@ def get_uploadable_fields(connection, types, include_description=False,
include_description,
include_comments,
include_enums)
+ if name.startswith('Experiment') and not name.startswith('ExperimentSet'):
+ fields[name].extend(exp_set_addition)
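+            # e.g. ExperimentHiC and ExperimentCaptureC pick up the extra set columns,
+            # while the ExperimentSet sheets themselves are excluded by the guard above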
return fields
@@ -203,6 +212,8 @@ def create_xls(all_fields, filename):
add_info += str(field.comm)
if field.enum:
add_info += "Choices:" + str(field.enum)
+ if not field.comm and not field.enum:
+ add_info = "-"
ws.write(3, col+1, add_info)
wb.save(filename)
diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py
index 02ad87b9..a4f2cee3 100755
--- a/wranglertools/import_data.py
+++ b/wranglertools/import_data.py
@@ -98,8 +98,7 @@ def getArgs(): # pragma: no cover
['FileSet', ['files_in_set']],
['ExperimentHiC', ['experiment_relation']],
['ExperimentCaptureC', ['experiment_relation']],
- ['ExperimentSet', ['experiments_in_set']],
- ['Publication', ['experiment_sets_in_pub']]
+ ['Publication', ['exp_sets_prod_in_pub', 'exp_sets_used_in_pub']]
]
@@ -258,6 +257,11 @@ def build_field(field, field_data, field_type):
def build_patch_json(fields, fields2types):
"""Create the data entry dictionary from the fields."""
+ # convert array types to array
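+    # (e.g. 'array of Item:ExperimentSet' and 'array of string' both collapse to 'array')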
+ for field, ftype in fields2types.items():
+ if 'array' in ftype:
+ fields2types[field] = 'array'
+
patch_data = {}
for field, field_data in fields.items():
field_type = None
@@ -301,7 +305,73 @@ def get_existing(post_json, connection):
return temp
-def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadxl):
+def filter_set_from_exps(post_json):
+ """Experiments set information is taken from experiments and submitted to experiment_set."""
+ rep_set_info = []
+ exp_set_info = []
+ # Part I - Replicate Sets
+ # store the values in a list and delete them from post_json
+ if post_json.get('replicate_set'):
+ for replicate_field in ['replicate_set', 'bio_rep_no', 'tec_rep_no']:
+ rep_set_info.append(post_json[replicate_field])
+ post_json.pop(replicate_field)
+ # Part II - Experiment Sets
+ if post_json.get('experiment_set'):
+ exp_set_info = post_json['experiment_set']
+ post_json.pop('experiment_set')
+ return post_json, rep_set_info, exp_set_info
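+# Illustrative example (hypothetical values): a row carrying
+#   replicate_set='dcic:repset1', bio_rep_no=1.0, tec_rep_no=1.0, experiment_set=['dcic:set1']
+# returns rep_set_info == ['dcic:repset1', 1.0, 1.0] and exp_set_info == ['dcic:set1'],
+# with all four keys removed from post_json.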
+
+
+def filter_loadxl_fields(post_json, sheet):
+ """All fields from the list_of_loadxl_fields are taken out of post_json and accumulated in dictionary."""
+ patch_loadxl_item = {}
+ for sheet_loadxl, fields_loadxl in list_of_loadxl_fields:
+ if sheet == sheet_loadxl:
+ for field_loadxl in fields_loadxl:
+ if post_json.get(field_loadxl):
+ patch_loadxl_item[field_loadxl] = post_json[field_loadxl]
+ del post_json[field_loadxl]
+ return post_json, patch_loadxl_item
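+# e.g. for the 'ExperimentHiC' sheet, list_of_loadxl_fields defers 'experiment_relation':
+# it is removed from post_json here and patched in a second pass by loadxl_cycle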
+
+
+def combine_set(post_json, existing_data, sheet, accumulate_dict):
+ """Combine experiment related information form dictionaries with existing information."""
+ # find all identifiers from exisiting set item to match the one used in experiments sheet
+ identifiers = []
+ for identifier in ['accession', 'uuid', 'aliases', '@id']:
+ ex_item_id = existing_data.get(identifier, '')
+ item_id = post_json.get(identifier, ex_item_id)
+ if isinstance(item_id, list):
+ item_id = item_id[0]
+ if item_id:
+ identifiers.append(item_id)
+ # search dictionary for the existing item id
+ for identifier in identifiers:
+ if accumulate_dict.get(identifier):
+ add_to_post = accumulate_dict.get(identifier)
+            # Combination for experiment sets
+ if sheet == "ExperimentSet":
+ if existing_data.get('experiments_in_set'):
+ existing_exps = existing_data.get('experiments_in_set')
+ post_json['experiments_in_set'] = list(set(add_to_post + existing_exps))
+ else:
+ post_json['experiments_in_set'] = add_to_post
+ # Combination for replicate sets
+ if sheet == "ExperimentSetReplicate":
+ if existing_data.get('replicate_exps'):
+ existing_sets = existing_data.get('replicate_exps')
+ new_exps = [i['replicate_exp'] for i in add_to_post]
+ existing_sets = [i for i in existing_sets if i['replicate_exp'] not in new_exps]
+ post_json['replicate_exps'] = add_to_post + existing_sets
+ else:
+ post_json['replicate_exps'] = add_to_post
+ # remove found item from the accumulate_dict
+ accumulate_dict.pop(identifier)
+ break
+ return post_json, accumulate_dict
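+# Sketch with hypothetical ids: given post_json['aliases'] == ['dcic:repset1'] and
+# accumulate_dict == {'dcic:repset1': [{'replicate_exp': 'u1', 'bio_rep_no': 1.0, 'tec_rep_no': 1.0}]},
+# those entries land in post_json['replicate_exps'] (superseding any existing entry with
+# the same replicate_exp) and 'dcic:repset1' is popped from accumulate_dict.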
+
+
+def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadxl, dict_replicates, dict_exp_sets):
"""takes an excel sheet and post or patched the data in."""
# dict for acumulating cycle patch data
patch_loadxl = []
@@ -309,26 +379,17 @@ def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadx
keys = next(row) # grab the first row of headers
types = next(row) # grab second row with type info
# remove title column
- fields2types = None
keys.pop(0)
- row2name = types.pop(0)
-
- if 'Type' in row2name:
- fields2types = dict(zip(keys, types))
- for field, ftype in fields2types.items():
- if 'array' in ftype:
- fields2types[field] = 'array'
-
- # print(fields2types)
- # sys.exit()
+ types.pop(0)
+ fields2types = dict(zip(keys, types))
+ # set counters to 0
total = 0
error = 0
success = 0
patch = 0
not_patched = 0
+ # iterate over the rows
for values in row:
- # dictionary to collect patch items
- patch_loadxl_item = {}
# Rows that start with # are skipped
if values[0].startswith("#"):
continue
@@ -337,30 +398,10 @@ def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadx
total += 1
post_json = dict(zip(keys, values))
post_json = build_patch_json(post_json, fields2types)
-
- # Experiments sets are seperated to 4 columns in get_field_info.py and this combines them back
- if "Experiment" in sheet:
- if sheet != "ExperimentSet":
- comb_sets = []
- for set_key in ["experiment_sets|0", "experiment_sets|1", "experiment_sets|2", "experiment_sets|3"]:
- try:
- comb_sets.extend(post_json.get(set_key))
- except: # pragma: no cover
- continue
- post_json.pop(set_key, None)
- post_json['experiment_sets'] = comb_sets
# add attchments here
if post_json.get("attachment"):
attach = attachment(post_json["attachment"])
post_json["attachment"] = attach
-
- # All fields from the list_of_loadxl_fields are taken out of post_json and accumulated in dictionary
- for sheet_loadxl, fields_loadxl in list_of_loadxl_fields:
- if sheet == sheet_loadxl:
- for field_loadxl in fields_loadxl:
- if post_json.get(field_loadxl):
- patch_loadxl_item[field_loadxl] = post_json[field_loadxl]
- del post_json[field_loadxl]
# should I upload files as well?
file_to_upload = False
filename_to_post = post_json.get('filename')
@@ -368,9 +409,22 @@ def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadx
# remove full path from filename
post_json['filename'] = filename_to_post.split('/')[-1]
file_to_upload = True
-
+ # Get existing data if available
existing_data = get_existing(post_json, connection)
-
+ # Filter loadxl fields
+ post_json, patch_loadxl_item = filter_loadxl_fields(post_json, sheet)
+ # Filter experiment set related fields
+ if sheet.startswith('Experiment') and not sheet.startswith('ExperimentSet'):
+ post_json, rep_set_info, exp_set_info = filter_set_from_exps(post_json)
+ # Combine experimentset items with stored dictionaries
+ if sheet == 'ExperimentSet':
+ post_json, dict_exp_sets = combine_set(post_json, existing_data, sheet, dict_exp_sets)
+ if sheet == 'ExperimentSetReplicate':
+ post_json, dict_replicates = combine_set(post_json, existing_data, sheet, dict_replicates)
+
+ # Run update or patch
+ e = {}
+ flow = ''
if existing_data.get("uuid"):
if not patchall:
not_patched += 1
@@ -387,15 +441,7 @@ def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadx
e['@graph'][0]['upload_credentials'] = creds
# upload
upload_file(e, filename_to_post)
- if e["status"] == "error": # pragma: no cover
- error += 1
- elif e["status"] == "success":
- success += 1
- patch += 1
- # if patch successful, append uuid to patch_loadxl_item if full
- if patch_loadxl_item != {}:
- patch_loadxl_item['uuid'] = e['@graph'][0]['uuid']
- patch_loadxl.append(patch_loadxl_item)
+ flow = 'patch'
else:
if update:
# add the md5
@@ -406,18 +452,41 @@ def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadx
if file_to_upload:
# upload the file
upload_file(e, filename_to_post)
- if e["status"] == "error": # pragma: no cover
- error += 1
- elif e["status"] == "success":
- success += 1
- # if post successful, append uuid to patch_loadxl_item if full
- if patch_loadxl_item != {}:
- patch_loadxl_item['uuid'] = e['@graph'][0]['uuid']
- patch_loadxl.append(patch_loadxl_item)
else:
print("This looks like a new row but the update flag wasn't passed, use --update to"
" post new data")
return
+
+        # check the response status and, on success, fill the transient storage dictionaries
+ if e.get("status") == "error": # pragma: no cover
+ error += 1
+ elif e.get("status") == "success":
+ success += 1
+ if flow == 'patch':
+ patch += 1
+ # uuid of the posted/patched item
+ item_uuid = e['@graph'][0]['uuid']
+            # if post/patch successful, record the uuid in patch_loadxl_item if it is not empty
+ if patch_loadxl_item != {}:
+ patch_loadxl_item['uuid'] = item_uuid
+ patch_loadxl.append(patch_loadxl_item)
+            # if post/patch successful, add the replicate/set information to the accumulator dictionaries
+ if sheet.startswith('Experiment') and not sheet.startswith('ExperimentSet'):
+                # Part-I Replicates
+                # skip rows without replicate info (filter_set_from_exps returns an
+                # empty list when the replicate_set column is not filled in)
+                if rep_set_info:
+                    rep_id = rep_set_info[0]
+                    saveitem = {'replicate_exp': item_uuid, 'bio_rep_no': rep_set_info[1],
+                                'tec_rep_no': rep_set_info[2]}
+                    if dict_replicates.get(rep_id):
+                        dict_replicates[rep_id].append(saveitem)
+                    else:
+                        dict_replicates[rep_id] = [saveitem, ]
+ # Part-II Experiment Sets
+ if exp_set_info:
+ for exp_set in exp_set_info:
+ if dict_exp_sets.get(exp_set):
+ dict_exp_sets[exp_set].append(item_uuid)
+ else:
+ dict_exp_sets[exp_set] = [item_uuid, ]
+
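+    # by the time the set sheets are reached (sheet_order places them after the experiment
+    # sheets), dict_exp_sets holds e.g. {'dcic:set1': ['uuid_a', 'uuid_b']} and
+    # dict_replicates the corresponding replicate entries; combine_set drains both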
# add all object loadxl patches to dictionary
dict_patch_loadxl[sheet] = patch_loadxl
# print final report, and if there are not patched entries, add to report
@@ -426,9 +495,13 @@ def excel_reader(datafile, sheet, update, connection, patchall, dict_patch_loadx
not_patched_note = ", " + str(not_patched) + " not patched (use --patchall to patch)."
print("{sheet}: {success} out of {total} posted, {error} errors, {patch} patched{not_patch}".format(
sheet=sheet.upper(), success=success, total=total, error=error, patch=patch, not_patch=not_patched_note))
+    # TODO: warn about aliases still left in dict_exp_sets / dict_replicates after the
+    # ExperimentSet and ExperimentSetReplicate sheets have been processed
-def get_upload_creds(file_id, connection, file_info):
+def get_upload_creds(file_id, connection, file_info): # pragma: no cover
url = "%s%s/upload/" % (connection.server, file_id)
req = requests.post(url,
auth=connection.auth,
@@ -437,24 +510,21 @@ def get_upload_creds(file_id, connection, file_info):
return req.json()['@graph'][0]['upload_credentials']
-def upload_file(metadata_post_response, path):
+def upload_file(metadata_post_response, path): # pragma: no cover
try:
item = metadata_post_response['@graph'][0]
creds = item['upload_credentials']
except Exception as e:
print(e)
return
-
####################
# POST file to S3
-
env = os.environ.copy() # pragma: no cover
env.update({
'AWS_ACCESS_KEY_ID': creds['access_key'],
'AWS_SECRET_ACCESS_KEY': creds['secret_key'],
'AWS_SECURITY_TOKEN': creds['session_token'],
- }) # pragma: no cover
-
+ })
# ~10s/GB from Stanford - AWS Oregon
# ~12-15s/GB from AWS Ireland - AWS Oregon
print("Uploading file.")
@@ -473,8 +543,6 @@ def upload_file(metadata_post_response, path):
# the order to try to upload / update the items
# used to avoid dependencies... i.e. biosample needs the biosource to exist
-
-
def order_sorter(list_of_names):
ret_list = []
for i in sheet_order:
@@ -515,12 +583,15 @@ def main(): # pragma: no cover
supported_collections = list(profiles.keys())
supported_collections = [s.lower() for s in list(profiles.keys())]
# we want to read through names in proper upload order
- dict_loadxl = {}
sorted_names = order_sorter(names)
+ # dictionaries that accumulate information during submission
dict_loadxl = {}
+ dict_replicates = {}
+ dict_exp_sets = {}
for n in sorted_names:
if n.lower() in supported_collections:
- excel_reader(args.infile, n, args.update, connection, args.patchall, dict_loadxl)
+ excel_reader(args.infile, n, args.update, connection, args.patchall, dict_loadxl,
+ dict_replicates, dict_exp_sets)
else:
print("Sheet name '{name}' not part of supported object types!".format(name=n))
loadxl_cycle(dict_loadxl, connection)