diff --git a/Data_Files/MicroscopyCalibration/1calibration.xls b/Data_Files/MicroscopyCalibration/1calibration.xls deleted file mode 100644 index 2e827968..00000000 Binary files a/Data_Files/MicroscopyCalibration/1calibration.xls and /dev/null differ diff --git a/Data_Files/MicroscopyCalibration/1calibration_Release.xls b/Data_Files/MicroscopyCalibration/1calibration_Release.xls deleted file mode 100644 index c9796fa2..00000000 Binary files a/Data_Files/MicroscopyCalibration/1calibration_Release.xls and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/DilutionHiC.docx b/Data_Files/Rao_et_al_2014/Protocols/DilutionHiC.docx deleted file mode 100644 index f1b29452..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/DilutionHiC.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/DilutionHiC.pdf b/Data_Files/Rao_et_al_2014/Protocols/DilutionHiC.pdf deleted file mode 100644 index ad82511c..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/DilutionHiC.pdf and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC.docx b/Data_Files/Rao_et_al_2014/Protocols/In situ HiC.docx deleted file mode 100644 index fc81986f..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_3day.docx b/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_3day.docx deleted file mode 100644 index 38ea62a0..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_3day.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_agar.docx b/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_agar.docx deleted file mode 100644 index e74dac0c..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_agar.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_sn_pellet.docx b/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_sn_pellet.docx deleted file mode 100644 index 1f5bb82f..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_sn_pellet.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_tethered.docx b/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_tethered.docx deleted file mode 100644 index 297f5caa..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_tethered.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_wo_crosslink.docx b/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_wo_crosslink.docx deleted file mode 100644 index b1d267d2..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/In situ HiC_wo_crosslink.docx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC.pdf b/Data_Files/Rao_et_al_2014/Protocols/InsituHiC.pdf deleted file mode 100644 index 903c6025..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC.pdf and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_3day.pdf b/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_3day.pdf deleted file mode 100644 index f6c2ebce..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_3day.pdf and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_agar.pdf b/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_agar.pdf deleted file mode 100644 index 740b5009..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_agar.pdf and /dev/null differ diff --git 
a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_sn_pellet.pdf b/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_sn_pellet.pdf deleted file mode 100644 index 2df7b6e9..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_sn_pellet.pdf and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_tethered.pdf b/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_tethered.pdf deleted file mode 100644 index 42533e7f..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_tethered.pdf and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_wo_crosslink.pdf b/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_wo_crosslink.pdf deleted file mode 100644 index f29a4e8e..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/InsituHiC_wo_crosslink.pdf and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/b5964093-eb15-4ab3-bd7f-0135b5c7d7c6.png b/Data_Files/Rao_et_al_2014/Protocols/b5964093-eb15-4ab3-bd7f-0135b5c7d7c6.png deleted file mode 100644 index f3ba2b90..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/b5964093-eb15-4ab3-bd7f-0135b5c7d7c6.png and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/Protocols/the_most_interesting_coder_in_the_world.png b/Data_Files/Rao_et_al_2014/Protocols/the_most_interesting_coder_in_the_world.png deleted file mode 100644 index e50d43fb..00000000 Binary files a/Data_Files/Rao_et_al_2014/Protocols/the_most_interesting_coder_in_the_world.png and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/RaoRepSetDiff.xlsx b/Data_Files/Rao_et_al_2014/RaoRepSetDiff.xlsx deleted file mode 100644 index 3cf1a4e7..00000000 Binary files a/Data_Files/Rao_et_al_2014/RaoRepSetDiff.xlsx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/del_test.xlsx b/Data_Files/Rao_et_al_2014/del_test.xlsx deleted file mode 100644 index 6072690a..00000000 Binary files a/Data_Files/Rao_et_al_2014/del_test.xlsx and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/fieldsJinSelvaraj.xls b/Data_Files/Rao_et_al_2014/fieldsJinSelvaraj.xls deleted file mode 100644 index 84599faf..00000000 Binary files a/Data_Files/Rao_et_al_2014/fieldsJinSelvaraj.xls and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/fieldsJinSelvaraj_release.xls b/Data_Files/Rao_et_al_2014/fieldsJinSelvaraj_release.xls deleted file mode 100644 index 536e0d23..00000000 Binary files a/Data_Files/Rao_et_al_2014/fieldsJinSelvaraj_release.xls and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/fieldsRao.xls b/Data_Files/Rao_et_al_2014/fieldsRao.xls deleted file mode 100644 index e5072ac2..00000000 Binary files a/Data_Files/Rao_et_al_2014/fieldsRao.xls and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/fieldsRaoRelease.xls b/Data_Files/Rao_et_al_2014/fieldsRaoRelease.xls deleted file mode 100644 index dcbf5660..00000000 Binary files a/Data_Files/Rao_et_al_2014/fieldsRaoRelease.xls and /dev/null differ diff --git a/Data_Files/Rao_et_al_2014/testing_images.xlsx b/Data_Files/Rao_et_al_2014/testing_images.xlsx deleted file mode 100644 index b357865a..00000000 Binary files a/Data_Files/Rao_et_al_2014/testing_images.xlsx and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_F121-9_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_F121-9_SOP_170314.pdf deleted file mode 100644 index 31cef313..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_F121-9_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_F123_F121-9_MEF_KOSR_SOP_161101.pdf 
b/Data_Files/SOPs/Documents/4DN_F123_F121-9_MEF_KOSR_SOP_161101.pdf deleted file mode 100644 index b63eba7b..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_F123_F121-9_MEF_KOSR_SOP_161101.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_GM12878_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_GM12878_SOP_170314.pdf deleted file mode 100644 index 91a4a69c..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_GM12878_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_H1_SOP_170305.pdf b/Data_Files/SOPs/Documents/4DN_H1_SOP_170305.pdf deleted file mode 100644 index 05dbbe4f..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_H1_SOP_170305.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_HAP1_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_HAP1_SOP_170314.pdf deleted file mode 100644 index 9f89d64e..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_HAP1_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_HEK293_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_HEK293_SOP_170314.pdf deleted file mode 100644 index b9baaccb..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_HEK293_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_HFFc6_SOP_161216.pdf b/Data_Files/SOPs/Documents/4DN_HFFc6_SOP_161216.pdf deleted file mode 100644 index 3e3163cd..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_HFFc6_SOP_161216.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_IMR90_SOP_161101.pdf b/Data_Files/SOPs/Documents/4DN_IMR90_SOP_161101.pdf deleted file mode 100644 index 8df461ab..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_IMR90_SOP_161101.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_K562_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_K562_SOP_170314.pdf deleted file mode 100644 index 35d09871..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_K562_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_U2OS_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_U2OS_SOP_170314.pdf deleted file mode 100644 index c00e5ad3..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_U2OS_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_WTC-11_SOP_150309.pdf b/Data_Files/SOPs/Documents/4DN_WTC-11_SOP_150309.pdf deleted file mode 100644 index caf43934..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_WTC-11_SOP_150309.pdf and /dev/null differ diff --git a/Data_Files/SOPs/Documents/4DN_hTERT-RPE_SOP_170314.pdf b/Data_Files/SOPs/Documents/4DN_hTERT-RPE_SOP_170314.pdf deleted file mode 100644 index 3b3ae971..00000000 Binary files a/Data_Files/SOPs/Documents/4DN_hTERT-RPE_SOP_170314.pdf and /dev/null differ diff --git a/Data_Files/SOPs/cell_lines_and_SOPs_170522.xls b/Data_Files/SOPs/cell_lines_and_SOPs_170522.xls deleted file mode 100644 index 37b068ea..00000000 Binary files a/Data_Files/SOPs/cell_lines_and_SOPs_170522.xls and /dev/null differ diff --git a/Data_Files/Sanborn_et_al_2015/Sanborn_2015_metadata_170712.xls b/Data_Files/Sanborn_et_al_2015/Sanborn_2015_metadata_170712.xls deleted file mode 100644 index cf660956..00000000 Binary files a/Data_Files/Sanborn_et_al_2015/Sanborn_2015_metadata_170712.xls and /dev/null differ diff --git a/Data_Files/Sanborn_et_al_2015/Sanborn_FASTA_upload.xls b/Data_Files/Sanborn_et_al_2015/Sanborn_FASTA_upload.xls deleted file mode 100644 index a91808eb..00000000 Binary files a/Data_Files/Sanborn_et_al_2015/Sanborn_FASTA_upload.xls and 
/dev/null differ diff --git a/Data_Files/Submission Pack/Example_rao_etal2014.xls b/Data_Files/Submission Pack/Example_rao_etal2014.xls deleted file mode 100644 index 443db42e..00000000 Binary files a/Data_Files/Submission Pack/Example_rao_etal2014.xls and /dev/null differ diff --git a/Data_Files/Submission Pack/Metadata_entry_form.xls b/Data_Files/Submission Pack/Metadata_entry_form.xls deleted file mode 100644 index b5e32d62..00000000 Binary files a/Data_Files/Submission Pack/Metadata_entry_form.xls and /dev/null differ diff --git a/Data_Files/Submission Pack/Metadata_entry_form_V2.xls b/Data_Files/Submission Pack/Metadata_entry_form_V2.xls deleted file mode 100644 index 35b42fe9..00000000 Binary files a/Data_Files/Submission Pack/Metadata_entry_form_V2.xls and /dev/null differ diff --git a/Data_Files/Submission Pack/Metadata_entry_form_V3.xls b/Data_Files/Submission Pack/Metadata_entry_form_V3.xls deleted file mode 100644 index 41032595..00000000 Binary files a/Data_Files/Submission Pack/Metadata_entry_form_V3.xls and /dev/null differ diff --git a/Data_Files/Submission Pack/keypairs.json b/Data_Files/Submission Pack/keypairs.json deleted file mode 100644 index dcb85774..00000000 --- a/Data_Files/Submission Pack/keypairs.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "default": { - "key": "ABCDEFG", - "secret": "ABCD1234567", - "server": "http://4dn-web-dev.us-east-1.elasticbeanstalk.com/" - } -} diff --git a/Data_Files/reference_files/file_reference.json b/Data_Files/reference_files/file_reference.json deleted file mode 100644 index 49e90091..00000000 --- a/Data_Files/reference_files/file_reference.json +++ /dev/null @@ -1,9 +0,0 @@ - { - "status": "uploaded", - "submitted_by": "4dndcic@gmail.com", - "file_format": "bwaIndex", - "notes": "reference genome index for hg38, for bwa", - "filename": "hg38.bwaIndex.tgz", - "lab": "4dn-dcic-lab", - "award": "1U01CA200059-01" - } diff --git a/Data_Files/reference_files/file_reference2.json b/Data_Files/reference_files/file_reference2.json deleted file mode 100644 index d777317e..00000000 --- a/Data_Files/reference_files/file_reference2.json +++ /dev/null @@ -1,9 +0,0 @@ - { - "status": "uploaded", - "submitted_by": "4dndcic@gmail.com", - "file_format": "bwaIndex", - "notes": "reference genome index for mm10, for bwa", - "filename": "mm10.bwaIndex.tgz", - "lab": "4dn-dcic-lab", - "award": "1U01CA200059-01" - } diff --git a/Data_Files/workflow_mapping/workflow_mapping_combined.json b/Data_Files/workflow_mapping/workflow_mapping_combined.json deleted file mode 100644 index f584dc96..00000000 --- a/Data_Files/workflow_mapping/workflow_mapping_combined.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "name": "hi-c-processing-parta--mm10", - "description": "workflow mapping table for Hi-C processing part A, mm10", - "workflow_name": "hi-c-processing-parta", - "workflow": "02d636b9-d82d-4da9-950c-2ca994a0943e", - "data_input_type": "experiment", - "workflow_parameters": [ - {"parameter": "bwa_index", "file_link": "4a6d10ee-2edb-4402-a98f-0edb1d58f5e1"} - ], - "experiment_parameters": [ - {"parameter": "biosample.biosource.individual.organism", "value": "mouse"}, - {"parameter": "experiment_type", "value": "in situ HiC"} - ], - "workflow_parameters": [ - {"parameter": "genome_version", "value": "mm10"} - ] -} -{ - "name": "hi-c-processing-parta--hg38", - "description": "workflow mapping table for Hi-C processing part A, hg38", - "workflow_name": "hi-c-processing-parta", - "workflow": "02d636b9-d82d-4da9-950c-2ca994a0943e", - "data_input_type": "experiment", - 
"workflow_parameters": [ - {"parameter": "bwa_index", "file_link": "1f53df95-4cf3-41cc-971d-81bb16c486dd"} - ], - "experiment_parameters": [ - {"parameter": "biosample.biosource.individual.organism", "value": "mouse"}, - {"parameter": "experiment_type", "value": "in situ HiC"} - ], - "workflow_parameters": [ - {"parameter": "genome_version", "value": "hg38"} - ] -} diff --git a/Data_Files/workflows/combine_workflow.json b/Data_Files/workflows/combine_workflow.json deleted file mode 100644 index 3a9de17d..00000000 --- a/Data_Files/workflows/combine_workflow.json +++ /dev/null @@ -1,30 +0,0 @@ -[ -{ - "title": "FastQC", - "name": "fastqc-0-11-4-1", - "workflow_type": "Data QC", - "workflow_diagram": "", - "description": "FastQC quality control step for fastq files" -}, -{ - "title": "ValidateFiles", - "name": "validate", - "workflow_type": "Data QC", - "workflow_diagram": "", - "description": "ValidateFiles quality control step for fastq files" -}, -{ - "title": "md5", - "name": "md5", - "workflow_type": "Data QC", - "workflow_diagram": "", - "description": "md5 on uncompressed file" -}, -{ - "title": "Hi-C processing part A", - "name": "hi-c-processing-parta", - "workflow_type": "Hi-C data analysis", - "workflow_diagram": "", - "description": "Creating a bam and pairs files from a pair of fastq files" -} -} diff --git a/Data_Files/workflows/workflow.json b/Data_Files/workflows/workflow.json deleted file mode 100644 index 2f0d0881..00000000 --- a/Data_Files/workflows/workflow.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "title": "FastQC", - "name": "fastqc-0-11-4-1", - "workflow_type": "Data QC", - "workflow_diagram": "", - "description": "FastQC quality control step for fastq files" -} diff --git a/Data_Files/workflows/workflow2.json b/Data_Files/workflows/workflow2.json deleted file mode 100644 index 7f4782ba..00000000 --- a/Data_Files/workflows/workflow2.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "title": "ValidateFiles", - "name": "validate", - "workflow_type": "Data QC", - "workflow_diagram": "", - "description": "ValidateFiles quality control step for fastq files" -} diff --git a/Data_Files/workflows/workflow3.json b/Data_Files/workflows/workflow3.json deleted file mode 100644 index 4ac0c233..00000000 --- a/Data_Files/workflows/workflow3.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "title": "md5", - "name": "md5", - "workflow_type": "Data QC", - "workflow_diagram": "", - "description": "md5 on uncompressed file" -} diff --git a/Data_Files/workflows/workflow4.json b/Data_Files/workflows/workflow4.json deleted file mode 100644 index 669cdb83..00000000 --- a/Data_Files/workflows/workflow4.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "title": "Hi-C processing part A", - "name": "hi-c-processing-parta", - "workflow_type": "Hi-C data analysis", - "workflow_diagram": "", - "description": "Creating a bam and pairs files from a pair of fastq files" -} diff --git a/README.md b/README.md index 940f568f..2183d3b4 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ If you do not yet have access, please contact [4DN Data Wranglers](mailto:suppor to get an account and [learn how to generate and save a key](https://data.4dnucleome.org/help/submitter-guide/getting-started-with-submissions#getting-connection-keys-for-the-4dn-dcic-servers). ## Generating data submission forms -To create the data submission xls forms, you can use `get_field_info`. +To create the data submission excel workbook, you can use `get_field_info`. 
 It will accept the following parameters:
 ~~~~
@@ -68,7 +68,7 @@ It will accept the following parameters:
     --nodesc       do not add the descriptions in the second line (by default they are added)
     --noenums      do not add the list of options for a field if they are specified (by default they are added)
     --comments     adds any (usually internal) comments together with enums (by default False)
-    --outfile      change the default file name "fields.xls" to a specified one
+    --outfile      change the default file name "fields.xlsx" to a specified one
     --debug        to add more debugging output
     --noadmin      if you have admin access to 4DN this option lets you generate the sheet as a non-admin user
 ~~~~
@@ -77,19 +77,19 @@ Examples generating a single sheet:
 ~~~~
 get_field_info --type Biosample
 get_field_info --type Biosample --comments
-get_field_info --type Biosample --comments --outfile biosample.xls
+get_field_info --type Biosample --comments --outfile biosample.xlsx
 ~~~~
 
 Example Workbook with all sheets:
 ~~~~
-get_field_info --outfile MetadataSheets.xls
+get_field_info --outfile MetadataSheets.xlsx
 ~~~~
 
 Examples for Workbooks using a preset option:
 ~~~~
-get_field_info --type HiC --comments --outfile exp_hic_generic.xls
-get_field_info --type ChIP-seq --comments --outfile exp_chipseq_generic.xls
-get_field_info --type FISH --comments --outfile exp_fish_generic.xls
+get_field_info --type HiC --comments --outfile exp_hic_generic.xlsx
+get_field_info --type ChIP-seq --comments --outfile exp_chipseq_generic.xlsx
+get_field_info --type FISH --comments --outfile exp_fish_generic.xlsx
 ~~~~
 
 Current presets include: `Hi-C, ChIP-seq, Repli-seq, ATAC-seq, DamID, ChIA-PET, Capture-C, FISH, SPT`
@@ -100,7 +100,7 @@ Please refer to the [submission guidelines](https://data.4dnucleome.org/help/sub
 
 After you fill out the data submission forms, you can use `import_data` to submit the metadata. The method can be used both to create new metadata items and to patch fields of existing items.
 ~~~~
-    import_data filename.xls
+    import_data filename.xlsx
 ~~~~
 
 #### Uploading vs Patching
@@ -108,7 +108,7 @@ After you fill out the data submission forms, you can use `import_data` to submi
 
 Runnning `import_data` without one of the flags described below will perform a dry run submission that will include several validation checks. It is strongly recommended to do a dry run prior to actual submission and if necessary work with a Data Wrangler to correct any errors.
 
-If there are uuid, alias, @id, or accession fields in the xls form that match existing entries in the database, you will be asked if you want to PATCH each object.
+If there are uuid, alias, @id, or accession fields in the excel form that match existing entries in the database, you will be asked if you want to PATCH each object.
@@ -131,7 +131,7 @@ Note if you are attempting to run the scripts in the wranglertools directory wit ``` python -m wranglertools.get_field_info —-type Biosource - python -m wranglertools.import_data filename.xls + python -m wranglertools.import_data filename.xlsx ``` pypi page is - https://pypi.python.org/pypi/Submit4DN @@ -158,10 +158,5 @@ To run the mark tests, or exclude them from the tests you can use the following # Run only tests with file_operation py.test -m file_operation -For a better testing experienece that also check to ensure sufficient coverage and runs linters use invoke: - -``` - invoke test -``` - -This will first run linters, if linters pass, tests will be run and if tests achieve specified minimum coverage (89% as of time of writting) pass the tests. + # skip tests that use ftp (do this when testing locally) + py.test -m "not ftp" diff --git a/poetry.lock b/poetry.lock index 59b1941c..a26c6066 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,14 +25,14 @@ requests = ">=0.14.0" [[package]] name = "awscli" -version = "1.22.88" +version = "1.22.96" description = "Universal Command Line Environment for AWS." category = "main" optional = false python-versions = ">= 3.6" [package.dependencies] -botocore = "1.24.33" +botocore = "1.24.41" colorama = ">=0.2.5,<0.4.4" docutils = ">=0.10,<0.16" PyYAML = ">=3.10,<5.5" @@ -41,11 +41,11 @@ s3transfer = ">=0.5.0,<0.6.0" [[package]] name = "beautifulsoup4" -version = "4.10.0" +version = "4.11.1" description = "Screen-scraping library" category = "main" optional = false -python-versions = ">3.0.0" +python-versions = ">=3.6.0" [package.dependencies] soupsieve = ">1.2" @@ -56,14 +56,14 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.21.33" +version = "1.21.41" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.6" [package.dependencies] -botocore = ">=1.24.33,<1.25.0" +botocore = ">=1.24.41,<1.25.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.5.0,<0.6.0" @@ -72,7 +72,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.24.33" +version = "1.24.41" description = "Low-level, data-driven core of boto 3." category = "main" optional = false @@ -126,7 +126,7 @@ toml = ["tomli"] [[package]] name = "dcicutils" -version = "3.11.0" +version = "3.12.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" category = "main" optional = false @@ -189,6 +189,14 @@ urllib3 = ">=1.21.1" develop = ["requests (>=2.0.0,<3.0.0)", "nose", "coverage", "mock", "pyyaml", "nosexcover", "numpy", "pandas", "sphinx (<1.7)", "sphinx-rtd-theme"] requests = ["requests (>=2.4.0,<3.0.0)"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "gitdb" version = "4.0.9" @@ -228,6 +236,17 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "openpyxl" +version = "3.0.9" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "py" version = "1.11.0" @@ -397,7 +416,7 @@ python-versions = ">=3.6" [[package]] name = "soupsieve" -version = "2.3.1" +version = "2.3.2.post1" description = "A modern CSS selector implementation for Beautiful Soup." 
category = "main" optional = false @@ -504,26 +523,10 @@ WebOb = ">=1.2" docs = ["Sphinx (>=1.8.1)", "docutils", "pylons-sphinx-themes (>=1.0.8)"] tests = ["nose (<1.3.0)", "coverage", "mock", "pastedeploy", "wsgiproxy2", "pyquery"] -[[package]] -name = "xlrd3" -version = "1.1.0" -description = "Library for developers to extract data from Microsoft Excel (tm) spreadsheet files" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "xlwt" -version = "1.3.0" -description = "Library to create spreadsheet files compatible with MS Excel 97/2000/XP/2003 XLS files, on any platform, with Python 2.6, 2.7, 3.3+" -category = "main" -optional = false -python-versions = "*" - [metadata] lock-version = "1.1" python-versions = ">=3.6.1,<3.10" -content-hash = "234a7ebb5e92b15805d1d255fb9e46e4834896a51cea3089838bfef4a7f734ec" +content-hash = "ade46840a838f52351947e30550944b3d014ee0541b2b883ea67fa2d4a8e141f" [metadata.files] attrs = [ @@ -535,20 +538,20 @@ aws-requests-auth = [ {file = "aws_requests_auth-0.4.3-py2.py3-none-any.whl", hash = "sha256:646bc37d62140ea1c709d20148f5d43197e6bd2d63909eb36fa4bb2345759977"}, ] awscli = [ - {file = "awscli-1.22.88-py3-none-any.whl", hash = "sha256:fac66bcbb965f2fbdf6260b1ab2ed27b0cae121fc44c087f80ea6a093413aad1"}, - {file = "awscli-1.22.88.tar.gz", hash = "sha256:7f06de8f0713e29882f19afafae6f131df93b85f4ee2270e7a7b48d8696162b7"}, + {file = "awscli-1.22.96-py3-none-any.whl", hash = "sha256:7659d905d4b986ba37b9acef51f4647a26f7b692671d77e1e785b6d136575d7f"}, + {file = "awscli-1.22.96.tar.gz", hash = "sha256:7fc5e23aadb6d6b238a7ff45c56ce9d2621019b48d09437049b4c73d21cb8f0a"}, ] beautifulsoup4 = [ - {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, - {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, + {file = "beautifulsoup4-4.11.1-py3-none-any.whl", hash = "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30"}, + {file = "beautifulsoup4-4.11.1.tar.gz", hash = "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"}, ] boto3 = [ - {file = "boto3-1.21.33-py3-none-any.whl", hash = "sha256:9d8ddfefe0c4a993423e2c40831034c78fcb7b3425bf3610cf0087301dd9098b"}, - {file = "boto3-1.21.33.tar.gz", hash = "sha256:c06b9b29f80da8cf6d9fac8f41d74a74d0f5347927acf11b15428b295fcbdd31"}, + {file = "boto3-1.21.41-py3-none-any.whl", hash = "sha256:3b5a3c701bb9b48e145723c1a72d2752fa3613f6e5aaf0660ef605bf13f82bb4"}, + {file = "boto3-1.21.41.tar.gz", hash = "sha256:986aa67e577b44b65a6cc42bfd74593e6d95e2dbc0b0c37e358f2253b00542db"}, ] botocore = [ - {file = "botocore-1.24.33-py3-none-any.whl", hash = "sha256:16ca4a2b72fef8caaf0eeb423dbf6cd64938442c4b9f96deb672468229e5e3f9"}, - {file = "botocore-1.24.33.tar.gz", hash = "sha256:ea5fd180082030a6c33fa19bf011d72970f3ed23cfff1b41413069e325768103"}, + {file = "botocore-1.24.41-py3-none-any.whl", hash = "sha256:65be3e46bb10056c7e6407303928c649e7ea8a01e0e3f449d5cbb6b73f8146e0"}, + {file = "botocore-1.24.41.tar.gz", hash = "sha256:57ed9843e5b6a74f82530b31dc622c8042b1c62a5ca9d46e92d2b4366450039b"}, ] certifi = [ {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, @@ -612,8 +615,8 @@ coverage = [ {file = "coverage-6.2.tar.gz", hash = "sha256:e2cad8093172b7d1595b4ad66f24270808658e11acf43a8f95b41276162eb5b8"}, ] dcicutils = [ - {file = 
"dcicutils-3.11.0-py3-none-any.whl", hash = "sha256:a44f45dbff7d183fbce3cb9eafe7faad459fc456a13e1af048834116eb4fe88b"}, - {file = "dcicutils-3.11.0.tar.gz", hash = "sha256:2118dbb2fda27266799f148ff0a17f4560134bbd2145784bddfc100798de9915"}, + {file = "dcicutils-3.12.0-py3-none-any.whl", hash = "sha256:421be771e89d5734ba18c42b0c2e0227def94b67653d4c90169e6ec9db2cd7a3"}, + {file = "dcicutils-3.12.0.tar.gz", hash = "sha256:ae604a15ac6ee6ac8c462fee5311b7da78292d6670caf3766acaad76034b6d27"}, ] docker = [ {file = "docker-4.4.4-py2.py3-none-any.whl", hash = "sha256:f3607d5695be025fa405a12aca2e5df702a57db63790c73b927eb6a94aac60af"}, @@ -628,6 +631,10 @@ elasticsearch = [ {file = "elasticsearch-6.8.1-py2.py3-none-any.whl", hash = "sha256:540d633afcc0a32972e4b489c4559c9a96e294850853238f7a18b1cbd267c2ed"}, {file = "elasticsearch-6.8.1.tar.gz", hash = "sha256:a8062a00b61bc7babeea028530667583a68ecb1a9f59ab0b22ff7feaf70d3564"}, ] +et-xmlfile = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] gitdb = [ {file = "gitdb-4.0.9-py3-none-any.whl", hash = "sha256:8033ad4e853066ba6ca92050b9df2f89301b8fc8bf7e9324d412a63f8bf1a8fd"}, {file = "gitdb-4.0.9.tar.gz", hash = "sha256:bac2fd45c0a1c9cf619e63a90d62bdc63892ef92387424b855792a6cabe789aa"}, @@ -644,6 +651,10 @@ jmespath = [ {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, ] +openpyxl = [ + {file = "openpyxl-3.0.9-py2.py3-none-any.whl", hash = "sha256:8f3b11bd896a95468a4ab162fc4fcd260d46157155d1f8bfaabb99d88cfcf79f"}, + {file = "openpyxl-3.0.9.tar.gz", hash = "sha256:40f568b9829bf9e446acfffce30250ac1fa39035124d55fc024025c41481c90f"}, +] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, @@ -757,8 +768,8 @@ smmap = [ {file = "smmap-5.0.0.tar.gz", hash = "sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936"}, ] soupsieve = [ - {file = "soupsieve-2.3.1-py3-none-any.whl", hash = "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb"}, - {file = "soupsieve-2.3.1.tar.gz", hash = "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"}, + {file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"}, + {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, ] structlog = [ {file = "structlog-19.2.0-py2.py3-none-any.whl", hash = "sha256:6640e6690fc31d5949bc614c1a630464d3aaa625284aeb7c6e486c3010d73e12"}, @@ -792,11 +803,3 @@ webtest = [ {file = "WebTest-2.0.35-py2.py3-none-any.whl", hash = "sha256:44ddfe99b5eca4cf07675e7222c81dd624d22f9a26035d2b93dc8862dc1153c6"}, {file = "WebTest-2.0.35.tar.gz", hash = "sha256:aac168b5b2b4f200af4e35867cf316712210e3d5db81c1cbdff38722647bb087"}, ] -xlrd3 = [ - {file = "xlrd3-1.1.0-py2.py3-none-any.whl", hash = "sha256:8e8e808f938144e7936a6e07c1d57be7a0f6c6f5b37c9c67974b43246d8aacb6"}, - {file = "xlrd3-1.1.0.tar.gz", hash = 
"sha256:20e6ed2e5f7f8b4ab61e30faffebceff6fab348332b4c915373f0a72742dc177"}, -] -xlwt = [ - {file = "xlwt-1.3.0-py2.py3-none-any.whl", hash = "sha256:a082260524678ba48a297d922cc385f58278b8aa68741596a87de01a9c628b2e"}, - {file = "xlwt-1.3.0.tar.gz", hash = "sha256:c59912717a9b28f1a3c2a98fd60741014b06b043936dcecbc113eaaada156c88"}, -] diff --git a/pyproject.toml b/pyproject.toml index cf3150f8..57356fe6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "Submit4DN" -version = "2.1.1" +version = "2.2.0" description = "Utility package for submitting data to the 4DN Data Portal" authors = ["4DN-DCIC Team "] license = "MIT" @@ -15,8 +15,7 @@ packages = [ python = ">=3.6.1,<3.10" python-magic = ">=0.4.12" attrs = ">=21.4" -xlrd3 = "1.1.0" -xlwt = "1.3.0" +openpyxl = "^3.0.9" dcicutils = ">=3.11.0" # awscli is not directly imported but is required for aws cp operation awscli = "^1.22.88" diff --git a/tests/conftest.py b/tests/conftest.py index b9008f46..8daa2d4d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,8 @@ # flake8: noqa import pytest from wranglertools.get_field_info import FDN_Key, FDN_Connection +from pathlib import Path +import openpyxl class MockedResponse(object): @@ -365,6 +367,18 @@ def embed_properties(): } +@pytest.fixture +def workbooks(): + workbooks = {} + WORKBOOK_DIR = './tests/data_files/workbooks/' + filenames = Path(WORKBOOK_DIR).glob('*.xlsx') + for fn in filenames: + if fn.name.startswith('~'): + continue + workbooks[fn.name] = openpyxl.load_workbook(fn) + return workbooks + + @pytest.fixture def file_metadata(): from collections import OrderedDict @@ -414,6 +428,15 @@ def returned_award_schema(): return MockedResponse(data, 200) +@pytest.fixture +def returned_bcc_schema(): + """ trimmed schema with examples of various permutations of properties including descriptions, comments, enums, suggested enums + and also a calcprop and schema version that have import-item permission for testing sheet generation + """ + data = {"title": "Cell culture details for biosample preparation","id": "/profiles/biosample_cell_culture.json","$schema": "http://json-schema.org/draft-04/schema#","type": "object","required": ["culture_start_date", "award", "lab"],"identifyingProperties": ["uuid", "aliases"],"additionalProperties": False,"mixinProperties": [{ "$ref": "mixins.json#/schema_version" },],"mixinFacets" : [{ "$ref": "mixins.json#/facets_common" }],"properties": {"description": {"title": "Description","description": "A short description of the cell culture procedure - eg. 
Details on culturing a preparation of K562 cells","type": "string","lookup": 20,"formInput": "textarea"},"culture_start_date": {"title": "Culture start date","description": "YYYY-MM-DD format date for most recently thawed cell culture.","comment": "Date can be submitted in as YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSTZD","type": "string","lookup": 40,"anyOf": [{"format": "date-time"},{"format": "date"}]},"culture_harvest_date": {"title": "Culture harvest date","type": "string","lookup": 50,"anyOf": [{"format": "date-time"},{"format": "date"}]},"culture_duration":{"title":"Total Days in Culture","description":"Total number of culturing days since receiving original vial","type":"number","lookup": 60},"passage_number": {"title": "Passage Number","type": "integer","lookup": 70},"protocols_additional": {"title": "Additional Protocols used in Cell Culture","description": "Protocols including additional culture manipulations such as stem cell differentiation or cell cycle synchronization.","type": "array","lookup": 150,"items": {"title": "Culture Protocol","type": "string","linkTo": "Protocol"}},"in_vitro_differentiated": {"title": "Differentiated in vitro","description": "Relevant for pluripotent and stem cell lines - set to Yes if cells have undergone in vitro differentiation","type": "string","enum": ["Yes", "No"],"default": "No","lookup": 98},"tissue": {"title": "Differentiation Tissue/Cell Type","description": "The resulting tissue or cell type for cells that have undergone differentiation.","type": "string","linkTo": "OntologyTerm","lookup": 99,"suggested_enum": ["cardiac muscle myoblast","cardiac muscle cell"]},"synchronization_stage": {"title": "Synchronization Stage","description": "If a culture is synchronized the cell cycle stage from which the biosample used in an experiment is prepared","type": "string","lookup": 120,"suggested_enum": ["non synchronized","G1",],"ignored_enum": ["3h after mitotic shakeoff"]},"@type":{"calculatedProperty":True,"title":"Type","type":"array","items":{"type":"string"}}, "schema_version":{"requestMethod":[],"type":"string","default":"1","pattern":"^\\d+(\\.\\d+)*$","title":"Schema Version", "permission":"import_items"}}} + return MockedResponse(data, 200) + + @pytest.fixture def returned_vendor_schema(): data = {"title":"Vendor","description":"Schema for submitting an originating lab or vendor.","id":"/profiles/vendor.json","$schema":"http://json-schema.org/draft-04/schema#","type":"object","required":["title"],"identifyingProperties":["uuid","name"],"additionalProperties":False,"mixinProperties":[{"$ref":"mixins.json#/schema_version"},{"$ref":"mixins.json#/uuid"},{"$ref":"mixins.json#/status"},{"$ref":"mixins.json#/notes"},{"$ref":"mixins.json#/submitted"},{"$ref":"mixins.json#/attribution"},{"$ref":"mixins.json#/aliases"}],"properties":{"aliases":{"type":"array","default":[],"uniqueItems":True,"title":"Lab aliases","description":"Lab specific identifiers to reference an object.","items":{"comment":"Current convention is colon separated lab name and lab identifier. (e.g. 
john-doe:42).","pattern":"^\\S+:\\S+","uniqueKey":"alias","title":"Lab alias","description":"A lab specific identifier to reference an object.","type":"string"}},"award":{"comment":"See award.json for list of available identifiers.","title":"Grant","description":"Grant associated with the submission.","linkTo":"Award","type":"string"},"lab":{"description":"Lab associated with the submission.","linkSubmitsFor":True,"title":"Lab","comment":"See lab.json for list of available identifiers.","linkTo":"Lab","type":"string"},"date_created":{"anyOf":[{"format":"date-time"},{"format":"date"}],"serverDefault":"now","readonly":True,"type":"string","comment":"Do not submit, value is assigned by the server. The date the object is created.","title":"Date created","rdfs:subPropertyOf":"dc:created","permission":"import_items"},"submitted_by":{"serverDefault":"userid","readonly":True,"type":"string","comment":"Do not submit, value is assigned by the server. The user that created the object.","linkTo":"User","title":"Submitted by","rdfs:subPropertyOf":"dc:creator","permission":"import_items"},"notes":{"elasticsearch_mapping_index_type":{"title":"Field mapping index type","description":"Defines one of three types of indexing available","type":"string","enum":["analyzed","not_analyzed","no"],"default":"analyzed"},"description":"DCIC internal notes.","type":"string","title":"Notes"},"status":{"readonly":True,"default":"in review by lab","title":"Status","type":"string","enum":["released","current","revoked","deleted","replaced","in review by lab","in review by project","released to project"],"permission":"import_items"},"uuid":{"serverDefault":"uuid4","readonly":True,"requestMethod":"POST","type":"string","title":"UUID","format":"uuid","permission":"import_items"},"schema_version":{"default":"1","pattern":"^\\d+(\\.\\d+)*$","requestMethod":[],"title":"Schema Version","comment":"Do not submit, value is assigned by the server. The version of the JSON schema that the server uses to validate the object. Schema version indicates generation of schema used to save version to to enable upgrade steps to work. Individual schemas should set the default.","type":"string"},"description":{"title":"Description","description":"A plain text description of the source.","type":"string","default":""},"title":{"title":"Name","description":"The complete name of the originating lab or vendor. 
","type":"string"},"name":{"uniqueKey":True,"type":"string","description":"DON'T SUBMIT, auto-generated, use for referencing vendors in other sheets.","pattern":"^[a-z0-9\\-]+$"},"url":{"title":"URL","description":"An external resource with additional information about the source.","type":"string","format":"uri"},"@type":{"calculatedProperty":True,"title":"Type","type":"array","items":{"type":"string"}},"@id":{"calculatedProperty":True,"title":"ID","type":"string"}},"boost_values":{"name":1,"title":1},"@type":["JSONSchema"]} diff --git a/tests/data_files/Document_insert.xls b/tests/data_files/Document_insert.xls deleted file mode 100644 index 33c7d16f..00000000 Binary files a/tests/data_files/Document_insert.xls and /dev/null differ diff --git a/tests/data_files/Exp_HiC_insert.xls b/tests/data_files/Exp_HiC_insert.xls deleted file mode 100644 index 735c9063..00000000 Binary files a/tests/data_files/Exp_HiC_insert.xls and /dev/null differ diff --git a/tests/data_files/Exp_Set_Replicate_insert.xls b/tests/data_files/Exp_Set_Replicate_insert.xls deleted file mode 100644 index 4035a5eb..00000000 Binary files a/tests/data_files/Exp_Set_Replicate_insert.xls and /dev/null differ diff --git a/tests/data_files/Exp_Set_insert.xls b/tests/data_files/Exp_Set_insert.xls deleted file mode 100644 index 55b5e387..00000000 Binary files a/tests/data_files/Exp_Set_insert.xls and /dev/null differ diff --git a/tests/data_files/File_Set_insert.xls b/tests/data_files/File_Set_insert.xls deleted file mode 100644 index c136038c..00000000 Binary files a/tests/data_files/File_Set_insert.xls and /dev/null differ diff --git a/tests/data_files/File_fastq_insert.xls b/tests/data_files/File_fastq_insert.xls deleted file mode 100644 index 331fbfd5..00000000 Binary files a/tests/data_files/File_fastq_insert.xls and /dev/null differ diff --git a/tests/data_files/Ftp_file_test.xls b/tests/data_files/Ftp_file_test.xls deleted file mode 100644 index ad51a452..00000000 Binary files a/tests/data_files/Ftp_file_test.xls and /dev/null differ diff --git a/tests/data_files/Ftp_file_test_md5.xls b/tests/data_files/Ftp_file_test_md5.xls deleted file mode 100644 index a2974015..00000000 Binary files a/tests/data_files/Ftp_file_test_md5.xls and /dev/null differ diff --git a/tests/data_files/GFI_test_Experiment_Set_reference.xls b/tests/data_files/GFI_test_Experiment_Set_reference.xls deleted file mode 100644 index 6d9e843b..00000000 Binary files a/tests/data_files/GFI_test_Experiment_Set_reference.xls and /dev/null differ diff --git a/tests/data_files/GFI_test_vendor_reference.xls b/tests/data_files/GFI_test_vendor_reference.xls deleted file mode 100644 index 3fb7e3e9..00000000 Binary files a/tests/data_files/GFI_test_vendor_reference.xls and /dev/null differ diff --git a/tests/data_files/Pseudo_wfr_insert.xls b/tests/data_files/Pseudo_wfr_insert.xls deleted file mode 100644 index 0c9717c4..00000000 Binary files a/tests/data_files/Pseudo_wfr_insert.xls and /dev/null differ diff --git a/tests/data_files/Vendor.xls b/tests/data_files/Vendor.xls deleted file mode 100644 index cf4d5c75..00000000 Binary files a/tests/data_files/Vendor.xls and /dev/null differ diff --git a/tests/data_files/Vendor_insert.xls b/tests/data_files/Vendor_insert.xls deleted file mode 100644 index 62eb269e..00000000 Binary files a/tests/data_files/Vendor_insert.xls and /dev/null differ diff --git a/tests/data_files/Vendor_ordered_reference.xls b/tests/data_files/Vendor_ordered_reference.xls deleted file mode 100644 index ae4ad691..00000000 Binary files 
a/tests/data_files/Vendor_ordered_reference.xls and /dev/null differ diff --git a/tests/data_files/test_cell_values.xls b/tests/data_files/test_cell_values.xls deleted file mode 100644 index 6844a511..00000000 Binary files a/tests/data_files/test_cell_values.xls and /dev/null differ diff --git a/tests/data_files/workbooks/Document_insert.xlsx b/tests/data_files/workbooks/Document_insert.xlsx new file mode 100644 index 00000000..940f136c Binary files /dev/null and b/tests/data_files/workbooks/Document_insert.xlsx differ diff --git a/tests/data_files/workbooks/Exp_HiC_insert.xlsx b/tests/data_files/workbooks/Exp_HiC_insert.xlsx new file mode 100644 index 00000000..0ffc0566 Binary files /dev/null and b/tests/data_files/workbooks/Exp_HiC_insert.xlsx differ diff --git a/tests/data_files/workbooks/Exp_Set_Replicate_insert.xlsx b/tests/data_files/workbooks/Exp_Set_Replicate_insert.xlsx new file mode 100644 index 00000000..b3540295 Binary files /dev/null and b/tests/data_files/workbooks/Exp_Set_Replicate_insert.xlsx differ diff --git a/tests/data_files/workbooks/Exp_Set_insert.xlsx b/tests/data_files/workbooks/Exp_Set_insert.xlsx new file mode 100644 index 00000000..54faa29c Binary files /dev/null and b/tests/data_files/workbooks/Exp_Set_insert.xlsx differ diff --git a/tests/data_files/FileFastq_pairing.xlsx b/tests/data_files/workbooks/FileFastq_pairing.xlsx similarity index 100% rename from tests/data_files/FileFastq_pairing.xlsx rename to tests/data_files/workbooks/FileFastq_pairing.xlsx diff --git a/tests/data_files/workbooks/File_fastq_insert.xlsx b/tests/data_files/workbooks/File_fastq_insert.xlsx new file mode 100644 index 00000000..62ffdc66 Binary files /dev/null and b/tests/data_files/workbooks/File_fastq_insert.xlsx differ diff --git a/tests/data_files/workbooks/File_fastq_upload.xlsx b/tests/data_files/workbooks/File_fastq_upload.xlsx new file mode 100644 index 00000000..b1ab01e5 Binary files /dev/null and b/tests/data_files/workbooks/File_fastq_upload.xlsx differ diff --git a/tests/data_files/workbooks/Ftp_file_test.xlsx b/tests/data_files/workbooks/Ftp_file_test.xlsx new file mode 100644 index 00000000..2c206908 Binary files /dev/null and b/tests/data_files/workbooks/Ftp_file_test.xlsx differ diff --git a/tests/data_files/workbooks/Ftp_file_test_md5.xlsx b/tests/data_files/workbooks/Ftp_file_test_md5.xlsx new file mode 100644 index 00000000..4ddf338f Binary files /dev/null and b/tests/data_files/workbooks/Ftp_file_test_md5.xlsx differ diff --git a/tests/data_files/workbooks/Pseudo_wfr_insert.xlsx b/tests/data_files/workbooks/Pseudo_wfr_insert.xlsx new file mode 100644 index 00000000..4c8d571b Binary files /dev/null and b/tests/data_files/workbooks/Pseudo_wfr_insert.xlsx differ diff --git a/tests/data_files/workbooks/Vendor.xlsx b/tests/data_files/workbooks/Vendor.xlsx new file mode 100644 index 00000000..9a3bea57 Binary files /dev/null and b/tests/data_files/workbooks/Vendor.xlsx differ diff --git a/tests/data_files/workbooks/Vendor_insert.xlsx b/tests/data_files/workbooks/Vendor_insert.xlsx new file mode 100644 index 00000000..5d23e43c Binary files /dev/null and b/tests/data_files/workbooks/Vendor_insert.xlsx differ diff --git a/tests/data_files/workbooks/test_cell_values.xlsx b/tests/data_files/workbooks/test_cell_values.xlsx new file mode 100644 index 00000000..a8ae1c81 Binary files /dev/null and b/tests/data_files/workbooks/test_cell_values.xlsx differ diff --git a/tests/test_get_field_info.py b/tests/test_get_field_info.py index 35fdd435..e154ef6a 100644 --- 
a/tests/test_get_field_info.py +++ b/tests/test_get_field_info.py @@ -1,7 +1,9 @@ import wranglertools.get_field_info as gfi import pytest from operator import itemgetter -import xlrd3 +import openpyxl +from pathlib import Path +import os # test data is in conftest.py @@ -344,85 +346,100 @@ def test_get_uploadable_fields_mock(connection_mock, mocker, returned_vendor_sch def xls_to_list(xls_file, sheet): """To compare xls files to reference ones, return a sorted list of content.""" - return_list = [] - wb = xlrd3.open_workbook(xls_file) - read_sheet = wb.sheet_by_name(sheet) - cols = read_sheet.ncols - rows = read_sheet.nrows - for row_idx in range(rows): - row_val = [] - for col_idx in range(cols): - cell_value = str(read_sheet.cell(row_idx, col_idx)) - - row_val.append(cell_value) - return_list.append(row_val) - return return_list.sort(key=itemgetter(1)) + wb = openpyxl.load_workbook(xls_file) + return sorted([value for row in wb[sheet].values for value in row if value]) def xls_field_order(xls_file, sheet): - # returns list of fields (in order) in an excel sheet - wb = xlrd3.open_workbook(xls_file).sheet_by_name(sheet) - return [str(wb.cell_value(0, col)) for col in range(1, wb.ncols)] + ''' returns the list of fields in the order they appear in an excel sheet + removes the commented out first col header + ''' + wb = openpyxl.load_workbook(xls_file) + return list(next(wb[sheet].values))[1:] @pytest.mark.file_operation -def test_create_xls_vendor(connection_mock, mocker, returned_vendor_schema): - xls_file = "./tests/data_files/GFI_test_vendor.xls" - xls_ref_file = "./tests/data_files/GFI_test_vendor_reference.xls" - import os +def test_create_xlsx_default_options(connection_mock, mocker, returned_bcc_schema): + """ creates a workbook with one BiosampleCellCulture sheet with default options for populating rows + schema used is a fixture with a trimmed version that cotains properties to test various permutations + """ + EXPECTED = [ + '#Additional Info:', '#Description:', '#Field Name:', '#Field Type:', '*culture_start_date', + '-', '-', '-', '-', '-', '-', + 'A short description of the cell culture procedure - eg. 
Details on culturing a preparation of K562 cells', + "Choices:['Yes', 'No']", "Choices:['cardiac muscle myoblast', 'cardiac muscle cell']", "Choices:['non synchronized', 'G1']", + 'If a culture is synchronized the cell cycle stage from which the biosample used in an experiment is prepared', + 'Item:OntologyTerm', 'Protocols including additional culture manipulations such as stem cell differentiation or cell cycle synchronization.', + 'Relevant for pluripotent and stem cell lines - set to Yes if cells have undergone in vitro differentiation', + 'The resulting tissue or cell type for cells that have undergone differentiation.', + 'Total number of culturing days since receiving original vial', 'YYYY-MM-DD format date for most recently thawed cell culture.', + 'array of Item:Protocol', 'culture_duration', 'culture_harvest_date', 'description', 'in_vitro_differentiated', + 'integer', 'number', 'passage_number', 'protocols_additional', 'string', 'string', 'string', 'string', 'string', + 'synchronization_stage', 'tissue' + ] + xls_file = "./tests/data_files/workbooks/GFI_test_bcc_sheet.xlsx" try: - os.remove(xls_file) + os.remove(xls_file) # file should be created by the test except OSError: pass - mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_vendor_schema.json()) - field_dict = gfi.get_uploadable_fields(connection_mock, ['Vendor']) - gfi.create_xls(field_dict, xls_file) - assert os.path.isfile(xls_file) - assert xls_to_list(xls_file, "Vendor") == xls_to_list(xls_ref_file, "Vendor") + mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_bcc_schema.json()) + field_dict = gfi.get_uploadable_fields(connection_mock, ['BiosampleCellCulture']) + gfi.create_excel(field_dict, xls_file) + assert Path(xls_file).is_file() + assert xls_to_list(xls_file, "BiosampleCellCulture") == EXPECTED try: os.remove(xls_file) - except OSError: - pass + except OSError as e: + assert False + print("Cleanup needed! 
{}".format(e)) @pytest.mark.file_operation -def test_create_xls_lookup_order(connection_mock, mocker, returned_vendor_schema_l): - xls_file = "./tests/data_files/GFI_test_vendor_lookup.xls" - ref_list = ['aliases', '*title', 'description', 'contributing_labs', 'tags', 'url'] - import os +def test_create_xlsx_non_defaults(connection_mock, mocker, returned_bcc_schema): + xls_file = "./tests/data_files/workbooks/GFI_test_bcc_sheet.xlsx" + EXPECTED = [ + '#Additional Info:', '#Description:', '#Field Name:', '#Field Type:', + '*culture_start_date', '-', '-', '-', '-', '-', '-', '-', '-', + 'Date can be submitted in as YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSTZD', + 'Item:OntologyTerm', 'array of Item:Protocol', 'culture_duration', + 'culture_harvest_date', 'description', 'in_vitro_differentiated', 'integer', + 'number', 'passage_number', 'protocols_additional', 'string', + 'string', 'string', 'string', 'string', 'synchronization_stage', 'tissue' + ] try: os.remove(xls_file) except OSError: pass - mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_vendor_schema_l.json()) - field_dict = gfi.get_uploadable_fields(connection_mock, ['Vendor']) - gfi.create_xls(field_dict, xls_file) + mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_bcc_schema.json()) + field_dict = gfi.get_uploadable_fields(connection_mock, ['BiosampleCellCulture'], no_description=True, include_comments=True, no_enums=True) + gfi.create_excel(field_dict, xls_file) assert os.path.isfile(xls_file) - assert xls_field_order(xls_file, "Vendor") == ref_list + assert xls_to_list(xls_file, "BiosampleCellCulture") == EXPECTED try: os.remove(xls_file) - except OSError: - pass + except OSError as e: + assert False + print("Cleanup needed! {}".format(e)) @pytest.mark.file_operation -def test_create_xls_experiment_set(connection_mock, mocker, returned_experiment_set_schema): - xls_file = "./tests/data_files/GFI_test_Experiment_Set.xls" - xls_ref_file = "./tests/data_files/GFI_test_Experiment_Set_reference.xls" - import os +def test_create_xls_lookup_order(connection_mock, mocker, returned_vendor_schema_l): + xls_file = "./tests/data_files/workbooks/GFI_test_vendor_lookup.xlsx" + ref_list = ['aliases', '*title', 'description', 'contributing_labs', 'tags', 'url'] try: os.remove(xls_file) except OSError: pass - mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_experiment_set_schema.json()) - field_dict = gfi.get_uploadable_fields(connection_mock, ['ExperimentSet'], True, True, True) - gfi.create_xls(field_dict, xls_file) - assert os.path.isfile(xls_file) - assert xls_to_list(xls_file, "ExperimentSet") == xls_to_list(xls_ref_file, "ExperimentSet") + mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_vendor_schema_l.json()) + field_dict = gfi.get_uploadable_fields(connection_mock, ['Vendor']) + gfi.create_excel(field_dict, xls_file) + assert Path(xls_file).is_file() + assert xls_field_order(xls_file, "Vendor") == ref_list try: os.remove(xls_file) - except OSError: - pass + except OSError as e: + assert False + print("Cleanup needed! 
{}".format(e)) def test_get_sheet_names(capfd): diff --git a/tests/test_import_data.py b/tests/test_import_data.py index e9fcadd1..33963e1a 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -76,8 +76,10 @@ def test_attachment_not_accepted(): @pytest.mark.file_operation -def test_reader(vendor_raw_xls_fields): - readxls = imp.reader('./tests/data_files/Vendor.xls') +def test_reader_no_sheet_name(vendor_raw_xls_fields, workbooks): + sheet = 'Vendor' + sheetkey = "{}.xlsx".format(sheet) + readxls = imp.reader(workbooks.get(sheetkey)) for n, row in enumerate(readxls): # reader deletes the trailing space in description (at index 3.8) if n == 2: @@ -87,8 +89,10 @@ def test_reader(vendor_raw_xls_fields): @pytest.mark.file_operation -def test_reader_with_sheetname(vendor_raw_xls_fields): - readxls = imp.reader('./tests/data_files/Vendor.xls', 'Vendor') +def test_reader_with_sheetname(vendor_raw_xls_fields, workbooks): + sheet = 'Vendor' + sheetkey = "{}.xlsx".format(sheet) + readxls = imp.reader(workbooks.get(sheetkey), sheet) for n, row in enumerate(readxls): # reader deletes the trailing space in description (at index 3.8) if n == 2: @@ -98,17 +102,23 @@ def test_reader_with_sheetname(vendor_raw_xls_fields): @pytest.mark.file_operation -def test_reader_wrong_sheetname(): - readxls = imp.reader('./tests/data_files/Vendor.xls', 'Enzyme') - list_readxls = list(readxls) - assert list_readxls == [] +def test_reader_wrong_sheetname(capsys): + msg = "string indices must be integers\nEnzyme\nERROR: Can not find the collection sheet in excel file (openpyxl error)\n" + sheet = 'Vendor' + sheetkey = "{}.xlsx".format(sheet) + readxls = imp.reader(sheetkey, 'Enzyme') + assert readxls is None + out = capsys.readouterr()[0] + assert out == msg -@pytest.mark.file_operation -def test_cell_value(): - readxls = imp.reader('./tests/data_files/test_cell_values.xls') +def test_cell_value(workbooks): + readxls = imp.reader(workbooks.get('test_cell_values.xlsx')) list_readxls = list(readxls) - assert list_readxls == [['BOOLEAN', '1'], ['NUMBER', '10'], ['DATE', '2016-09-02']] + assert list_readxls == [ + ['BOOLEAN', True], ['INT', 10100], ['FLOAT', 5.5], ['DATE', '2016-09-02'], + ['STRDATE', '2022-01-01'], ['STRING', 'testing'] + ] def test_formatter_gets_ints_correctly(): @@ -246,12 +256,12 @@ def test_error_report(connection_mock): err_dict = {"title": "Unprocessable Entity", "status": "error", "errors": [ - {"name": "protocol_documents", - "description": "'dcic:insituhicagar' not found", "location": "body"}, - {"name": "age", - "description": "'at' is not of type 'number'", "location": "body"}, - {"name": "sex", - "description": "'green' is not one of ['male', 'female', 'unknown', 'mixed']", "location": "body"}], + {"name": "protocol_documents", + "description": "'dcic:insituhicagar' not found", "location": "body"}, + {"name": "age", + "description": "'at' is not of type 'number'", "location": "body"}, + {"name": "sex", + "description": "'green' is not one of ['male', 'female', 'unknown', 'mixed']", "location": "body"}], "code": 422, "@type": ["ValidationFailure", "Error"], "description": "Failed validation"} @@ -298,12 +308,20 @@ def test_fix_attribution(connection_mock): assert result_json['award'] == 'test_award' -# these tests will be replaced with dryrun tests - @pytest.mark.file_operation -def test_excel_reader_no_update_no_patchall_new_doc_with_attachment(capsys, mocker, connection_mock): +def test_digest_xlsx(workbooks): + WORKBOOK_DIR = './tests/data_files/workbooks/' + for fn, 
workbook in workbooks.items(): + book, sheets = imp.digest_xlsx(WORKBOOK_DIR + fn) + assert sheets == workbook.sheetnames + for sheet in sheets: + assert book[sheet].max_row == workbook[sheet].max_row + assert book[sheet].max_column == workbook[sheet].max_column + + +def test_workbooks_reader_no_update_no_patchall_new_doc_with_attachment(capsys, mocker, connection_mock, workbooks): # test new item submission without patchall update tags and check the return message - test_insert = './tests/data_files/Document_insert.xls' + test_insert = 'Document_insert.xlsx' dict_load = {} dict_rep = {} dict_set = {} @@ -313,90 +331,71 @@ def test_excel_reader_no_update_no_patchall_new_doc_with_attachment(capsys, mock mocker.patch('wranglertools.import_data.remove_deleted', return_value={}) # mocking the test post line mocker.patch('dcicutils.ff_utils.post_metadata', return_value={'status': 'success'}) - imp.excel_reader(test_insert, 'Document', False, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, ['attachment']) + imp.workbook_reader(workbooks.get(test_insert), 'Document', False, connection_mock, False, + all_aliases, dict_load, dict_rep, dict_set, True, ['attachment']) args = imp.remove_deleted.call_args attach = args[0][0]['attachment'] assert attach['href'].startswith('data:image/jpeg;base64') -# @pytest.mark.file_operation -# def test_excel_reader_no_update_no_patchall_new_item(capsys, mocker, connection): -# # test new item submission without patchall update tags and check the return message -# test_insert = './tests/data_files/Vendor_insert.xls' -# dict_load = {} -# dict_rep = {} -# dict_set = {} -# message = "This looks like a new row but the update flag wasn't passed, use --update to post new data" -# post_json = {'lab': 'sample-lab', -# 'description': 'Sample description', -# 'award': 'SampleAward', -# 'title': 'Sample Vendor', -# 'url': 'https://www.sample_vendor.com/', -# 'aliases': ['dcic:sample_vendor']} -# mocker.patch('wranglertools.import_data.get_existing', return_value={}) -# imp.excel_reader(test_insert, 'Vendor', False, connection, False, dict_load, dict_rep, dict_set, True) -# args = imp.get_existing.call_args -# assert args[0][0] == post_json -# out = capsys.readouterr()[0] -# assert out.strip() == message - - -# @pytest.mark.file_operation -# def test_excel_reader_no_update_no_patchall_existing_item(capsys, mocker, connection): -# # test exisiting item submission without patchall update tags and check the return message -# test_insert = "./tests/data_files/Vendor_insert.xls" -# dict_load = {} -# dict_rep = {} -# dict_set = {} -# message = "VENDOR(1) : 0 posted / 0 not posted 0 patched / 1 not patched, 0 errors" -# post_json = {'lab': 'sample-lab', -# 'description': 'Sample description', -# 'award': 'SampleAward', -# 'title': 'Sample Vendor', -# 'url': 'https://www.sample_vendor.com/', -# 'aliases': ['dcic:sample_vendor']} -# existing_vendor = {'uuid': 'sample_uuid'} -# mocker.patch('wranglertools.import_data.get_existing', return_value=existing_vendor) -# imp.excel_reader(test_insert, 'Vendor', False, connection, False, dict_load, dict_rep, dict_set, True) -# args = imp.get_existing.call_args -# assert args[0][0] == post_json -# out = capsys.readouterr()[0] -# assert out.strip() == message - -# @pytest.mark.file_operation -@pytest.mark.ftp -def test_excel_reader_post_ftp_file_upload(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Ftp_file_test_md5.xls' +def test_workbook_reader_no_update_no_patchall_existing_item(capsys, mocker, 
connection_mock, workbooks): + # test existing item submission without patchall update tags and check the return message + test_insert = "Vendor_insert.xlsx" + dict_load = {} + dict_rep = {} + dict_set = {} + message = "VENDOR(1) : 0 posted / 0 not posted 0 patched / 1 not patched, 0 errors\n" + post_json = {'lab': 'sample-lab', + 'description': 'Sample description', + 'award': 'SampleAward', + 'title': 'Sample Vendor', + 'url': 'https://www.sample_vendor.com/', + 'aliases': ['dcic:sample_vendor']} + existing_vendor = {'uuid': 'sample_uuid'} + mocker.patch('wranglertools.import_data.get_existing', return_value=existing_vendor) + mocker.patch('wranglertools.import_data.ff_utils.patch_metadata', + return_value={'status': 'success', '@graph': [{'uuid': 'uid1', '@id': '/vendor/test'}]}) + imp.workbook_reader(workbooks.get(test_insert), 'Vendor', False, connection_mock, False, {}, dict_load, dict_rep, dict_set, True, []) + out = capsys.readouterr() + args = imp.get_existing.call_args + assert args[0][0] == post_json + assert out[0] == message + + +def test_workbook_reader_post_ftp_file_upload(capsys, mocker, connection_mock, workbooks): + test_insert = 'Ftp_file_test_md5.xlsx' dict_load = {} dict_rep = {} dict_set = {} all_aliases = {} - message0_1 = "INFO: Attempting to download file from this url to your computer before upload " - message0_2 = "ftp://speedtest.tele2.net/1KB.zip" - message1 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors" + message1 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors\n" e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} # mock fetching existing info, return None mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock upload file and skip mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) + # mock the ftp copy - this should get its own tests + mocker.patch('wranglertools.import_data.ftp_copy', return_value=(True, {'md5sum': '0f343b0931126a20f133d67c2b018a3b'}, '1KB.zip')) + # mock file deletion + mocker.patch('wranglertools.import_data.pp.Path.unlink') # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.excel_reader(test_insert, 'FileCalibration', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'FileCalibration', True, connection_mock, False, + all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] - outlist = [i.strip() for i in out.split('\n') if i.strip()] post_json_arg = args[0][0] assert post_json_arg['md5sum'] == '0f343b0931126a20f133d67c2b018a3b' - assert message0_1 + message0_2 == outlist[0] - assert message1 == outlist[1] + assert message1 == out -# @pytest.mark.file_operation -@pytest.mark.ftp -def test_excel_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Ftp_file_test.xls' +def test_workbook_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_mock, workbooks): + """ This appears mainly to be testing the ftp_copy function - confirming that + the correct error messages are generated when you try to copy an ftp file without + including an md5sum in the post and subsequently that the workbook_reader function + will still post the metadata without uploading a file + """ + test_insert = 'Ftp_file_test.xlsx' + dict_load = {} + dict_rep = {}
dict_set = {} @@ -411,8 +410,8 @@ def test_excel_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_moc mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.excel_reader(test_insert, 'FileCalibration', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'FileCalibration', True, connection_mock, False, + all_aliases, dict_load, dict_rep, dict_set, True, []) out = capsys.readouterr()[0] outlist = [i.strip() for i in out.split('\n') if i.strip()] assert message0 == outlist[0] @@ -421,14 +420,18 @@ @pytest.mark.file_operation -def test_excel_reader_update_new_experiment_post_and_file_upload(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Exp_HiC_insert.xls' +def test_workbook_reader_update_new_file_fastq_post_and_file_upload(capsys, mocker, connection_mock, workbooks): + """ This appears mainly to be testing the md5 function - confirming that + the correct output is generated, that the md5sum is as expected, + and that the workbook_reader function posts the metadata with the expected output + """ + test_insert = 'File_fastq_upload.xlsx' dict_load = {} dict_rep = {} dict_set = {} all_aliases = {} message0 = "calculating md5 sum for file ./tests/data_files/example.fastq.gz" - message1 = "EXPERIMENTHIC(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors" + message1 = "FILEFASTQ(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors" e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} # mock fetching existing info, return None mocker.patch('wranglertools.import_data.get_existing', return_value={}) @@ -436,8 +439,8 @@ def test_excel_reader_update_new_experiment_post_and_file_upload(capsys, mocker, mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.excel_reader(test_insert, 'ExperimentHiC', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'FileFastq', True, connection_mock, False, + all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] outlist = [i.strip() for i in out.split('\n') if i is not ""] @@ -450,14 +453,18 @@ def test_excel_reader_update_new_experiment_post_and_file_upload(capsys, mocker, # a weird test that has filename in an experiment # needs to change @pytest.mark.file_operation -def test_excel_reader_patch_experiment_post_and_file_upload(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Exp_HiC_insert.xls' +def test_workbook_reader_patch_file_meta_and_file_upload(capsys, mocker, connection_mock, workbooks): + """ This appears mainly to be testing the md5 function - confirming that + the correct output is generated, that the md5sum is as expected, + and that the workbook_reader function patches the metadata with the expected output + """ + test_insert = 'File_fastq_upload.xlsx' dict_load = {} dict_rep = {} dict_set = {} all_aliases = {} message0 = "calculating md5 sum for file ./tests/data_files/example.fastq.gz" - message1 = "EXPERIMENTHIC(1) : 0 posted / 0 not posted 1 patched / 0 not patched, 0 errors"
+ message1 = "FILEFASTQ(1) : 0 posted / 0 not posted 1 patched / 0 not patched, 0 errors" existing_exp = {'uuid': 'sample_uuid', 'status': "uploading"} e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', @@ -472,8 +479,8 @@ def test_excel_reader_patch_experiment_post_and_file_upload(capsys, mocker, conn mocker.patch('dcicutils.ff_utils.patch_metadata', return_value=e) # mock get upload creds mocker.patch('wranglertools.import_data.get_upload_creds', return_value="new_creds") - imp.excel_reader(test_insert, 'ExperimentHiC', False, connection_mock, True, - all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'FileFastq', False, connection_mock, True, + all_aliases, dict_load, dict_rep, dict_set, True, []) # check for md5sum args = imp.ff_utils.patch_metadata.call_args post_json_arg = args[0][0] @@ -489,9 +496,8 @@ def test_excel_reader_patch_experiment_post_and_file_upload(capsys, mocker, conn assert message1 == outlist[1] -@pytest.mark.file_operation -def test_excel_reader_update_new_filefastq_post(capsys, mocker, connection_mock): - test_insert = './tests/data_files/File_fastq_insert.xls' +def test_workbook_reader_update_new_filefastq_meta_post(capsys, mocker, connection_mock, workbooks): + test_insert = 'File_fastq_insert.xlsx' dict_load = {} dict_rep = {} dict_set = {} @@ -506,8 +512,8 @@ def test_excel_reader_update_new_filefastq_post(capsys, mocker, connection_mock) mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.excel_reader(test_insert, 'FileFastq', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'FileFastq', True, connection_mock, False, + all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] print([i for i in args]) @@ -515,9 +521,8 @@ def test_excel_reader_update_new_filefastq_post(capsys, mocker, connection_mock) assert args[0][0] == final_post -@pytest.mark.file_operation -def test_excel_reader_update_new_replicate_set_post(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Exp_Set_Replicate_insert.xls' +def test_workbook_reader_update_new_replicate_set_post(capsys, mocker, connection_mock, workbooks): + test_insert = 'Exp_Set_Replicate_insert.xlsx' dict_load = {} dict_rep = {'sample_repset': [{'replicate_exp': 'awesome_uuid', 'bio_rep_no': 1.0, 'tec_rep_no': 1.0}]} dict_set = {} @@ -531,17 +536,16 @@ def test_excel_reader_update_new_replicate_set_post(capsys, mocker, connection_m mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock upload file and skip mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.excel_reader(test_insert, 'ExperimentSetReplicate', True, connection_mock, - False, all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'ExperimentSetReplicate', True, connection_mock, + False, all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] assert message == out.strip() assert args[0][0] == final_post -@pytest.mark.file_operation -def test_excel_reader_update_new_experiment_set_post(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Exp_Set_insert.xls' +def test_workbook_reader_update_new_experiment_set_post(capsys, mocker, 
connection_mock, workbooks): + test_insert = 'Exp_Set_insert.xlsx' dict_load = {} dict_rep = {} dict_set = {'sample_expset': ['awesome_uuid']} @@ -554,17 +558,16 @@ def test_excel_reader_update_new_experiment_set_post(capsys, mocker, connection_ mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock upload file and skip mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.excel_reader(test_insert, 'ExperimentSet', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'ExperimentSet', True, connection_mock, False, + all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] assert message == out.strip() assert args[0][0] == final_post -@pytest.mark.file_operation -def test_user_workflow_reader_wfr_post(capsys, mocker, connection_mock): - test_insert = './tests/data_files/Pseudo_wfr_insert.xls' +def test_user_workflow_reader_wfr_post(capsys, mocker, connection_mock, workbooks): + test_insert = 'Pseudo_wfr_insert.xlsx' sheet_name = 'user_workflow_1' message = "USER_WORKFLOW_1(1) : 1 posted / 0 not posted - patched / - not patched, 0 errors" @@ -618,7 +621,7 @@ def test_user_workflow_reader_wfr_post(capsys, mocker, connection_mock): 'object_key': '4DNFIGOJW3XZ.pairs.gz', 'uuid': '0292e08e-facf-4a16-a94e-59606f2bfc71'} ]) mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.user_workflow_reader(test_insert, sheet_name, connection_mock) + imp.user_workflow_reader(workbooks.get(test_insert), sheet_name, connection_mock) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] print([i for i in args]) @@ -668,10 +671,12 @@ def test_verify_and_return_item_bad_item(mocker, connection_mock): @pytest.mark.file_operation def test_cabin_cross_check_dryrun(mocker, connection_mock, capsys): + """ checks that the filename passed in is a file and otherwise treats as normal dryrun + """ mocker.patch('wranglertools.import_data._verify_and_return_item', side_effect=[ {'awards': '/awards/test_award/'}, {'@id': '/awards/test_award/'} ]) - imp.cabin_cross_check(connection_mock, False, False, './tests/data_files/Exp_Set_insert.xls', False, None, None) + imp.cabin_cross_check(connection_mock, False, False, './tests/data_files/workbooks/Exp_Set_insert.xlsx', False, None, None) out = capsys.readouterr()[0] message = ''' Running on: https://data.4dnucleome.org/ @@ -708,7 +713,6 @@ def test_cabin_cross_check_remote_w_single_lab_award(mocker, connection_mock, ca assert out.strip() == message.strip() -@pytest.mark.skip # invalid mock use, needs refactor def test_cabin_cross_check_not_remote_w_lab_award_options(mocker, connection_mock, capsys): mocker.patch('wranglertools.import_data.pp.Path.is_file', return_value=True) mocker.patch.object(connection_mock, 'prompt_for_lab_award', return_value='blah') @@ -866,19 +870,12 @@ def test_cabin_cross_check_remote_w_award_not_for_lab_options(mocker, connection connection_mock.labs = ['test_lab', '/labs/bing-ren-lab'] imp.cabin_cross_check(connection_mock, False, False, 'blah', True, '/labs/bing-ren-lab/', '/awards/non-ren-lab-award/') -# with pytest.raises(SystemExit): -# Disabled - public account is not compatible with the connection object at the moment -# # TODO: use mastertest tests for this purpose -# def test_get_collections(connection_public): -# all_cols = imp.get_collections(connection_public) -# assert len(all_cols) > 10 - -def 
test_get_all_aliases(): - wb = "./tests/data_files/Exp_Set_insert.xls" +def test_get_all_aliases(workbooks): + wbname = "Exp_Set_insert.xlsx" sheet = ["ExperimentSet"] my_aliases = {'sample_expset': 'ExperimentSet'} - all_aliases = imp.get_all_aliases(wb, sheet) + all_aliases = imp.get_all_aliases(workbooks.get(wbname), sheet) assert my_aliases == all_aliases @@ -1159,10 +1156,11 @@ def test_file_pair_chk_sheets_w_no_aliases_col_skipped(): @pytest.mark.file_operation -def test_file_pair_chk_multiple_aliases(): +def test_file_pair_chk_multiple_aliases(workbooks): """This file contains multiple aliases and various ways to link the paired files If the check is running properly, should not see any errors.""" - fastq_rows = imp.reader('./tests/data_files/FileFastq_pairing.xlsx', sheetname='FileFastq') + wbname = 'FileFastq_pairing.xlsx' + fastq_rows = imp.reader(workbooks.get(wbname), sheetname='FileFastq') pair_errs = imp.check_file_pairing(fastq_rows) assert not pair_errs diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index 6c516851..7935a7ca 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -4,13 +4,13 @@ import argparse from dcicutils import ff_utils import attr -import xlwt +import openpyxl import sys import json EPILOG = ''' - To create an xls file with sheets to be filled use the example and modify to your needs. + To create an excel workbook file with sheets to be filled use the examples below and modify to your needs. It will accept the following optional parameters. --keyfile the path to the file where you have stored your access key info (default ~/keypairs.json) --key the name of the key identifier for the access key and secret in your keys file (default=default) @@ -18,7 +18,7 @@ --nodesc do not add the descriptions in the second line (by default they are added) --noenums do not add the list of options for a field if they are specified (by default they are added) --comments adds any (usually internal) comments together with enums (by default False) - --outfile change the default file name "fields.xls" to a specified one + --outfile change the default file name "fields.xlsx" to a specified one --debug to add more debugging output --noadmin if you have admin access to 4DN this option lets you generate the sheet as a non-admin user @@ -37,8 +37,8 @@ To change the result filename use --outfile flag followed by the new file name - %(prog)s --type Biosample --outfile biosample_only.xls - %(prog)s --type Biosample --type Experiment --outfile my_selection.xls + %(prog)s --type Biosample --outfile biosample_only.xlsx + %(prog)s --type Biosample --type Experiment --outfile my_selection.xlsx ''' @@ -95,8 +95,8 @@ def getArgs(): # pragma: no cover action='store_true', help="Do not include enums (or suggestions) for fields.") parser.add_argument('--outfile', - default='fields.xls', - help="The name of the output file. Default is fields.xls") + default='fields.xlsx', + help="The name of the output file. 
Default is fields.xlsx") parser.add_argument('--noadmin', default=False, action='store_true', @@ -151,7 +151,7 @@ def __init__(self, key4dn): sys.exit(1) if me_page.get('submits_for') is not None: # get all the labs that the user making the connection submits_for - self.labs = [l['@id'] for l in me_page['submits_for']] + self.labs = [lp['@id'] for lp in me_page['submits_for']] # take the first one as default value for the connection - reset in # import_data if needed by calling set_lab_award self.lab = self.labs[0] @@ -390,33 +390,31 @@ def get_uploadable_fields(connection, types, no_description=False, return fields -def create_xls(all_fields, filename): +def create_excel(all_fields, filename): ''' - fields being a dictionary of sheet -> FieldInfo(objects) - create one sheet per dictionary item, with three columns of fields - for fieldname, description and enum + all_fields being a dictionary of sheet/Item names -> list of FieldInfo(objects) + create one sheet per dictionary item, that inserts 4 commented header rows for each column + that corresponds to one of the FieldInfo objects in the list + header rows are for fieldname, fieldtype, description and comments/enums ''' - wb = xlwt.Workbook() - # text styling for all columns - style = xlwt.XFStyle() - style.num_format_str = "@" + wb = openpyxl.Workbook() + wb.remove(wb.active) # removes the by default created empty sheet named Sheet # order sheets sheet_list = [(sheet, all_fields[sheet]) for sheet in sheet_order if sheet in all_fields.keys()] for obj_name, fields in sheet_list: - ws = wb.add_sheet(obj_name) - ws.write(0, 0, "#Field Name:") - ws.write(1, 0, "#Field Type:") - ws.write(2, 0, "#Description:") - ws.write(3, 0, "#Additional Info:") - # add empty formatting for first column - for i in range(100): - ws.write(4+i, 0, '', style) + ws = wb.create_sheet(title=obj_name) + ws.cell(row=1, column=1, value="#Field Name:") + ws.cell(row=2, column=1, value="#Field Type:") + ws.cell(row=3, column=1, value="#Description:") + ws.cell(row=4, column=1, value="#Additional Info:") # order fields in sheet based on lookup numbers, then alphabetically for col, field in enumerate(sorted(sorted(fields), key=lambda x: x.lookup)): - ws.write(0, col+1, str(field.name), style) - ws.write(1, col+1, str(field.ftype), style) + ws.cell(row=1, column=col+2, value=str(field.name)) + ws.cell(row=2, column=col+2, value=str(field.ftype)) + description = '' if field.desc: - ws.write(2, col+1, str(field.desc), style) + description = str(field.desc) + ws.cell(row=3, column=col+2, value=description) # combine comments and Enum add_info = '' if field.comm: @@ -425,10 +423,7 @@ def create_xls(all_fields, filename): add_info += "Choices:" + str(field.enum) if not field.comm and not field.enum: add_info = "-" - ws.write(3, col+1, add_info, style) - # add empty formatting for all columns - for i in range(100): - ws.write(4+i, col+1, '', style) + ws.cell(row=4, column=col+2, value=add_info) wb.save(filename) @@ -488,7 +483,7 @@ def main(): # pragma: no cover if args.outfile: file_name = args.outfile - create_xls(fields, file_name) + create_excel(fields, file_name) if __name__ == '__main__': diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index 336d2344..43491306 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -8,7 +8,9 @@ sheet_order, FDN_Key, FDN_Connection, create_common_arg_parser, _remove_all_from_types) from dcicutils import ff_utils -import xlrd3 as xlrd +import openpyxl +import warnings # to suppress openpxl 
warning about headers +from openpyxl.utils.exceptions import InvalidFileException import datetime import sys import mimetypes @@ -228,46 +230,86 @@ def attachment(path): return attach -def reader(filename, sheetname=None): +def digest_xlsx(filename): + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + book = openpyxl.load_workbook(filename) + except InvalidFileException as e: + if filename.endswith('.xls'): + print("WARNING - Old xls format not supported - please save your workbook as xlsx") + else: + print("ERROR - ", e) + sys.exit(1) + sheets = book.sheetnames + return book, sheets + + +def reader(workbook, sheetname=None): """Read named sheet or first and only sheet from xlsx file.""" - book = xlrd.open_workbook(filename) if sheetname is None: - sheet, = book.sheets() + sheet = workbook.worksheets[0] else: try: - sheet = book.sheet_by_name(sheetname) - except xlrd.XLRDError: + sheet = workbook[sheetname] + except Exception as e: + print(e) print(sheetname) - print("ERROR: Can not find the collection sheet in excel file (xlrd error)") + print("ERROR: Can not find the collection sheet in excel file (openpyxl error)") + return + # Generator that gets rows from excel sheet + # NB we have a lot of empty no formatting rows added (can we get rid of that) + # or do we need to be careful to check for the first totally emptyvalue row? + return row_generator(sheet) + + +def row_generator(sheet): + """Generator that gets rows from excel sheet + Note that this currently checks to see if a row is empty and if so stops + This is needed as plain text formatting of cells is recognized as data + """ + for row in sheet.rows: + vals = [cell_value(cell) for cell in row] + if not any([v for v in vals]): return - datemode = sheet.book.date_mode - for index in range(sheet.nrows): - yield [cell_value(cell, datemode) for cell in sheet.row(index)] + else: + yield vals -def cell_value(cell, datemode): - """Get cell value from excel.""" - # This should be always returning text format if the excel is generated - # by the get_field_info command - ctype = cell.ctype +def cell_value(cell): + """Get cell value from excel. 
[From Submit4DN]""" + ctype = cell.data_type value = cell.value - if ctype == xlrd.XL_CELL_ERROR: # pragma: no cover - raise ValueError(repr(cell), 'cell error') - elif ctype == xlrd.XL_CELL_BOOLEAN: - return str(value).upper().strip() - elif ctype == xlrd.XL_CELL_NUMBER: - if value.is_integer(): - value = int(value) - return str(value).strip() - elif ctype == xlrd.XL_CELL_DATE: - value = xlrd.xldate_as_tuple(value, datemode) - if value[3:] == (0, 0, 0): - return datetime.date(*value[:3]).isoformat() - else: # pragma: no cover - return datetime.datetime(*value).isoformat() - elif ctype in (xlrd.XL_CELL_TEXT, xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK): + if ctype == openpyxl.cell.cell.TYPE_ERROR: # pragma: no cover + raise ValueError('Cell %s contains a cell error' % str(cell.coordinate)) + elif ctype == openpyxl.cell.cell.TYPE_BOOL: + boolstr = str(value).strip() + if boolstr == 'TRUE': + return True + elif boolstr == 'FALSE': + return False + else: + return value + elif ctype in (openpyxl.cell.cell.TYPE_NUMERIC, openpyxl.cell.cell.TYPE_NULL): + if isinstance(value, float): + if value.is_integer(): + value = int(value) + if not value: + return '' + return value + elif isinstance(value, openpyxl.cell.cell.TIME_TYPES): + if isinstance(value, datetime.datetime): + if value.time() == datetime.time(0, 0, 0): + return value.date().isoformat() + else: # pragma: no cover + return value.isoformat() + else: + return value.isoformat() + elif ctype in (openpyxl.cell.cell.TYPE_STRING, openpyxl.cell.cell.TYPE_INLINE): return value.strip() - raise ValueError(repr(cell), 'unknown cell type') # pragma: no cover + raise ValueError( + 'Cell %s is not an acceptable cell type' % str(cell.coordinate) + ) # pragma: no cover def data_formatter(value, val_type, field=None): @@ -283,6 +325,8 @@ def data_formatter(value, val_type, field=None): elif val_type in ["list", "array"]: data_list = value.strip("[\']").split(",") return [data.strip() for data in data_list] + elif val_type == 'boolean': + return value else: # default assumed to be string return str(value).strip() @@ -328,15 +372,10 @@ def get_sub_field_number(field_name): return 0 -# @attr.s -# class FieldInfo(object): -# name = attr.ib() -# field_type = attr.ib(default=u'') -# value = attr.ib(default=u'') - - def build_field(field, field_data, field_type): - if not field_data or not field: + if field_data is False: + pass + elif not field_data or not field: return None patch_field_name = get_field_name(field) if not field_type: @@ -393,18 +432,18 @@ def get_existing(post_json, connection): temp = {} uuids = [] for an_id in all_ids: - try: - temp = ff_utils.get_metadata(an_id, key=connection.key, add_on="frame=object") - except Exception as e: - exc = parse_exception(e) - # if the item does not exist get_metadata will raise an exceptions - # see if the exception message has 404, then continue, if not throw that exception - if exc['code'] == 404: - temp = {} - else: - raise e - if temp.get("uuid"): - uuids.append(temp.get("uuid")) + try: + temp = ff_utils.get_metadata(an_id, key=connection.key, add_on="frame=object") + except Exception as e: + exc = parse_exception(e) + # if the item does not exist get_metadata will raise an exceptions + # see if the exception message has 404, then continue, if not throw that exception + if exc['code'] == 404: + temp = {} + else: + raise e + if temp.get("uuid"): + uuids.append(temp.get("uuid")) # check if all existing identifiers point to the same object unique_uuids = list(set(uuids)) @@ -483,6 +522,7 @@ def 
validate_field(field_data, field_type, aliases_by_type, connection): to_trim = 'array of embedded objects, ' is_array = False msg = None + field_data = data_formatter(field_data, field_type) if field_type.startswith(to_trim): field_type = field_type.replace(to_trim, '') if 'array' in field_type: @@ -494,6 +534,8 @@ def validate_field(field_data, field_type, aliases_by_type, connection): elif 'string' in field_type: strings = _convert_to_array(field_data, is_array) msg = validate_string(strings, aliases_by_type) + elif 'boolean' in field_type: + pass # for now return msg @@ -973,7 +1015,7 @@ def check_file_pairing(fastq_row): paired_end = row[pair_idx] if pair_idx else None saw_pair = False for i, fld in enumerate(row): - if fld.strip() == 'paired with': + if isinstance(fld, str) and fld.strip() == 'paired with': if saw_pair: err = 'single row with multiple paired_with values' errors = _add_e_to_edict(aliases[0], err, errors) @@ -1034,15 +1076,17 @@ def check_file_pairing(fastq_row): return _pairing_consistency_check(files, errors) -def excel_reader(datafile, sheet, update, connection, patchall, aliases_by_type, - dict_patch_loadxl, dict_replicates, dict_exp_sets, novalidate, attach_fields): - """takes an excel sheet and post or patched the data in.""" +def workbook_reader(workbook, sheet, update, connection, patchall, aliases_by_type, + dict_patch_loadxl, dict_replicates, dict_exp_sets, novalidate, attach_fields): + """takes an openpyxl workbook object and posts, patches or does a dry run on the data depending + on the options passed in. + """ # determine right from the top if dry run dryrun = not(update or patchall) all_aliases = [k for k in aliases_by_type] # dict for acumulating cycle patch data patch_loadxl = [] - row = reader(datafile, sheetname=sheet) + row = reader(workbook, sheetname=sheet) skip_dryrun = False if sheet == "ExperimentMic_Path": skip_dryrun = True @@ -1065,23 +1109,28 @@ def excel_reader(datafile, sheet, update, connection, patchall, aliases_by_type, if sheet == "FileFastq" and not novalidate: # check for consistent file pairing of fastqs in the sheet - pair_errs = check_file_pairing(reader(datafile, sheetname=sheet)) + pair_errs = check_file_pairing(reader(workbook, sheetname=sheet)) for f, err in sorted(pair_errs.items()): for e in err: print('WARNING: ', f, '\t', e) # iterate over the rows for values in row: - # Delete trailing commas and spaces - values = [item.strip(', ') for item in values] # Rows that start with # are skipped if values[0].startswith("#"): continue # Get rid of the first empty cell values.pop(0) total += 1 + clean_values = [] + for item in values: + try: + # strip trailing commas and spaces if a str + clean_values.append(item.strip(', ')) + except AttributeError: + clean_values.append(item) # build post_json and get existing if available - post_json = OrderedDict(zip(keys, values)) + post_json = OrderedDict(zip(keys, clean_values)) # Get existing data if available # existing_data = get_existing(post_json, connection) @@ -1096,6 +1145,15 @@ def excel_reader(datafile, sheet, update, connection, patchall, aliases_by_type, # if we get this far continue to build the json post_json = build_patch_json(post_json, fields2types) + + # # validate the row by fields and data_types + # if not novalidate: + # row_errors = pre_validate_json(post_json, fields2types, aliases_by_type, connection) + # if row_errors: + # error += 1 + # pre_validate_errors.extend(row_errors) + # invalid = True + # continue filename_to_post = post_json.get('filename') post_json, 
existing_data, file_to_upload, extrafiles = populate_post_json( post_json, connection, sheet, attach_fields) @@ -1307,9 +1365,9 @@ def build_tibanna_json(keys, types, values, connection): return template -def user_workflow_reader(datafile, sheet, connection): +def user_workflow_reader(workbook, sheet, connection): """takes the user workflow runsheet and ony post it to fourfront endpoint.""" - row = reader(datafile, sheetname=sheet) + row = reader(workbook, sheetname=sheet) keys = next(row) # grab the first row of headers types = next(row) # grab second row with type info # remove title column @@ -1591,12 +1649,14 @@ def main(): # pragma: no cover connection = FDN_Connection(key) cabin_cross_check(connection, args.patchall, args.update, args.infile, args.remote, args.lab, args.award) + # support for xlsx only - adjust if allowing different + workbook, sheetnames = digest_xlsx(args.infile) + # This is not in our documentation, but if single sheet is used, file name can be the collection if args.type and 'all' not in args.type: names = args.type else: - book = xlrd.open_workbook(args.infile) - names = book.sheet_names() + names = sheetnames # get me a list of all the data_types in the system profiles = get_profiles(connection) supported_collections = get_collections(profiles) @@ -1604,7 +1664,7 @@ def main(): # pragma: no cover # we want to read through names in proper upload order sorted_names = order_sorter(names) # get all aliases from all sheets for dryrun object connections tests - aliases_by_type = get_all_aliases(args.infile, sorted_names) + aliases_by_type = get_all_aliases(workbook, sorted_names) # all_aliases = list(aliases_by_type.keys()) # dictionaries that accumulate information during submission dict_loadxl = {} @@ -1614,14 +1674,14 @@ def main(): # pragma: no cover # accumulate = {dict_loadxl: {}, dict_replicates: {}, dict_exp_sets: {}} for n in sorted_names: if n.lower() in supported_collections: - excel_reader(args.infile, n, args.update, connection, args.patchall, aliases_by_type, - dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) + workbook_reader(workbook, n, args.update, connection, args.patchall, aliases_by_type, + dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) elif n.lower() == "experimentmic_path": - excel_reader(args.infile, "ExperimentMic_Path", args.update, connection, args.patchall, aliases_by_type, - dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) + workbook_reader(workbook, "ExperimentMic_Path", args.update, connection, args.patchall, aliases_by_type, + dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) elif n.lower().startswith('user_workflow'): if args.update: - user_workflow_reader(args.infile, n, connection) + user_workflow_reader(workbook, n, connection) else: print('user workflow sheets will only be processed with the --update argument') else: @@ -1640,4 +1700,4 @@ def main(): # pragma: no cover if __name__ == '__main__': - main() + main()
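For readers following the xlrd-to-openpyxl migration above, here is a rough, self-contained sketch of the new reading path (digest_xlsx, reader/row_generator and cell_value). The function and file names below are illustrative only, and the value coercion is deliberately simplified; it is not the module code itself.

import datetime
import warnings

import openpyxl


def load_book(filename):
    # open an .xlsx workbook while silencing openpyxl's header warnings,
    # in the spirit of digest_xlsx above
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return openpyxl.load_workbook(filename)


def simple_cell_value(cell):
    # collapse an openpyxl cell to a plain value (simplified take on cell_value)
    value = cell.value
    if value is None:
        return ''
    if isinstance(value, float) and value.is_integer():
        return int(value)
    if isinstance(value, datetime.datetime):
        if value.time() == datetime.time(0, 0):
            return value.date().isoformat()
        return value.isoformat()
    if isinstance(value, str):
        return value.strip()
    return value


def rows(workbook, sheetname=None):
    # yield rows of plain values, stopping at the first completely empty row,
    # since plain-text formatting can make trailing empty rows look like data
    sheet = workbook[sheetname] if sheetname else workbook.worksheets[0]
    for row in sheet.rows:
        vals = [simple_cell_value(cell) for cell in row]
        if not any(vals):
            return
        yield vals


if __name__ == '__main__':
    book = load_book('fields.xlsx')          # hypothetical workbook
    for r in rows(book, book.sheetnames[0]):
        print(r)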
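The rewritten tests above also take a workbooks fixture mapping file names to pre-loaded workbooks. The real fixture lives in conftest.py and is not shown in this diff; a minimal sketch of one possible implementation, assuming the ./tests/data_files/workbooks/ directory used in test_digest_xlsx, could look like this.

from pathlib import Path

import openpyxl
import pytest

WORKBOOK_DIR = Path('./tests/data_files/workbooks')


@pytest.fixture(scope='session')
def workbooks():
    # map each .xlsx file name, e.g. 'Vendor_insert.xlsx', to an openpyxl Workbook
    return {path.name: openpyxl.load_workbook(path)
            for path in sorted(WORKBOOK_DIR.glob('*.xlsx'))}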