Merge pull request #25 from hms-dbmi/FF-320_II

FF-320
4dn-dcic · Nov 1, 2016 · 1590173 · 1590173
2 parents e6ddcc3 + d5989d6
commit 1590173
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 128 deletions.
diff --git a/Data_Files/Rao_et_al_2014/fieldsRao.xls b/Data_Files/Rao_et_al_2014/fieldsRao.xls
diff --git a/wranglertools/fdnDCIC.py b/wranglertools/fdnDCIC.py
@@ -130,109 +130,130 @@ def md5(path):
     return md5sum.hexdigest()
 
 
+############################################################
+############################################################
+# use the following order to process the sheets
+# if name is not here, will not be processed during ordering
+############################################################
+############################################################
 sheet_order = [
-    "User",
-    "Award",
-    "Lab",
-    "Document",
-    "Protocol",
-    "Publication",
-    "Organism",
-    "IndividualMouse",
-    "IndividualHuman",
-    "Vendor",
-    "Biosource",
-    "Construct",
-    "TreatmentRnai",
-    "TreatmentChemical",
-    "GenomicRegion",
-    "Target",
-    "Modification",
-    "Image",
-    "BiosampleCellCulture",
-    "Biosample",
-    "Enzyme",
-    "FileSet",
-    "FileFastq",
-    "FileFasta",
-    "ExperimentSet",
-    "ExperimentHiC",
-    "ExperimentCaptureC"
+    "User", "Award", "Lab", "Document", "Protocol", "Publication", "Organism", "IndividualMouse", "IndividualHuman",
+    "Vendor", "Enzyme", "Biosource", "Construct", "TreatmentRnai", "TreatmentChemical",
+    "GenomicRegion", "Target", "Modification", "Image", "BiosampleCellCulture", "Biosample",
+    "FileSet", "FileFastq", "FileFasta", "ExperimentSet", "ExperimentHiC", "ExperimentCaptureC"]
+
+# Field names that filter_and_sort() drops from a sheet's field list;
+# any name listed here is left out of the ordered output.
+do_not_use = [
+    "submitted_by", "date_created", "organism", "schema_version", "accession", "uuid", "status",
+    "quality_metric_flags", "notes", "restricted", "file_size", "filename", "alternate_accessions",
+    "content_md5sum", "md5sum", "quality_metric", "files_in_set", "experiments", "experiments_in_set"]
+
+
+def filter_and_sort(list_names):
+    """Return the names in list_names that are not in do_not_use, sorted.
+
+    The input sequence is not modified; a new list in default sorted
+    order is returned.
+    """
+    return sorted(name for name in list_names if name not in do_not_use)
+
+# Field names that move_to_frond() pulls to the start of a field list.
+# Each name found is re-inserted at index 0 in turn, so entries later in
+# this list end up ahead of earlier ones ('#Field Name:' lands first when present).
+move_frond = ['award', '*award', 'lab', '*lab', 'description',
+              'title', '*title', 'name', '*name', 'aliases', '#Field Name:']
+
+
+def move_to_frond(list_names):
+    """Move every name listed in move_frond to the start of list_names.
+
+    list_names is modified in place and also returned. Names are handled
+    in move_frond order and each one found is re-inserted at index 0, so
+    the last move_frond entry that is present ends up first. Names that
+    are not in list_names are skipped.
+    """
+    for frond in move_frond:
+        try:
+            list_names.remove(frond)
+        except ValueError:
+            # this name is not among the fields, nothing to move
+            continue
+        list_names.insert(0, frond)
+    return list_names
+
+# Field names that move_to_end() pushes to the tail of a field list;
+# names found are appended one after another, so they keep this relative order.
+move_end = ['documents', 'references', 'url', 'dbxrefs']
+
+
+def move_to_end(list_names):
+    """Move every name listed in move_end to the end of list_names.
+
+    list_names is modified in place and also returned. Names are handled
+    in move_end order and each one found is re-appended, so they keep
+    their move_end order at the tail. Names that are not in list_names
+    are skipped.
+    """
+    for end in move_end:
+        try:
+            list_names.remove(end)
+        except ValueError:
+            # this name is not among the fields, nothing to move
+            continue
+        list_names.append(end)
+    return list_names
+
+# reorder individual items in sheets, [SHEET, MOVE_ITEM, MOVE_BEFORE]
+# switch_fields() matches SHEET as a substring of the sheet name, so
+# 'Experiment' also applies to ExperimentSet, ExperimentHiC, ... and
+# 'File' to FileSet, FileFastq, ...; rules are applied from top to bottom.
+reorder = [
+    ['Biosource', 'cell_line', 'SOP_cell_line'],
+    ['Biosource', 'cell_line_tier', 'SOP_cell_line'],
+    ['GenomicRegion', 'start_coordinate', 'end_coordinate'],
+    ['GenomicRegion', 'start_location', 'end_location'],
+    ['GenomicRegion', 'location_description', 'start_location'],
+    ['BiosampleCellCulture', 'protocol_documents', 'protocol_SOP_deviations'],
+    ['Biosample', 'biosample_relation.relationship_type', 'biosample_relation.biosample'],
+    ['Enzyme', 'catalog_number', 'attachment'],
+    ['Enzyme', 'recognition_sequence', 'attachment'],
+    ['Enzyme', 'site_length', 'attachment'],
+    ['Enzyme', 'cut_position', 'attachment'],
+    ['File', 'related_files.relationship_type', 'related_files.file'],
+    ['Experiment', 'average_fragment_size', 'fragment_size_range'],
+    ['Experiment', 'files', 'documents'],
+    ['Experiment', 'filesets', 'documents'],
+    ['Experiment', 'experiment_relation.relationship_type', 'documents'],
+    ['Experiment', 'experiment_relation.experiment', 'documents'],
+    ['Experiment', 'experiment_sets|0', 'documents'],
+    ['Experiment', 'experiment_sets|1', 'documents'],
+    ['Experiment', 'experiment_sets|2', 'documents'],
+    ['Experiment', 'experiment_sets|3', 'documents'],
 ]
 
 
-def order_FDN(input_xls):
-    """Order and filter created xls file."""
+def switch_fields(list_names, sheet):
+    """Apply the reorder rules that match sheet to list_names.
+
+    Each rule in reorder is [SHEET, MOVE_ITEM, MOVE_BEFORE]. For every rule
+    whose SHEET is a substring of sheet, MOVE_ITEM is placed directly
+    before MOVE_BEFORE. list_names is modified in place and also returned.
+    A rule is skipped when either field is missing, so no field is ever
+    dropped from the list.
+    """
+    for rule_sheet, move_item, move_before in reorder:
+        # to look for all experiments with "Experiment" name, it will also get ExperimentSet
+        # there are no conflicting field names
+        if rule_sheet not in sheet:
+            continue
+        # both fields must be present before anything is touched: removing
+        # the item first and then failing to find the anchor would silently
+        # lose the field
+        if move_item == move_before:
+            continue
+        if move_item not in list_names or move_before not in list_names:
+            continue
+        # remove + insert is working more consistently than the pop item method
+        list_names.remove(move_item)
+        list_names.insert(list_names.index(move_before), move_item)
+    return list_names
+
+# if object name is in the following list, fetch all current/released items and add to xls
+# maps sheet name -> item type used in the "search/?type=" query;
+# keys must equal the sheet names exactly (see sheet_order), hence "Enzyme"
+fetch_items = {
+    "Protocol": "protocol", "Enzyme": "enzyme", "Biosource": "biosource",
+    "Publication": "publication", "Vendor": "vendor"}
+
+
+def fetch_all_items(sheet, field_list, connection):
+    """For a given sheet, get the existing items of the matching type.
+
+    sheet is looked up in fetch_items; when it is not listed, None is
+    returned. Otherwise one search request is made through get_FDN and
+    every item under the '@graph' key of the response becomes one row.
+
+    Each row holds one value per entry of field_list, in the same order:
+    "#" for the "#Field Name:" column, the item's value for the field
+    otherwise, or '' when the item does not carry that field.
+    """
+    if sheet not in fetch_items:
+        return None
+    obj_id = "search/?type=" + fetch_items[sheet]
+    # query once; the response was previously requested twice and the
+    # first copy thrown away
+    items_list = get_FDN(obj_id, connection)['@graph']
+    all_items = []
+    for item in items_list:
+        item_info = ["#" if field == "#Field Name:" else item.get(field, '')
+                     for field in field_list]
+        all_items.append(item_info)
+    return all_items
 
-    do_not_use = [
-        "submitted_by",
-        "date_created",
-        "organism",
-        "schema_version",
-        "accession",
-        "uuid",
-        "status",
-        "quality_metric_flags",
-        "notes",
-        "restricted",
-        "file_size",
-        "filename",
-        "alternate_accessions",
-        "content_md5sum",
-        "md5sum",
-        "quality_metric",
-        "files_in_set",
-        "experiments",
-        "experiments_in_set"
-    ]
-
-    move_frond = [
-        'award',
-        '*award',
-        'lab',
-        '*lab',
-        'description',
-        'title',
-        '*title',
-        'name',
-        '*name',
-        'aliases',
-        '#Field Name:'
-    ]
-
-    move_end = [
-        'documents',
-        'references',
-        'url',
-        'dbxrefs'
-    ]
-
-    # reorder individual items in sheets, [SHEET, MOVE_ITEM, MOVE_BEFORE]
-    reorder = [
-        ['Biosource', 'cell_line', 'SOP_cell_line'],
-        ['Biosource', 'cell_line_tier', 'SOP_cell_line'],
-        ['GenomicRegion', 'start_coordinate', 'end_coordinate'],
-        ['GenomicRegion', 'start_location', 'end_location'],
-        ['GenomicRegion', 'location_description', 'start_location'],
-        ['BiosampleCellCulture', 'protocol_documents', 'protocol_SOP_deviations'],
-        ['Biosample', 'biosample_relation.relationship_type', 'biosample_relation.biosample'],
-        ['Enzyme', 'catalog_number', 'attachment'],
-        ['Enzyme', 'recognition_sequence', 'attachment'],
-        ['Enzyme', 'site_length', 'attachment'],
-        ['Enzyme', 'cut_position', 'attachment'],
-        ['File', 'related_files.relationship_type', 'related_files.file'],
-        ['Experiment', 'average_fragment_size', 'fragment_size_range'],
-        ['Experiment', 'files', 'documents'],
-        ['Experiment', 'filesets', 'documents'],
-        ['Experiment', 'experiment_relation.relationship_type', 'documents'],
-        ['Experiment', 'experiment_relation.experiment', 'documents'],
-        ['Experiment', 'experiment_sets|0', 'documents'],
-        ['Experiment', 'experiment_sets|1', 'documents'],
-        ['Experiment', 'experiment_sets|2', 'documents'],
-        ['Experiment', 'experiment_sets|3', 'documents'],
-
-    ]
 
+def order_FDN(input_xls, connection):
+    """Order and filter created xls file."""
     ReadFile = input_xls
     OutputFile = input_xls[:-4]+'_ordered.xls'
     bookread = xlrd.open_workbook(ReadFile)
@@ -250,49 +271,44 @@ def order_FDN(input_xls):
     if Sheets_read:
         print(Sheets_read, "not in sheet_order list, please update")
         Sheets.extend(Sheets_read)
-
     for sheet in Sheets:
         useful = []
         active_sheet = bookread.sheet_by_name(sheet)
         first_row_values = active_sheet.row_values(rowx=0)
-        for field in first_row_values:
-            if field in do_not_use:
-                pass
-            else:
-                useful.append(field)
-        useful = sorted(useful)
+        # remove items from fields in xls
+        useful = filter_and_sort(first_row_values)
         # move selected to front
-        for frond in move_frond:
-            try:
-                useful.remove(frond)
-                useful.insert(0, frond)
-            except:
-                pass
+        useful = move_to_frond(useful)
         # move selected to end
-        for end in move_end:
-            try:
-                useful.pop(useful.index(end))
-                useful.append(end)
-            except:
-                pass
+        useful = move_to_end(useful)
         # reorder some items based on reorder list
-        for sort_case in reorder:
-            if sort_case[0] in sheet:
-                try:
-                    useful.remove(sort_case[1])
-                    useful.insert(useful.index(sort_case[2]), sort_case[1])
-                except:
-                    pass
+        useful = switch_fields(useful, sheet)
+        # fetch all items for common objects
+        all_items = fetch_all_items(sheet, useful, connection)
         # create a new sheet and write the data
         new_sheet = book_w.add_sheet(sheet)
         for write_row_index, write_item in enumerate(useful):
             read_col_ind = first_row_values.index(write_item)
             column_val = active_sheet.col_values(read_col_ind)
             for write_column_index, cell_value in enumerate(column_val):
                 new_sheet.write(write_column_index, write_row_index, cell_value, style)
+        # write common objects
+        if all_items:
+            for i, item in enumerate(all_items):
+                for ix in range(len(useful)):
+                    write_column_index_II = write_column_index+1+i
+                    new_sheet.write(write_column_index_II, ix, item[ix], style)
+        else:
+            write_column_index_II = write_column_index
         # write 100 empty lines with text formatting
         for i in range(100):
             for ix in range(len(useful)):
-                new_sheet.write(write_column_index+1+i, ix, '', style)
-
+                write_column_index_III = write_column_index_II+1+i
+                new_sheet.write(write_column_index_III, ix, '', style)
     book_w.save(OutputFile)
+    ############################################################
+    ############################################################
+    # use the following order to process the sheets
+    # if name is not here, will not be processed during ordering
+    ############################################################
+    ############################################################
diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py
@@ -174,14 +174,14 @@ def get_uploadable_fields(connection, types, include_description=False,
     return fields
 
 
-def create_xls(fields, filename):
+def create_xls(all_fields, filename):
     '''
     all_fields being a dictionary of sheet -> FieldInfo(objects)
     create one sheet per dictionary item, with three columns of fields
     for fieldname, description and enum
     '''
     wb = xlwt.Workbook()
-    for obj_name, fields in fields.items():
+    for obj_name, fields in all_fields.items():
         ws = wb.add_sheet(obj_name)
         ws.write(0, 0, "#Field Name:")
         ws.write(1, 0, "#Field Type:")
@@ -220,7 +220,7 @@ def main():
         file_name = args.outfile
         create_xls(fields, file_name)
         if args.order:
-            fdnDCIC.order_FDN(file_name)
+            fdnDCIC.order_FDN(file_name, connection)
 
 if __name__ == '__main__':
     main()