From 23d82d26140477242a519b80f97cb9740810b20b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Koray=20K=C4=B1rl=C4=B1?= Date: Mon, 31 Oct 2016 18:20:03 -0400 Subject: [PATCH 1/4] FF-320 #comment simply the code --- wranglertools/fdnDCIC.py | 235 ++++++++++++++++---------------- wranglertools/get_field_info.py | 4 +- 2 files changed, 117 insertions(+), 122 deletions(-) diff --git a/wranglertools/fdnDCIC.py b/wranglertools/fdnDCIC.py index e0d38901..db5068ed 100644 --- a/wranglertools/fdnDCIC.py +++ b/wranglertools/fdnDCIC.py @@ -130,109 +130,110 @@ def md5(path): return md5sum.hexdigest() +############################################################ +############################################################ +# use the following order to process the sheets +# if name is not here, will not be processed during ordering +############################################################ +############################################################ sheet_order = [ - "User", - "Award", - "Lab", - "Document", - "Protocol", - "Publication", - "Organism", - "IndividualMouse", - "IndividualHuman", - "Vendor", - "Biosource", - "Construct", - "TreatmentRnai", - "TreatmentChemical", - "GenomicRegion", - "Target", - "Modification", - "Image", - "BiosampleCellCulture", - "Biosample", - "Enzyme", - "FileSet", - "FileFastq", - "FileFasta", - "ExperimentSet", - "ExperimentHiC", - "ExperimentCaptureC" -] + "User", "Award", "Lab", "Document", "Protocol", "Publication", "Organism", "IndividualMouse", "IndividualHuman", + "Vendor", "Enzyme", "Biosource", "Construct", "TreatmentRnai", "TreatmentChemical", + "GenomicRegion", "Target", "Modification", "Image", "BiosampleCellCulture", "Biosample", + "FileSet", "FileFastq", "FileFasta", "ExperimentSet", "ExperimentHiC", "ExperimentCaptureC"] +do_not_use = [ + "submitted_by", "date_created", "organism", "schema_version", "accession", "uuid", "status", + "quality_metric_flags", "notes", "restricted", "file_size", "filename", "alternate_accessions", + "content_md5sum", "md5sum", "quality_metric", "files_in_set", "experiments", "experiments_in_set"] -def order_FDN(input_xls): - """Order and filter created xls file.""" - do_not_use = [ - "submitted_by", - "date_created", - "organism", - "schema_version", - "accession", - "uuid", - "status", - "quality_metric_flags", - "notes", - "restricted", - "file_size", - "filename", - "alternate_accessions", - "content_md5sum", - "md5sum", - "quality_metric", - "files_in_set", - "experiments", - "experiments_in_set" - ] +def filter_and_sort(list_names): + """Filter and sort fields""" + useful = [] + for field in list_names: + if field in do_not_use: + pass + else: + useful.append(field) + # sort alphabetically + useful = sorted(useful) + return useful - move_frond = [ - 'award', - '*award', - 'lab', - '*lab', - 'description', - 'title', - '*title', - 'name', - '*name', - 'aliases', - '#Field Name:' - ] +move_frond = ['award', '*award', 'lab', '*lab', 'description', + 'title', '*title', 'name', '*name', 'aliases', '#Field Name:'] - move_end = [ - 'documents', - 'references', - 'url', - 'dbxrefs' - ] - # reorder individual items in sheets, [SHEET, MOVE_ITEM, MOVE_BEFORE] - reorder = [ - ['Biosource', 'cell_line', 'SOP_cell_line'], - ['Biosource', 'cell_line_tier', 'SOP_cell_line'], - ['GenomicRegion', 'start_coordinate', 'end_coordinate'], - ['GenomicRegion', 'start_location', 'end_location'], - ['GenomicRegion', 'location_description', 'start_location'], - ['BiosampleCellCulture', 'protocol_documents', 'protocol_SOP_deviations'], - ['Biosample', 'biosample_relation.relationship_type', 'biosample_relation.biosample'], - ['Enzyme', 'catalog_number', 'attachment'], - ['Enzyme', 'recognition_sequence', 'attachment'], - ['Enzyme', 'site_length', 'attachment'], - ['Enzyme', 'cut_position', 'attachment'], - ['File', 'related_files.relationship_type', 'related_files.file'], - ['Experiment', 'average_fragment_size', 'fragment_size_range'], - ['Experiment', 'files', 'documents'], - ['Experiment', 'filesets', 'documents'], - ['Experiment', 'experiment_relation.relationship_type', 'documents'], - ['Experiment', 'experiment_relation.experiment', 'documents'], - ['Experiment', 'experiment_sets|0', 'documents'], - ['Experiment', 'experiment_sets|1', 'documents'], - ['Experiment', 'experiment_sets|2', 'documents'], - ['Experiment', 'experiment_sets|3', 'documents'], +def move_to_frond(list_names): + """Move names frond""" + for frond in move_frond: + try: + list_names.remove(frond) + list_names.insert(0, frond) + except: + pass + return list_names + +move_end = ['documents', 'references', 'url', 'dbxrefs'] + + +def move_to_end(list_names): + """Move names to end""" + for end in move_end: + try: + list_names.pop(list_names.index(end)) + list_names.append(end) + except: + pass + return list_names + +# reorder individual items in sheets, [SHEET, MOVE_ITEM, MOVE_BEFORE] +reorder = [ + ['Biosource', 'cell_line', 'SOP_cell_line'], + ['Biosource', 'cell_line_tier', 'SOP_cell_line'], + ['GenomicRegion', 'start_coordinate', 'end_coordinate'], + ['GenomicRegion', 'start_location', 'end_location'], + ['GenomicRegion', 'location_description', 'start_location'], + ['BiosampleCellCulture', 'protocol_documents', 'protocol_SOP_deviations'], + ['Biosample', 'biosample_relation.relationship_type', 'biosample_relation.biosample'], + ['Enzyme', 'catalog_number', 'attachment'], + ['Enzyme', 'recognition_sequence', 'attachment'], + ['Enzyme', 'site_length', 'attachment'], + ['Enzyme', 'cut_position', 'attachment'], + ['File', 'related_files.relationship_type', 'related_files.file'], + ['Experiment', 'average_fragment_size', 'fragment_size_range'], + ['Experiment', 'files', 'documents'], + ['Experiment', 'filesets', 'documents'], + ['Experiment', 'experiment_relation.relationship_type', 'documents'], + ['Experiment', 'experiment_relation.experiment', 'documents'], + ['Experiment', 'experiment_sets|0', 'documents'], + ['Experiment', 'experiment_sets|1', 'documents'], + ['Experiment', 'experiment_sets|2', 'documents'], + ['Experiment', 'experiment_sets|3', 'documents'], +] + +def switch_fields(list_names, sheet): + for sort_case in reorder: + # to look for all experiments with "Experiment" name, it will also get ExperimentSet + # there are no conflicting field names + if sort_case[0] in sheet: + try: + # tihs is working more consistently then the pop item method + list_names.remove(sort_case[1]) + list_names.insert(list_names.index(sort_case[2]), sort_case[1]) + except: + pass + return list_names + +# if object name is in the following list, fetch all current/released items and add to xls +fetch_items = [ + "Protocol", "Enzymes", "Biosource", "Publication", "Vendor" ] + +def order_FDN(input_xls): + """Order and filter created xls file.""" ReadFile = input_xls OutputFile = input_xls[:-4]+'_ordered.xls' bookread = xlrd.open_workbook(ReadFile) @@ -250,39 +251,28 @@ def order_FDN(input_xls): if Sheets_read: print(Sheets_read, "not in sheet_order list, please update") Sheets.extend(Sheets_read) - for sheet in Sheets: useful = [] active_sheet = bookread.sheet_by_name(sheet) first_row_values = active_sheet.row_values(rowx=0) - for field in first_row_values: - if field in do_not_use: - pass - else: - useful.append(field) - useful = sorted(useful) + print('1') + print(first_row_values) + # remove items from fields in xls + useful = filter_and_sort(first_row_values) + print('2') + print(useful) # move selected to front - for frond in move_frond: - try: - useful.remove(frond) - useful.insert(0, frond) - except: - pass + useful = move_to_frond(useful) + print('3') + print(useful) # move selected to end - for end in move_end: - try: - useful.pop(useful.index(end)) - useful.append(end) - except: - pass + useful = move_to_end(useful) + print('4') + print(useful) # reorder some items based on reorder list - for sort_case in reorder: - if sort_case[0] in sheet: - try: - useful.remove(sort_case[1]) - useful.insert(useful.index(sort_case[2]), sort_case[1]) - except: - pass + useful = switch_fields(useful, sheet) + print('5') + print(useful) # create a new sheet and write the data new_sheet = book_w.add_sheet(sheet) for write_row_index, write_item in enumerate(useful): @@ -294,5 +284,10 @@ def order_FDN(input_xls): for i in range(100): for ix in range(len(useful)): new_sheet.write(write_column_index+1+i, ix, '', style) - book_w.save(OutputFile) + ############################################################ + ############################################################ + # use the following order to process the sheets + # if name is not here, will not be processed during ordering + ############################################################ + ############################################################ diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index 8002572c..faa1cc1b 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -174,14 +174,14 @@ def get_uploadable_fields(connection, types, include_description=False, return fields -def create_xls(fields, filename): +def create_xls(all_fields, filename): ''' fields being a dictionary of sheet -> FieldInfo(objects) create one sheet per dictionary item, with three columns of fields for fieldname, description and enum ''' wb = xlwt.Workbook() - for obj_name, fields in fields.items(): + for obj_name, fields in all_fields.items(): ws = wb.add_sheet(obj_name) ws.write(0, 0, "#Field Name:") ws.write(1, 0, "#Field Type:") From b78f0daca806e452e70101f97196b1af95aa6dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Koray=20K=C4=B1rl=C4=B1?= Date: Mon, 31 Oct 2016 18:57:01 -0400 Subject: [PATCH 2/4] + --- wranglertools/fdnDCIC.py | 28 ++++++++++++++-------------- wranglertools/get_field_info.py | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/wranglertools/fdnDCIC.py b/wranglertools/fdnDCIC.py index db5068ed..16b99e44 100644 --- a/wranglertools/fdnDCIC.py +++ b/wranglertools/fdnDCIC.py @@ -227,12 +227,18 @@ def switch_fields(list_names, sheet): return list_names # if object name is in the following list, fetch all current/released items and add to xls -fetch_items = [ - "Protocol", "Enzymes", "Biosource", "Publication", "Vendor" - ] +fetch_items = { + "Protocol": "protocol", "Enzymes": "enzymes", "Biosource": "biosources", + "Publication": "publications", "Vendor": "vendors"} -def order_FDN(input_xls): +def fetch_all_items(sheet, field_list, connection): + if sheet in fetch_items.keys(): + json_list = get_FDN(fetch_items[sheet], connection) + return(json_list) + + +def order_FDN(input_xls, connection): """Order and filter created xls file.""" ReadFile = input_xls OutputFile = input_xls[:-4]+'_ordered.xls' @@ -255,24 +261,18 @@ def order_FDN(input_xls): useful = [] active_sheet = bookread.sheet_by_name(sheet) first_row_values = active_sheet.row_values(rowx=0) - print('1') - print(first_row_values) # remove items from fields in xls useful = filter_and_sort(first_row_values) - print('2') - print(useful) # move selected to front useful = move_to_frond(useful) - print('3') - print(useful) # move selected to end useful = move_to_end(useful) - print('4') - print(useful) # reorder some items based on reorder list useful = switch_fields(useful, sheet) - print('5') - print(useful) + # fetch all items for common objects + all_items = fetch_all_items(sheet, useful, connection) + print(all_items) + # create a new sheet and write the data new_sheet = book_w.add_sheet(sheet) for write_row_index, write_item in enumerate(useful): diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index faa1cc1b..a395aaf5 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -220,7 +220,7 @@ def main(): file_name = args.outfile create_xls(fields, file_name) if args.order: - fdnDCIC.order_FDN(file_name) + fdnDCIC.order_FDN(file_name, connection) if __name__ == '__main__': main() From d38eb0db04ef2c7db32bc01c1e3c3a2586d4c281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Koray=20K=C4=B1rl=C4=B1?= Date: Tue, 1 Nov 2016 14:26:08 -0400 Subject: [PATCH 3/4] FF-320 #comment added the code in the ordering function --- Data_Files/Rao_et_al_2014/fieldsRao.xls | Bin 309248 -> 309248 bytes wranglertools/fdnDCIC.py | 45 ++++++++++++++++++------ wranglertools/get_field_info.py | 2 +- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/Data_Files/Rao_et_al_2014/fieldsRao.xls b/Data_Files/Rao_et_al_2014/fieldsRao.xls index 6179c8c415609774de7e3e2bcab3ba6e6e2694ea..ea5ec5b47d9bde8f5f8e4b5368c514a62fa41e0f 100644 GIT binary patch delta 37 qcmZqpAk^?dXhRMQ+m|1c`cqAsi&)x=SQvqr35c1u7qPJPZ3X}={|-3- delta 37 qcmZqpAk^?dXhRMQ+p(O$sMW^JMJ(+_EQ~ Date: Tue, 1 Nov 2016 14:52:03 -0400 Subject: [PATCH 4/4] FF-320 #comment Added connection for user specific items to be fetched --- wranglertools/fdnDCIC.py | 14 ++++++-------- wranglertools/get_field_info.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/wranglertools/fdnDCIC.py b/wranglertools/fdnDCIC.py index 6abcaaaa..e6fea400 100644 --- a/wranglertools/fdnDCIC.py +++ b/wranglertools/fdnDCIC.py @@ -232,15 +232,13 @@ def switch_fields(list_names, sheet): "Publication": "publication", "Vendor": "vendor"} -def fetch_all_items(sheet, field_list): +def fetch_all_items(sheet, field_list, connection): """For a given sheet, get all released items""" all_items = [] if sheet in fetch_items.keys(): - obj = fetch_items[sheet] - HEADERS = {'accept': 'application/json'} - URL = "http://data.4dnucleome.org/search/?type={}&frame=object&limit=all&format=json".format(obj) - response = requests.get(URL, headers=HEADERS) - items_list = response.json()['@graph'] + obj_id = "search/?type=" + fetch_items[sheet] + get_FDN(obj_id, connection) + items_list = get_FDN(obj_id, connection)['@graph'] for item in items_list: item_info = [] for field in field_list: @@ -254,7 +252,7 @@ def fetch_all_items(sheet, field_list): return -def order_FDN(input_xls): +def order_FDN(input_xls, connection): """Order and filter created xls file.""" ReadFile = input_xls OutputFile = input_xls[:-4]+'_ordered.xls' @@ -286,7 +284,7 @@ def order_FDN(input_xls): # reorder some items based on reorder list useful = switch_fields(useful, sheet) # fetch all items for common objects - all_items = fetch_all_items(sheet, useful) + all_items = fetch_all_items(sheet, useful, connection) # create a new sheet and write the data new_sheet = book_w.add_sheet(sheet) for write_row_index, write_item in enumerate(useful): diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index faa1cc1b..a395aaf5 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -220,7 +220,7 @@ def main(): file_name = args.outfile create_xls(fields, file_name) if args.order: - fdnDCIC.order_FDN(file_name) + fdnDCIC.order_FDN(file_name, connection) if __name__ == '__main__': main()