Skip to content

Commit

Permalink
Merge pull request #25 from hms-dbmi/FF-320_II
Browse files Browse the repository at this point in the history
FF-320
  • Loading branch information
aschroed committed Nov 1, 2016
2 parents e6ddcc3 + d5989d6 commit 1590173
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 128 deletions.
Binary file modified Data_Files/Rao_et_al_2014/fieldsRao.xls
Binary file not shown.
266 changes: 141 additions & 125 deletions wranglertools/fdnDCIC.py
Expand Up @@ -130,109 +130,130 @@ def md5(path):
return md5sum.hexdigest()


############################################################
############################################################
# use the following order to process the sheets
# if name is not here, will not be processed during ordering
############################################################
############################################################
# Sheet names in the order they are processed when the workbook is ordered.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (wrong) sheet name.
sheet_order = [
    "User", "Award", "Lab", "Document", "Protocol", "Publication", "Organism", "IndividualMouse", "IndividualHuman",
    "Vendor", "Enzyme", "Biosource", "Construct", "TreatmentRnai", "TreatmentChemical",
    "GenomicRegion", "Target", "Modification", "Image", "BiosampleCellCulture", "Biosample",
    "FileSet", "FileFastq", "FileFasta", "ExperimentSet", "ExperimentHiC", "ExperimentCaptureC"]

# Field names that are dropped from every sheet by filter_and_sort().
do_not_use = [
    "submitted_by", "date_created", "organism", "schema_version", "accession", "uuid", "status",
    "quality_metric_flags", "notes", "restricted", "file_size", "filename", "alternate_accessions",
    "content_md5sum", "md5sum", "quality_metric", "files_in_set", "experiments", "experiments_in_set"]


def filter_and_sort(list_names):
    """Return the usable field names in alphabetical order.

    Args:
        list_names: iterable of field names (order_FDN passes a sheet's
            first row).

    Returns:
        A new, sorted list containing every name that is not listed in
        ``do_not_use``. The input is left untouched.
    """
    return sorted(field for field in list_names if field not in do_not_use)

# Field names that move_to_frond() pulls to the start of a sheet.
move_frond = ['award', '*award', 'lab', '*lab', 'description',
              'title', '*title', 'name', '*name', 'aliases', '#Field Name:']


def move_to_frond(list_names):
    """Move every name listed in ``move_frond`` to the start of the list.

    Each name that is present is removed and re-inserted at index 0, in the
    order given by ``move_frond``; the resulting front section is therefore
    in the reverse of that order (``'#Field Name:'`` ends up first).

    Args:
        list_names: list of field names; modified in place.

    Returns:
        The same list object, for convenience.
    """
    for frond in move_frond:
        try:
            list_names.remove(frond)
        except ValueError:
            # name is not in this sheet, nothing to move
            continue
        list_names.insert(0, frond)
    return list_names

# Field names that move_to_end() pushes to the end of a sheet.
move_end = ['documents', 'references', 'url', 'dbxrefs']


def move_to_end(list_names):
    """Move every name listed in ``move_end`` to the end of the list.

    Names that are present are removed and appended one after another, so
    they end up last in the same order as they appear in ``move_end``.

    Args:
        list_names: list of field names; modified in place.

    Returns:
        The same list object, for convenience.
    """
    for end in move_end:
        try:
            list_names.remove(end)
        except ValueError:
            # name is not in this sheet, nothing to move
            continue
        list_names.append(end)
    return list_names

# reorder individual items in sheets, [SHEET, MOVE_ITEM, MOVE_BEFORE]
reorder = [
    ['Biosource', 'cell_line', 'SOP_cell_line'],
    ['Biosource', 'cell_line_tier', 'SOP_cell_line'],
    ['GenomicRegion', 'start_coordinate', 'end_coordinate'],
    ['GenomicRegion', 'start_location', 'end_location'],
    ['GenomicRegion', 'location_description', 'start_location'],
    ['BiosampleCellCulture', 'protocol_documents', 'protocol_SOP_deviations'],
    ['Biosample', 'biosample_relation.relationship_type', 'biosample_relation.biosample'],
    ['Enzyme', 'catalog_number', 'attachment'],
    ['Enzyme', 'recognition_sequence', 'attachment'],
    ['Enzyme', 'site_length', 'attachment'],
    ['Enzyme', 'cut_position', 'attachment'],
    ['File', 'related_files.relationship_type', 'related_files.file'],
    ['Experiment', 'average_fragment_size', 'fragment_size_range'],
    ['Experiment', 'files', 'documents'],
    ['Experiment', 'filesets', 'documents'],
    ['Experiment', 'experiment_relation.relationship_type', 'documents'],
    ['Experiment', 'experiment_relation.experiment', 'documents'],
    ['Experiment', 'experiment_sets|0', 'documents'],
    ['Experiment', 'experiment_sets|1', 'documents'],
    ['Experiment', 'experiment_sets|2', 'documents'],
    ['Experiment', 'experiment_sets|3', 'documents'],
]


def switch_fields(list_names, sheet):
    """Apply the ``reorder`` rules that match ``sheet`` to a field list.

    For every rule whose sheet key is a substring of ``sheet``, the rule's
    MOVE_ITEM is taken out of the list and re-inserted directly before its
    MOVE_BEFORE anchor. A rule is skipped when either name is missing, so a
    field is never dropped just because its anchor is absent.

    Args:
        list_names: list of field names; modified in place.
        sheet: name of the sheet the fields belong to.

    Returns:
        The same list object, for convenience.
    """
    for sheet_key, move_item, move_before in reorder:
        # substring match on purpose: the "Experiment" rules also apply to
        # ExperimentSet, ExperimentHiC, ...; there are no conflicting field names
        if sheet_key not in sheet:
            continue
        # check both names before touching the list; removing first and then
        # failing to find the anchor would lose the field
        if move_item not in list_names or move_before not in list_names:
            continue
        list_names.remove(move_item)
        list_names.insert(list_names.index(move_before), move_item)
    return list_names

# if object name is in the following list, fetch all current/released items and add to xls
# keys are sheet names, values are the type used in the search query
fetch_items = {
    "Protocol": "protocol", "Enzyme": "enzyme", "Biosource": "biosource",
    "Publication": "publication", "Vendor": "vendor"}


def fetch_all_items(sheet, field_list, connection):
    """Fetch the existing items for a sheet as rows aligned with its fields.

    Args:
        sheet: sheet name; only names that are keys of ``fetch_items`` are
            looked up.
        field_list: ordered field names of the sheet.
        connection: connection object handed through to ``get_FDN``.

    Returns:
        ``None`` when ``sheet`` is not in ``fetch_items``. Otherwise a list
        with one row per entry of the search result's ``'@graph'``; each row
        holds, per field in ``field_list``, the item's value (``''`` when the
        item lacks the field) and ``"#"`` in the ``"#Field Name:"`` column.
    """
    if sheet not in fetch_items:
        return None
    obj_id = "search/?type=" + fetch_items[sheet]
    # query once; NOTE(review): whether the search is limited to
    # current/released items depends on the server side - confirm
    items_list = get_FDN(obj_id, connection)['@graph']
    all_items = []
    for item in items_list:
        item_info = ["#" if field == "#Field Name:" else item.get(field, '')
                     for field in field_list]
        all_items.append(item_info)
    return all_items

# Field names that are filtered out of every sheet.
do_not_use = [
    "submitted_by", "date_created", "organism", "schema_version", "accession", "uuid", "status",
    "quality_metric_flags", "notes", "restricted", "file_size", "filename", "alternate_accessions",
    "content_md5sum", "md5sum", "quality_metric", "files_in_set", "experiments", "experiments_in_set"]

# Field names that are moved to the front of a sheet.
move_frond = ['award', '*award', 'lab', '*lab', 'description',
              'title', '*title', 'name', '*name', 'aliases', '#Field Name:']

# Field names that are moved to the end of a sheet.
move_end = ['documents', 'references', 'url', 'dbxrefs']

# reorder individual items in sheets, [SHEET, MOVE_ITEM, MOVE_BEFORE]
reorder = [
    # Biosource
    ['Biosource', 'cell_line', 'SOP_cell_line'],
    ['Biosource', 'cell_line_tier', 'SOP_cell_line'],
    # GenomicRegion
    ['GenomicRegion', 'start_coordinate', 'end_coordinate'],
    ['GenomicRegion', 'start_location', 'end_location'],
    ['GenomicRegion', 'location_description', 'start_location'],
    # Biosample sheets
    ['BiosampleCellCulture', 'protocol_documents', 'protocol_SOP_deviations'],
    ['Biosample', 'biosample_relation.relationship_type', 'biosample_relation.biosample'],
    # Enzyme
    ['Enzyme', 'catalog_number', 'attachment'],
    ['Enzyme', 'recognition_sequence', 'attachment'],
    ['Enzyme', 'site_length', 'attachment'],
    ['Enzyme', 'cut_position', 'attachment'],
    # File sheets
    ['File', 'related_files.relationship_type', 'related_files.file'],
    # Experiment sheets
    ['Experiment', 'average_fragment_size', 'fragment_size_range'],
    ['Experiment', 'files', 'documents'],
    ['Experiment', 'filesets', 'documents'],
    ['Experiment', 'experiment_relation.relationship_type', 'documents'],
    ['Experiment', 'experiment_relation.experiment', 'documents'],
    ['Experiment', 'experiment_sets|0', 'documents'],
    ['Experiment', 'experiment_sets|1', 'documents'],
    ['Experiment', 'experiment_sets|2', 'documents'],
    ['Experiment', 'experiment_sets|3', 'documents'],
]

def order_FDN(input_xls, connection):
"""Order and filter created xls file."""
ReadFile = input_xls
OutputFile = input_xls[:-4]+'_ordered.xls'
bookread = xlrd.open_workbook(ReadFile)
Expand All @@ -250,49 +271,44 @@ def order_FDN(input_xls):
if Sheets_read:
print(Sheets_read, "not in sheet_order list, please update")
Sheets.extend(Sheets_read)

for sheet in Sheets:
useful = []
active_sheet = bookread.sheet_by_name(sheet)
first_row_values = active_sheet.row_values(rowx=0)
for field in first_row_values:
if field in do_not_use:
pass
else:
useful.append(field)
useful = sorted(useful)
# remove items from fields in xls
useful = filter_and_sort(first_row_values)
# move selected to front
for frond in move_frond:
try:
useful.remove(frond)
useful.insert(0, frond)
except:
pass
useful = move_to_frond(useful)
# move selected to end
for end in move_end:
try:
useful.pop(useful.index(end))
useful.append(end)
except:
pass
useful = move_to_end(useful)
# reorder some items based on reorder list
for sort_case in reorder:
if sort_case[0] in sheet:
try:
useful.remove(sort_case[1])
useful.insert(useful.index(sort_case[2]), sort_case[1])
except:
pass
useful = switch_fields(useful, sheet)
# fetch all items for common objects
all_items = fetch_all_items(sheet, useful, connection)
# create a new sheet and write the data
new_sheet = book_w.add_sheet(sheet)
for write_row_index, write_item in enumerate(useful):
read_col_ind = first_row_values.index(write_item)
column_val = active_sheet.col_values(read_col_ind)
for write_column_index, cell_value in enumerate(column_val):
new_sheet.write(write_column_index, write_row_index, cell_value, style)
# write common objects
if all_items:
for i, item in enumerate(all_items):
for ix in range(len(useful)):
write_column_index_II = write_column_index+1+i
new_sheet.write(write_column_index_II, ix, item[ix], style)
else:
write_column_index_II = write_column_index
# write 100 empty lines with text formatting
for i in range(100):
for ix in range(len(useful)):
new_sheet.write(write_column_index+1+i, ix, '', style)

write_column_index_III = write_column_index_II+1+i
new_sheet.write(write_column_index_III, ix, '', style)
book_w.save(OutputFile)
############################################################
############################################################
# use the following order to process the sheets
# if name is not here, will not be processed during ordering
############################################################
############################################################
6 changes: 3 additions & 3 deletions wranglertools/get_field_info.py
Expand Up @@ -174,14 +174,14 @@ def get_uploadable_fields(connection, types, include_description=False,
return fields


def create_xls(fields, filename):
def create_xls(all_fields, filename):
'''
fields being a dictionary of sheet -> FieldInfo(objects)
create one sheet per dictionary item, with three columns of fields
for fieldname, description and enum
'''
wb = xlwt.Workbook()
for obj_name, fields in fields.items():
for obj_name, fields in all_fields.items():
ws = wb.add_sheet(obj_name)
ws.write(0, 0, "#Field Name:")
ws.write(1, 0, "#Field Type:")
Expand Down Expand Up @@ -220,7 +220,7 @@ def main():
file_name = args.outfile
create_xls(fields, file_name)
if args.order:
fdnDCIC.order_FDN(file_name)
fdnDCIC.order_FDN(file_name, connection)

if __name__ == '__main__':
main()

0 comments on commit 1590173

Please sign in to comment.