## This is the updated version to automatically update the GEO metadata sheet for GEO submission
### Important notes:
#### 1. Please add information only in the B (USER INPUT) section. 
#### 2. Verify all the information added to the sheet. 
#### 3. To make any corrections or to add any missing data, please run the entire notebook first and then make any necessary changes manually at the end.

## A) Initialization and dictionaries

In [None]:
from dcicutils import ff_utils
from functions.notebook_functions import *
from functions.cleanup import get_workflow_details, delete_wfrs
import time
import pandas as pd
import openpyxl
from rapidfuzz import process

In [None]:
# Lookup tables used when filling in the GEO metadata sheet.

# Allowed values for the extracted molecule (see the `molecule` user input).
extracted_molecule = ["polyA RNA", "total RNA", "nuclear RNA", "cytoplasmic RNA", "genomic DNA", "protein", "other"]

# Instrument model vocabulary; instrument names read from 4DN raw files are
# fuzzy-matched against this list.
instrument_models = ["454 GS", "454 GS 20", "454 GS FLX", "454 GS FLX+", "454 GS FLX Titanium", "454 GS Junior", "AB 5500 Genetic Analyzer", "AB 5500xl Genetic Analyzer", "AB 5500xl-W Genetic Analysis System", "AB SOLiD 3 Plus System", "AB SOLiD 4hq System", "AB SOLiD 4 System", "AB SOLiD PI System", "AB SOLiD System", "AB SOLiD System 2.0", "AB SOLiD System 3.0", "BGISEQ-500", "Complete Genomics", "DNBSEQ-G400", "DNBSEQ-G400 FAST", "DNBSEQ-G50", "DNBSEQ-T7", "Element AVITI", "FASTASeq 300", "GenoCare 1600", "GenoLab M", "GridION", "GS111", "Helicos HeliScope", "HiSeq X Five", "HiSeq X Ten", "Illumina Genome Analyzer", "Illumina Genome Analyzer II", "Illumina Genome Analyzer IIx", "Illumina HiScanSQ", "Illumina HiSeq 1000", "Illumina HiSeq 1500", "Illumina HiSeq 2000", "Illumina HiSeq 2500", "Illumina HiSeq 3000", "Illumina HiSeq 4000", "Illumina iSeq 100", "Illumina MiniSeq", "Illumina MiSeq", "Illumina NextSeq 500", "Illumina NovaSeq 6000", "Illumina NovaSeq X", "Illumina NovaSeq X Plus", "Ion GeneStudio S5", "Ion GeneStudio S5 plus", "Ion GeneStudio S5 prime", "Ion Torrent Genexus", "Ion Torrent PGM", "Ion Torrent Proton", "Ion Torrent S5", "Ion Torrent S5 XL", "MGISEQ-2000RS", "MinION", "NextSeq 1000", "NextSeq 2000", "NextSeq 550", "Onso", "PacBio RS", "PacBio RS II", "PromethION", "Revio", "Sentosa SQ301", "Sequel", "Sequel II", "Sequel IIe", "Tapestri", "UG 100"]

# Maps a 4DN experiment type (display title) to the library strategy written
# to the sheet.
# NOTE(review): 'single cell RNA-seq' maps to 'OTHER' while 'sci-RNA-seq' maps
# to 'scRNA-seq', and 'sci-ATAC-seq' maps to 'OTHER' while
# 'single cell ATAC-seq' maps to 'scATAC-seq' - confirm this is intended.
experiment_type_dic = {'2-stage Repli-seq': 'OTHER',
 '4C-seq': '4C-Seq',
 'ATAC-seq': 'ATAC-seq',
 'BLISS': 'OTHER',
 'Bru-seq': 'BRU-Seq',
 'Capture Hi-C': 'Capture-C',
 'ChIA-Drop': 'ChIA-PET',
 'ChIA-PET': 'ChIA-PET',
 'ChIP-exo': 'OTHER',
 'ChIP-seq': 'ChIP-Seq',
 'CUT&RUN': 'CUT&Run',
 'CUT&Tag': 'CUT&Tag',
 'DamID-seq': 'DamID-Seq',
 'Dilution Hi-C': 'OTHER',
 'DNA FISH': 'OTHER',
 'DNA SPRITE': 'OTHER',
 'DNase Hi-C': 'OTHER',
 'Droplet paired-tag': 'OTHER',
 'Electron Tomography': 'OTHER',
 'GAM': 'OTHER',
 'HiCAR': 'OTHER',
 'HiChIP': 'HiChIP',
 'Immunofluorescence': 'OTHER',
 'in situ ChIA-PET': 'ChIA-PET',
 'in situ Hi-C': 'Hi-C',
 'MARGI': 'OTHER',
 'MC-3C': 'OTHER',
 'MC-Hi-C': 'OTHER',
 'Methyl Hi-C': 'OTHER',
 'Micro-C': 'OTHER',
 'Multi-stage Repli-seq': 'OTHER',
 'multiplexed FISH': 'OTHER',
 'NAD-seq': 'OTHER',
 'OptoDroplet': 'OTHER',
 'pA-DamID': 'OTHER',
 'PLAC-seq': 'OTHER',
 'RE-seq': 'OTHER',
 'RNA FISH': 'OTHER',
 'RNA-DNA SPRITE': 'OTHER',
 'RNA-seq': 'RNA-Seq',
 'sci-ATAC-seq': 'OTHER',
 'sci-Hi-C': 'OTHER',
 'sci-RNA-seq': 'scRNA-seq',
 'single cell ATAC-seq': 'scATAC-seq',
 'single cell Hi-C': 'OTHER',
 'single cell Methyl Hi-C': 'OTHER',
 'single cell RNA-seq': 'OTHER',
 'SLAM-seq': 'OTHER',
 'sn-Hi-C': 'OTHER',
 'SPT': 'OTHER',
 'TCC': 'OTHER',
 'TrAC-loop': 'OTHER',
 'TRIP': 'OTHER',
 'TSA-seq': 'OTHER',
 'WGBS': 'OTHER'}


# Maps the abbreviated 4DN organism display title to the full binomial name.
organism_name_dic = {'C. jacchus': 'Callithrix jacchus',
 'M. mulatta': 'Macaca mulatta',
 'M. domestica': 'Monodelphis domestica',
 'S. pyogenes': 'Streptococcus pyogenes',
 'M. auratus': 'Mesocricetus auratus',
 'D. rerio': 'Danio rerio',
 'C. sabaeus': 'Chlorocebus sabaeus',
 'G. gallus': 'Gallus gallus',
 'C. elegans': 'Caenorhabditis elegans',
 'D. melanogaster': 'Drosophila melanogaster',
 'R. norvegicus': 'Rattus norvegicus',
 'M. musculus': 'Mus musculus',
 'H. sapiens': 'Homo sapiens'}

### B) USER INPUT: Please add necessary text and conditions

In [None]:
# Add the access key and the path to a blank GEO submission template.
# NOTE: the later "populate" cells load the workbook from this path and save
# the filled-in sheets back to the same path.

my_auth = get_key('', keyfile='')  # fill in both arguments to select the access key
GEO_metadata_template_file = '' # path to the template file; download it from https://www.ncbi.nlm.nih.gov/geo/info/seq.htm

In [None]:
# Add file type(s) for processed files associated with experiments (not experiment sets).
processed_files= [] # file types should be lower case, exactly as in the file_type value of the JSON metadata, e.g. "gene expression"

# Add file type(s) for processed files associated with experiment sets, i.e. linked to multiple experiments
# (leave empty if you don't want to include them). These will be added as supplementary files and will be
# associated with the entire GEO series.
add_expset_proc_as_supp_names = []

# Add file type(s) for supplementary files tagged as "other processed files" in 4DN
# (leave empty if you don't want to include them). These will be added as supplementary files and
# linked to the entire GEO series.
supplementary_file_type = []

molecule = ""  # add the extracted molecule; choices: ["polyA RNA", "total RNA", "nuclear RNA", "cytoplasmic RNA", "genomic DNA", "protein", "other"]



In [None]:
# Add datasets to be uploaded. All datasets should be from one publication or
# study; filter for one organism at a time.

sets_list = []  # 4DN experiment set IDs (4DNESxxx)

# or

search_url = ''  # complete URL from the browse page on 4DN

# sets_list takes precedence over search_url when both are given.
if sets_list:
    esets = [ff_utils.get_metadata(i, my_auth) for i in sets_list]
elif search_url:
    esets = list(ff_utils.search_metadata(search_url, my_auth))
else:
    # Without this, `esets` would be undefined and every later cell would
    # fail with an unhelpful NameError.
    raise ValueError("No datasets selected: fill in either sets_list or search_url")

# Each collected item is an experiment set; the individual experiments
# (GEO samples) are counted later, when the SAMPLES section is extracted.
print("No. of experiment sets collected: {}".format(len(esets)))

## C) Extracting data (Don't edit below)

In [None]:
## Extracting STUDY SECTION
# Collects the publication title, abstract and author list from the first
# experiment set. Anything that cannot be collected is recorded in
# `user_input` so that it is reported for manual entry later.
user_input = []

# Defaults, so that the later cells can always reference these names even
# when the first experiment set has no publication attached.
has_pub = False
pub_title = ""
summary = ""
full_name = []


def _format_author(author):
    """Return '<first name/initials>, <surname>' for one author string.

    Assumes the 4DN author strings are '<surname> <initials>' (this is what
    the original indexing implied) - TODO confirm. The last token is taken as
    the first name/initials and everything before it as the surname, so
    multi-word surnames are kept whole. A single-token name is returned
    unchanged.
    """
    parts = author.split()
    if len(parts) < 2:
        return author.strip()
    *surname_parts, firstname = parts
    return firstname + ", " + " ".join(surname_parts)


# Only the first experiment set is inspected: all sets are expected to come
# from the same publication.
pub_details = esets[0].get("produced_in_pub") if esets else None

if not pub_details:
    print("No publication present, add manually")
    user_input.append("title")
    user_input.append("summary")
    user_input.append("author list")
else:
    has_pub = True
    pub_title = pub_details.get("title")
    summary = pub_details.get("abstract")
    authors = pub_details.get("authors") or []
    # One single-item row per author; each row is later written to its own
    # sheet row.
    full_name = [[_format_author(author)] for author in authors]
    if not full_name:
        user_input.append("author list")
    print("publication details collected")

if not has_pub:
    print("No publication details in 4DN, add information manually in the sheet at the end")
  

In [None]:
##Extracting SAMPLES details
# Walks every experiment set and experiment, collecting:
#   * one row per experiment for the SAMPLES section (samples_rows),
#   * raw / processed file names per experiment and their md5sums,
#   * paired-end file pairs,
#   * series-level supplementary files,
#   * the experimental design and processed-file format summaries.

portal_url = "https://data.4dnucleome.org"
replicate_desc = {}          # experiment accession -> long replicate description
replicate_desc_short = {}    # experiment accession -> short replicate label, e.g. "B1 T1"
md5sums_raw = {}             # raw file name -> md5sum
md5sums_proc = {}            # processed file name -> md5sum
paired_raw_files = {}        # read-1 file name -> read-2 file name
all_raw_files = {}           # experiment accession -> raw file names
all_proc_files = {}          # experiment accession -> processed file names
total_supplementary_files = []
proc_file_description = {}   # supplementary file type -> file name part after the first dot

dataset_labels = []
conditions = []
organisms = []
all_tissues = []
raw_file_warning = []

tissue_name = ""
cell_line_name = ""
cell_type = ""
treatment = ""
batch = ""
paired = ""
instrument = ""
bname = ""
organism_name = ""
library_strategy = ""
# Default so that later cells can reference the name even if no organism
# provides a genome assembly.
organism_genome_assembly = ""

threshold = 80 #To match instrument model name to GEO's model name list.
all_matches = ()
matched = {}
best_matched = ()
best_matched_bt = []         # every (original, best match, score) below the threshold

organism_metadata_cache = {} # organism uuid -> metadata, avoids repeated portal requests

samples_rows = []

for eset in esets:
    # Series-level supplementary files tagged as "other processed files".
    for sup_file_info in eset.get("other_processed_files") or []:
        for sfile in sup_file_info.get("files") or []:
            s_file_type = sfile.get("file_type")
            if s_file_type in supplementary_file_type:
                sfile_name = sfile.get('display_title')
                # Everything after the first dot describes the format;
                # empty when the name contains no dot.
                proc_file_description[s_file_type] = sfile_name.partition(".")[2]
                total_supplementary_files.append(sfile_name)

    # Processed files attached to the experiment set itself.
    if add_expset_proc_as_supp_names:
        for eset_f in eset.get('processed_files') or []:
            if eset_f.get('file_type') in add_expset_proc_as_supp_names:
                total_supplementary_files.append(eset_f.get('display_title'))

    dataset_label = eset.get("dataset_label")
    if dataset_label not in dataset_labels:
        dataset_labels.append(dataset_label)

    # A missing condition becomes an empty string so that the string
    # concatenations below do not fail on None.
    condition = eset.get("condition") or ""
    if not condition and "condition/genotype" not in user_input:
        user_input.append("condition/genotype")
    genotype = condition
    if genotype not in conditions:
        conditions.append(genotype)

    for reps in eset.get("replicate_exps") or []:
        biorep = reps.get("bio_rep_no")
        techrep = reps.get("tec_rep_no")
        rep_info_acc = reps.get("replicate_exp").get("accession")
        replicate_desc[rep_info_acc] = "Biological replicate " + str(biorep) + ", Technical replicate " + str(techrep)
        replicate_desc_short[rep_info_acc] = "B" + str(biorep) + " T" + str(techrep)

    for exp in eset.get('experiments_in_set') or []:
        raw_files_per_exp = []
        proc_files_per_exp = []
        # Reset the per-experiment fields so that values from the previous
        # experiment never leak into this row.
        tissue_name = ""
        cell_line_name = ""
        cell_type = ""
        paired = ""
        instrument = ""
        bname = ""
        organism_name = ""

        exp_url = portal_url + exp.get("@id")
        exp_acc = exp.get("accession")
        experiment_assay = exp.get("experiment_type").get('display_title')
        library_strategy = experiment_type_dic.get(experiment_assay)
        if library_strategy is None:
            print("Warning: experiment type '{}' is not in experiment_type_dic, using 'OTHER'".format(experiment_assay))
            library_strategy = 'OTHER'

        # Raw (reads) files
        for file in exp.get("files") or []:
            if file.get("file_type") != "reads":
                continue
            file_acc = file.get("accession")
            file_name = file.get("display_title")
            raw_files_per_exp.append(file_name)
            file_metadata = ff_utils.get_metadata(file_acc, my_auth)

            first_line = file_metadata.get('file_first_line')
            if first_line and "@SRR" in first_line:
                raw_file_warning.append(file_acc)
                print('Raw file already deposited in SRA')

            instrument_name = file_metadata.get('instrument')
            if instrument_name:
                best_match, score, index = process.extractOne(instrument_name, instrument_models)
                all_matches = (instrument_name, best_match, score)
                if score >= threshold:
                    best_matched = (instrument_name, best_match, score)
                    instrument = best_match
                else:
                    # Below the threshold: keep the original name and
                    # remember the match so it can be reported.
                    below_threshold = (instrument_name, best_match, score)
                    if below_threshold not in best_matched_bt:
                        best_matched_bt.append(below_threshold)
                    instrument = instrument_name
                matched[instrument_name] = instrument
            elif "instrument model" not in user_input:
                user_input.append("instrument model")

            md5sums_raw[file_name] = file_metadata.get('md5sum')

            if file.get("paired_end"):
                paired = "paired-end"
                for rf in file.get("related_files") or []:
                    if rf.get("relationship_type") == "paired with":
                        rff = rf.get('file')
                        # Only record the pair from the read-1 side.
                        if rff.get("paired_end") == "2":
                            paired_raw_files[file_name] = rff.get('display_title')
            else:
                paired = "single"

        # Processed files of the requested types
        proc_files = exp.get("processed_files")
        if proc_files:
            for pfile in proc_files:
                file_acc = pfile.get("accession")
                file_name = pfile.get("display_title")
                if pfile.get("file_type") in processed_files:
                    proc_files_per_exp.append(file_name)
                    pfile_metadata = ff_utils.get_metadata(file_acc, my_auth)
                    md5sums_proc[file_name] = pfile_metadata.get('md5sum')
        else:
            # Placeholder kept from the original behaviour: an experiment
            # without processed files gets one empty entry.
            proc_files_per_exp.append('')

        # Biosample / biosource details
        biosample = exp.get("biosample")
        biosample_url = portal_url + biosample.get("@id")
        for biosource in biosample.get("biosource") or []:
            organism = biosource.get("organism")
            organism_name_dt = organism.get("display_title")
            organism_name = organism_name_dic.get(organism_name_dt)
            if organism_name is None:
                print("Warning: organism '{}' is not in organism_name_dic, using the 4DN name as is".format(organism_name_dt))
                organism_name = organism_name_dt

            organism_uuid = organism.get("uuid")
            if organism_uuid not in organism_metadata_cache:
                organism_metadata_cache[organism_uuid] = ff_utils.get_metadata(organism_uuid, my_auth)
            assembly = organism_metadata_cache[organism_uuid].get("genome_assembly")
            if assembly:
                organism_genome_assembly = assembly
            elif "organism_genome_assembly" not in user_input:
                user_input.append("organism_genome_assembly")

            if organism_name not in organisms:
                organisms.append(organism_name)

            biosource_type = biosource.get("biosource_type")
            if biosource_type == "tissue":
                tissue_name = (biosource.get('tissue') or {}).get('term_name') or ""
                all_tissues.append(tissue_name)
                bname = tissue_name
            else:
                cell_type = biosource_type
                cell_line_name = (biosource.get("cell_line") or {}).get("term_name") or ""
                bname = cell_line_name

        proc_files_desc_exp = ['{} ({})'.format(ftype, ext) for ftype, ext in proc_file_description.items()]
        processed_data_files_format_content_per_exp = ', '.join(proc_files_desc_exp)

        title = exp_acc + ", " + organism_name + " - " + bname + ", " + condition + ", " + replicate_desc_short.get(exp_acc, "")
        description = experiment_assay + " in " + bname + " (" + organism_name + ")" + ", " + condition + ", " + replicate_desc.get(exp_acc, "") + ", 4DN experiment: " + exp_url + ", 4DN Biosample: " + biosample_url
        if processed_data_files_format_content_per_exp:
            description += ', Results include ' + processed_data_files_format_content_per_exp

        samples_rows.append([exp_acc, title, library_strategy, organism_name, tissue_name, cell_line_name, cell_type, genotype, treatment, batch, molecule, paired, instrument, description])
        all_raw_files[exp_acc] = raw_files_per_exp
        all_proc_files[exp_acc] = proc_files_per_exp

#Add experimental design
# Uses the library strategy, organism and biosource of the last experiment.
genotypes = ', '.join(conditions)
if samples_rows:
    experiment_design = 'Total {} samples generated using {} in {} on {} with {}.'.format(len(samples_rows), library_strategy, organism_name, bname, genotypes)
else:
    experiment_design = ""

#Add processed data files format and content
proc_descriptions = [ext + ":" + ftype for ftype, ext in proc_file_description.items()]
processed_data_files_format_content = ', '.join(proc_descriptions)

#Warnings
# Every experiment has an entry in all_proc_files (possibly only the empty
# placeholder), so look for at least one real file name.
if not any(name for names in all_proc_files.values() for name in names):
    print("Warning: No processed files for any samples - cannot submit to GEO")
if len(organisms) > 1:
    print("Warning: Samples for more than one organisms added, please filter the search query for one organism.")
if len(raw_file_warning) > 0:
    print("Warning: Some or all raw files already deposited in SRA. See list here: {}".format(raw_file_warning))
if len(best_matched_bt) > 0:
    print("Warning: {} match found below set threshold, check manually".format(best_matched_bt))

print("Total samples (4DN experiments) collected: {}".format(len(samples_rows)))


## D) Verify important information collected

In [None]:
# Report everything that could not be collected automatically. The same
# field can be recorded more than once during extraction, so list each
# field only once (first-seen order is kept).
missing_fields = list(dict.fromkeys(user_input))
missing = ', '.join(missing_fields)
if missing_fields:
    print("Add the following manually: {}".format(missing))

# The assembly is only shown when extraction did not flag it as missing.
if 'organism_genome_assembly' not in user_input:
    print("Genome assembly collected {}, if that is incorrect please correct it manually in the sheet".format(organism_genome_assembly))

In [None]:
# Check the collected instrument models.
# GEO has a controlled vocabulary for instrument names, and the name has to
# match it for the submission to be successful.
# This script uses a fuzzy-matching package to pick the best match, but please
# verify each original/matched pair below and edit the metadata sheet directly
# where needed.

for name_pair in matched.items():
    # name_pair is (name read from 4DN, name that will be written to the sheet)
    print("original = {}, matched = {}".format(*name_pair))

print("If not matched correctly see the instrument_models list to select manually and update in the sheet directly")

## E) Populates metadata with extracted data

In [None]:
#Populates metadata sheet i.e. STUDY, SAMPLES and PROTOCOLS sections.
# The workbook is loaded from, and saved back to, GEO_metadata_template_file.

workbook = openpyxl.load_workbook(GEO_metadata_template_file)
sheet = workbook['Metadata']

#STUDY section

# Row where the first sample is written; shifted down below when extra
# contributor rows are inserted.
start_row = 39

supp_files = ' '.join(total_supplementary_files)

# Label (column A) -> value to write in column B of the same row.
study_values = {
    "*experimental design": experiment_design,
    "*processed data files format and content": processed_data_files_format_content,
}
# Publication details and the genome assembly are only written when they
# were actually collected; otherwise the template cell is left for manual
# entry.
if has_pub:
    study_values["*title"] = pub_title
    study_values["*summary (abstract)"] = summary
if 'organism_genome_assembly' not in user_input:
    study_values["*genome build/assembly"] = organism_genome_assembly
if total_supplementary_files:
    study_values["supplementary file"] = supp_files

# Only column A is searched for labels: the value goes to column B, so a
# matching cell in any other column (for example a column header with the
# same text) must not trigger a write that would overwrite it.
for (label_cell,) in sheet.iter_rows(min_col=1, max_col=1):
    if label_cell.value in study_values:
        sheet[f"B{label_cell.row}"] = study_values[label_cell.value]

#author names
author_rows = full_name if has_pub else []
if len(author_rows) > 7:
    extra_rows = len(author_rows) - 6
    sheet.insert_rows(22, amount=extra_rows)
    start_row = 39 + extra_rows

    for row in range(22, 22 + extra_rows):
        sheet[f"A{row}"] = 'contributor'

for i, row_data in enumerate(author_rows, start=15):
    for j, value in enumerate(row_data, start=2):
        sheet.cell(row=i, column=j, value=value)


#SAMPLES section

if len(samples_rows) > 16:
    add_no_rows = len(samples_rows) - 16
    sheet.insert_rows(start_row + 16, add_no_rows + 1)

for i, row_data in enumerate(samples_rows, start=start_row):
    for j, value in enumerate(row_data, start=1):
        sheet.cell(row=i, column=j, value=value)

# SAMPLES section - files metadata
# File lists are looked up by the accession in each sample row so that they
# always land on the same sheet row as their sample.
proc_files = [all_proc_files[row_data[0]] for row_data in samples_rows]
raw_files = [all_raw_files[row_data[0]] for row_data in samples_rows]

no_of_processed_files = [len(names) for names in proc_files]
no_of_raw_files = [len(names) for names in raw_files]

max_raw = max(no_of_raw_files, default=0)
max_proc = max(no_of_processed_files, default=0)

# File columns are rewritten only when a sample has more than 2 processed
# files or more than 5 raw files.
edit_file_headers = max_proc > 2 or max_raw > 5

start_row_files = start_row
start_column_proc = 15

if edit_file_headers:
    print("Updating column headers for files")
    file_headers = ['processed data file'] * max_proc + ['*raw file'] + ['raw file'] * (max_raw - 1)
    # The header row sits directly above the first sample row.
    for col_num, value in enumerate(file_headers, start=start_column_proc):
        sheet.cell(row=start_row - 1, column=col_num, value=value)
    start_column_raw = start_column_proc + max_proc
else:
    # Unchanged headers: raw files start two columns after the first
    # processed-file column.
    start_column_raw = start_column_proc + 2

for i, row_data in enumerate(proc_files, start=start_row_files):
    for j, value in enumerate(row_data, start=start_column_proc):
        sheet.cell(row=i, column=j, value=value)

for i, row_data in enumerate(raw_files, start=start_row_files):
    for j, value in enumerate(row_data, start=start_column_raw):
        sheet.cell(row=i, column=j, value=value)


#Populate Paired-end experiment
if len(paired_raw_files) > 0:
    # Searched after all row insertions so the row number is current; the
    # last matching cell wins.
    paired_files_start_row = None
    for row in sheet.iter_rows():
        for cell in row:
            if cell.value == "file name 1":
                paired_files_start_row = cell.row

    if paired_files_start_row is None:
        print("Warning: 'file name 1' header not found in the sheet, add paired-end file names manually")
    else:
        for i, (read1_name, read2_name) in enumerate(paired_raw_files.items(), start=paired_files_start_row + 1):
            sheet.cell(row=i, column=1, value=read1_name)
            sheet.cell(row=i, column=2, value=read2_name)

workbook.save(GEO_metadata_template_file)

print("Metadata sheet updated")

In [None]:
# Populates MD5 Checksums sheet.
# Raw-file checksums are written starting at column 1 and processed-file
# checksums starting at column 6, both from row 9 downwards, as
# [file name, md5sum] pairs.

add_md5sum_raw = [[raw_name, raw_sum] for raw_name, raw_sum in md5sums_raw.items()]
add_md5sum_proc = [[proc_name, proc_sum] for proc_name, proc_sum in md5sums_proc.items()]

workbook = openpyxl.load_workbook(GEO_metadata_template_file)
sheet = workbook['MD5 Checksums']

start_row = 9
for first_column, checksum_rows in ((1, add_md5sum_raw), (6, add_md5sum_proc)):
    for row_offset, row_data in enumerate(checksum_rows):
        for col_offset, value in enumerate(row_data):
            sheet.cell(row=start_row + row_offset, column=first_column + col_offset, value=value)

workbook.save(GEO_metadata_template_file)

print("MD5sum sheet updated")

print('Please verify information added in the sheets!')