In [None]:
##GEO submission update (in progress)

In [2]:
from dcicutils import ff_utils
from functions.notebook_functions import *
from functions.cleanup import get_workflow_details, delete_wfrs
import time
import pandas as pd
import openpyxl

In [3]:
# Configuration: the blank GEO submission template and the portal access key.

# Path of the blank GEO template workbook
# (template reference: https://www.ncbi.nlm.nih.gov/geo/info/seq.html).
GEO_metadata_template_file = (
    '/Users/rahinavelkar/Library/CloudStorage/OneDrive-HarvardUniversity/harvard/geo_submission_v2/seq_template.xlsx'
)

# Access key used for every portal request below; change the key name / keyfile here.
my_auth = get_key('default', keyfile='~/keypairs.json')

In [4]:
# Select the datasets to upload. All datasets should be from one publication or study.
# Either list the experiment sets explicitly in `sets_list`, or leave it empty and give
# a portal search in `search_url`; `sets_list` takes precedence when both are filled in.

sets_list = []

search_url = '/browse/?type=ExperimentSetReplicate&experimentset_type=replicate&produced_in_pub.display_title=Gholamalamdari+O+et+al.+%282024%29+PMID%3A38712201&sort=experiments_in_set.experiment_type.display_title&sort=dataset_label&sort=condition'

if sets_list:
    esets = [ff_utils.get_metadata(set_id, my_auth) for set_id in sets_list]
elif search_url:
    esets = list(ff_utils.search_metadata(search_url, my_auth))
else:
    # Stop here with a clear message; otherwise `esets` stays undefined and the
    # failure only surfaces as a NameError on the print below or in later cells.
    raise ValueError("Provide either sets_list or search_url to select the datasets")

print("No. of samples {}".format(len(esets)))

No. of samples 12


In [5]:
## Extracting STUDY SECTION

# Defaults, so that later cells still run (leaving blanks to fill in manually) when
# `esets` is empty or the first experiment set has no publication attached.
pub_title = ""
summary = ""
authors = []

# Only the first experiment set is inspected for the publication.
for eset in esets[0:1]:
    pub_details = eset.get("produced_in_pub")
    if pub_details is None:
        print("No publication present, add manually")
    else:
        pub_title = pub_details.get("title")
        summary = pub_details.get("abstract")
        authors = pub_details.get("authors") or []

# One single-item list per author: [["<last token>, <first token>"], ...].
full_name = []

for author in authors:
    # Split on any whitespace so stray or trailing spaces do not create empty tokens.
    name = author.split()
    if not name:
        continue
    # The first token is used as the surname and the last token as the first name;
    # any tokens in between are dropped.
    # NOTE(review): presumably portal authors look like "Surname Initials" -- confirm.
    surname = name[0]
    if len(name) > 1:
        firstname = name[-1]
        author_name = firstname + ", " + surname
    else:
        # Single-token name: there is nothing to pair it with, keep it as is.
        author_name = surname
    full_name.append([author_name])

In [23]:
## Extracting SAMPLES details -- user settings and empty collectors

# --- Settings to edit per submission ---
# File types to report as processed data files / supplementary files.
processed_files = ["normalized counts"]
supplementary_file_type = []
# Extracted molecule; choices: "polyA RNA", "total RNA", "nuclear RNA",
# "cytoplasmic RNA", "genomic DNA", "protein", "other".
molecule = "genomic DNA"

portal_url = "https://data.4dnucleome.org/"

# --- Collectors filled by the extraction loop ---
# Replicate descriptions (long and short form), keyed by experiment accession.
replicate_desc, replicate_desc_short = {}, {}
# md5sums keyed by file display title.
md5sums_raw, md5sums_proc = {}, {}
# Raw file accession -> accession of the file it is paired with.
paired_raw_files = {}
# File names per experiment accession.
all_raw_files, all_proc_files = {}, {}
total_supplementary_files = []

dataset_labels, conditions, organisms = [], [], []

# Free-text sample columns; all start out blank.
tissue_name = cell_line_name = cell_type = treatment = batch = ""

# One list per experiment: the row written into the SAMPLES section.
samples_rows = []

# Walk every experiment set and collect, per experiment, one SAMPLES row plus the
# names and md5sums of its raw ("reads") and selected processed files.

# Every experiment starts from the values chosen in the setup above, so that a tissue
# or cell line detected for one experiment cannot leak into the next experiment's row.
default_tissue_name = tissue_name
default_cell_line_name = cell_line_name
default_cell_type = cell_type

# portal_url ends with "/"; strip it so joining with an "@id" never yields "//".
portal_base = portal_url.rstrip("/")

for eset in esets:
    # Supplementary files: keep the titles of "other processed files" whose type was requested.
    for sup_file_info in eset.get("other_processed_files") or []:
        for sfile in sup_file_info.get("files") or []:
            if sfile.get("file_type") in supplementary_file_type:
                total_supplementary_files.append(sfile.get('display_title'))

    dataset_label = eset.get("dataset_label")
    if dataset_label not in dataset_labels:
        dataset_labels.append(dataset_label)

    # The set's condition is also used as the genotype column. A missing condition
    # becomes "" so the string building below does not fail on None.
    condition = eset.get("condition") or ""
    genotype = condition
    if genotype not in conditions:
        conditions.append(genotype)

    # Replicate numbers, keyed by the accession of the replicate experiment.
    for reps in eset.get("replicate_exps") or []:
        biorep = reps.get("bio_rep_no")
        techrep = reps.get("tec_rep_no")
        rep_info = reps.get("replicate_exp")
        rep_info_acc = rep_info.get("accession")
        replicate_desc[rep_info_acc] = "Biological replicate " + str(biorep) + ", Technical replicate " + str(techrep)
        replicate_desc_short[rep_info_acc] = "B" + str(biorep) + " T" + str(techrep)

    for exp in eset.get('experiments_in_set') or []:
        raw_files_per_exp = []
        proc_files_per_exp = []

        # Per-experiment values: reset so an experiment without reads files or without
        # a biosource does not inherit the previous experiment's values (or hit a
        # NameError on the very first experiment).
        paired = ""
        instrument = ""
        organism_name = ""
        tissue_name = default_tissue_name
        cell_line_name = default_cell_line_name
        cell_type = default_cell_type

        exp_id = exp.get("@id")
        exp_url = portal_base + "/" + exp_id.lstrip("/")
        exp_acc = exp.get("accession")
        library_name = exp.get("display_title")
        exp_details = exp.get("experiment_type")
        library_strategy = exp_details.get('display_title')

        # Raw files: only files of type "reads" are reported.
        for file in exp.get("files") or []:
            if file.get("file_type") != "reads":
                continue
            file_acc = file.get("accession")
            file_name = file.get("display_title")
            raw_files_per_exp.append(file_name)
            # instrument and md5sum are read from the full file item, fetched separately.
            file_metadata = ff_utils.get_metadata(file_acc, my_auth)
            instrument = file_metadata.get('instrument')
            md5sums_raw[file_name] = file_metadata.get('md5sum')
            if file.get("paired_end"):
                paired = "paired-end"
                for rf in file.get("related_files") or []:
                    if rf.get("relationship_type") == "paired with":
                        # NOTE(review): reads "accession" directly from the related_files
                        # entry -- confirm that is where the portal puts it.
                        paired_raw_files[file_acc] = rf.get("accession")
            else:
                paired = "single"

        # Processed files: keep only the requested file types. An experiment with no
        # processed files at all gets a single '' placeholder.
        exp_proc_files = exp.get("processed_files")
        if exp_proc_files:
            for pfile in exp_proc_files:
                file_name = pfile.get("display_title")
                if pfile.get("file_type") in processed_files:
                    proc_files_per_exp.append(file_name)
                    pfile_metadata = ff_utils.get_metadata(pfile.get("accession"), my_auth)
                    md5sums_proc[file_name] = pfile_metadata.get('md5sum')
        else:
            proc_files_per_exp.append('')

        biosample = exp.get("biosample")
        biosample_id = biosample.get("@id")
        biosample_url = portal_base + "/" + biosample_id.lstrip("/")
        biosample_type = biosample.get("biosample_type")
        for biosource in biosample.get("biosource") or []:
            organism = biosource.get("organism")
            organism_name = organism.get("display_title")
            if organism_name not in organisms:
                organisms.append(organism_name)
            biosource_type = biosource.get("biosource_type")
            if biosource_type == "tissue":
                tissue = biosource.get('tissue') or {}
                tissue_name = tissue.get('term_name', tissue_name)
            else:
                # Any non-tissue biosource: its type is reported as the cell type.
                cell_type = biosource_type
                cell_line = biosource.get("cell_line") or {}
                cell_line_name = cell_line.get("term_name", cell_line_name)

        # An experiment missing from replicate_exps gets an empty replicate description.
        rep_short = replicate_desc_short.get(exp_acc, "")
        rep_long = replicate_desc.get(exp_acc, "")
        title = exp_acc + ", " + organism_name + " - " + condition + ", " + rep_short
        description = (
            library_strategy + " in " + organism_name + ", " + condition + ", " + rep_long
            + ", 4DN experiment: " + exp_url + " ,4DN Biosample: " + biosample_url
        )
        # Column order must match columns A-N of the SAMPLES section in the template.
        samples_rows.append([
            exp_acc, title, library_strategy, organism_name, tissue_name, cell_line_name,
            cell_type, genotype, treatment, batch, molecule, paired, instrument, description,
        ])
        all_raw_files[exp_acc] = raw_files_per_exp
        all_proc_files[exp_acc] = proc_files_per_exp

# Add experimental design
# Built outside any loop so the three names always exist, even when no dataset label
# was collected. Blank/missing conditions are left out of the joined text.
genotypes = ', '.join(c for c in conditions if c)
all_organisms = ', '.join(o for o in organisms if o)
# One phrase per dataset label. With a single label this is exactly the original text;
# with several labels all of them are kept (joined by "; ") instead of only the last.
experiment_design = '; '.join(
    '{} in tissues/genotype: {}'.format(label, genotypes) for label in dataset_labels
)
normalized counts
4DNFIMTKDNJW.bw
normalized counts
4DNFINZYRIU6.bw
normalized counts
4DNFIKWPQKXH.bw
normalized counts
4DNFIX693RNR.bw
normalized counts
4DNFIWRT1RIZ.bw
normalized counts
4DNFIKZDFYQ8.bw
normalized counts
4DNFIPHWZ5B4.bw
normalized counts
4DNFIMI5G3HW.bw
normalized counts
4DNFI625PP2A.bw
normalized counts
4DNFIFKMOD1L.bw
normalized counts
4DNFIBY8G6RZ.bw
normalized counts
4DNFI8HIU45G.bw
normalized counts
4DNFIO4EE1OU.bw
normalized counts
4DNFI6FTPH5V.bw
normalized counts
4DNFIVZSO9RI.bw
other


In [26]:
# Fill the STUDY section of the template's 'Metadata' sheet.

workbook = openpyxl.load_workbook(GEO_metadata_template_file)
sheet = workbook['Metadata']

# Title, summary and experimental design go into fixed cells.
for cell_ref, study_value in (("B12", pub_title), ("B13", summary), ("B14", experiment_design)):
    sheet[cell_ref] = study_value

# Contributors. With more than 7 authors, rows labelled 'contributor' are inserted at
# row 22 and start_row (used below as the first SAMPLES row) moves down by the same
# amount; otherwise start_row stays at 39.
start_row = 39
n_authors = len(full_name)
if n_authors > 7:
    extra_rows = n_authors - 6
    sheet.insert_rows(22, amount=extra_rows)
    start_row += extra_rows
    for row_offset in range(extra_rows):
        sheet[f"A{22 + row_offset}"] = 'contributor'

# One author per row starting at row 15, values starting in column 2 (B).
for author_idx, author_cells in enumerate(full_name):
    for cell_idx, cell_value in enumerate(author_cells):
        sheet.cell(row=15 + author_idx, column=2 + cell_idx, value=cell_value)
        
# SAMPLES section: one worksheet row per collected sample row.

# Past 16 samples, extra rows are inserted 16 rows below start_row: the overflow count
# plus one more (presumably the template ships with 16 sample rows -- TODO confirm).
overflow = len(samples_rows) - 16
if overflow > 0:
    add_no_rows = overflow
    sheet.insert_rows(start_row + 16, amount=add_no_rows + 1)

# Write each sample from column 1 (A) onwards, first sample on start_row.
for sample_offset, sample_cells in enumerate(samples_rows):
    target_row = start_row + sample_offset
    for col_idx, cell_value in enumerate(sample_cells, start=1):
        sheet.cell(row=target_row, column=col_idx, value=cell_value)

# SAMPLES section - file names: processed data files first, then raw files, written on
# the same rows as the samples, starting at column 15.

# File lists are looked up by the accession in column 1 of each sample row, so every
# list lands on the row of the sample it belongs to.
proc_files = [all_proc_files.get(sample[0], []) for sample in samples_rows]
raw_files = [all_raw_files.get(sample[0], []) for sample in samples_rows]

no_of_processed_files = [len(names) for names in proc_files]
no_of_raw_files = [len(names) for names in raw_files]

# default=0 keeps this cell running when no sample was collected.
max_raw = max(no_of_raw_files, default=0)
max_proc = max(no_of_processed_files, default=0)

# The header row is only rewritten when more than 2 processed or more than 5 raw
# file columns are needed.
edit_file_headers = max_proc > 2 or max_raw > 5

start_row_files = start_row
start_column_proc = 15

if edit_file_headers:
    print("Updating column headers for files")
    file_headers = (
        ['processed data file'] * max_proc
        + ['*raw file']
        + ['raw file'] * (max_raw - 1)
    )
    # Headers go on the row above the first sample row. start_row itself is left
    # untouched so the file names below (and any later cell using start_row) still
    # line up with the sample rows.
    # NOTE(review): assumes the column-header row sits directly above the first
    # sample row -- confirm against the template.
    file_headers_row = start_row - 1
    for col_num, value in enumerate(file_headers, start=start_column_proc):
        sheet.cell(row=file_headers_row, column=col_num, value=value)
    # Raw files follow immediately after the widened block of processed-file columns.
    start_column_raw = start_column_proc + max_proc
else:
    # Unedited headers: raw files start 2 columns after the first processed-file column.
    start_column_raw = start_column_proc + 2

for row_offset, (proc_names, raw_names) in enumerate(zip(proc_files, raw_files)):
    target_row = start_row_files + row_offset
    for col_num, value in enumerate(proc_names, start=start_column_proc):
        sheet.cell(row=target_row, column=col_num, value=value)
    for col_num, value in enumerate(raw_names, start=start_column_raw):
        sheet.cell(row=target_row, column=col_num, value=value)

# NOTE: the workbook is saved onto the same path it was loaded from, so running this
# cell again starts from an already-filled file; restore a blank template first.
workbook.save(GEO_metadata_template_file)

  warn(msg)


In [None]:
# Paired-end section: so far only its first row is computed; nothing is written yet.

# The row is start_row plus the number of sample rows plus a fixed gap of 23
# (presumably taken from the template layout -- TODO confirm).
n_sample_rows = len(samples_rows)
paired_end_start_row = start_row + n_sample_rows + 23

In [27]:
# Populate MD5 Checksums sheet.

# [file name, md5sum] pairs for the raw and the processed files.
add_md5sum_raw = [[filename, checksum] for filename, checksum in md5sums_raw.items()]
add_md5sum_proc = [[filename, checksum] for filename, checksum in md5sums_proc.items()]

workbook = openpyxl.load_workbook(GEO_metadata_template_file)
sheet = workbook['MD5 Checksums']

# Dedicated name: assigning `start_row` here would overwrite the first-sample row of
# the Metadata sheet, which the paired-end cell reads, and give a wrong result
# whenever that cell is run after this one.
md5_start_row = 9

# Raw files fill columns 1-2, processed files columns 6-7, both from md5_start_row down.
for i, row_data in enumerate(add_md5sum_raw, start=md5_start_row):
    for j, value in enumerate(row_data, start=1):
        sheet.cell(row=i, column=j, value=value)

for i, row_data in enumerate(add_md5sum_proc, start=md5_start_row):
    for j, value in enumerate(row_data, start=6):
        sheet.cell(row=i, column=j, value=value)

workbook.save(GEO_metadata_template_file)