In [41]:
import openpyxl
from openpyxl import Workbook
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.styles import PatternFill, Font, Alignment
from openpyxl.workbook.defined_name import DefinedName

In [42]:
import os
import requests
import zipfile
import io

# Visuals in Excel: Set up styling parameters

In [43]:

# Fill colours (hex RGB) used for the Category cells in Column A of the
# MetaDataTemplate sheet. Consecutive categories alternate between the two
# shades so neighbouring groups are easy to tell apart.
LIGHT_RED = "FFC7CE"
LIGHT_GREEN = "C6EFCE"

color_map = {
    "Study Information": LIGHT_RED,
    "Sample Information": LIGHT_GREEN,
    "Experimental Conditions": LIGHT_RED,
    "Sequencing Details": LIGHT_GREEN,
    "Bioinformatics Analysis": LIGHT_RED,
    "Ethical & Legal": LIGHT_GREEN,
    "Contact Information": LIGHT_RED,
}

# Header row styling shared by both sheets: solid orange fill, bold text.
header_fill = PatternFill(fill_type="solid", start_color="FFC000", end_color="FFC000")
header_font = Font(bold=True)

# Wrap long text inside cells instead of letting it overflow.
default_alignment = Alignment(wrap_text=True)

# Light grey font originally intended for the Example_Value column.
# NOTE(review): not applied in any of the notebook cells shown here.
light_grey_font = Font(color="808080")

# Every column on both sheets gets this same width.
col_width = 25

# 1. Create the workbook and the MetaDataTemplate sheet

In [44]:
# A new workbook comes with one active sheet; reuse it as "MetaDataTemplate".
wb = Workbook()
metadata_ws = wb.active
metadata_ws.title = "MetaDataTemplate"

# Column headings, in the order every metadata row follows.
metadata_header = [
    "Category", "Key", "Value", "Example_Value", "Description",
    "Controlled_Vocabulary/Value-Restrictions", "Resource_Link(s)",
]
metadata_ws.append(metadata_header)

# Style the header row (row 1): orange fill, bold font, wrapped text.
for header_cell in metadata_ws[1]:
    header_cell.fill = header_fill
    header_cell.font = header_font
    header_cell.alignment = default_alignment

# Define metadata rows.
# Each entry has seven items, in the same order as `metadata_header`:
#   [Category, Key, Value, Example_Value, Description,
#    Controlled_Vocabulary/Value-Restrictions, Resource_Link(s)]
# The third item (Value) is left as an empty string in every row; it is the
# cell the template user fills in. "N/A" marks rows without a resource link.
metadata_rows = [
    ["Study Information", "Title", "", "Analysis of Intestinal Crypt, Villus, and Polyp Cells", "A concise title describing the overall study", "Free text; consider limiting length (e.g., ≤250 characters)", "N/A"],
    ["Study Information", "Summary/Abstract", "", "This study investigates transcriptomic changes under infection conditions.", "Detailed study summary and objectives", "Free text; follow standard writing guidelines", "N/A"],
    ["Study Information", "Experimental_Design", "", "RNA-seq of single cells comparing infected vs. control conditions.", "Overview of study design including factors, controls, and replicates", "Free text; consider using standardized descriptors if available", "https://www.ebi.ac.uk/arrayexpress/help/"],
    ["Sample Information", "Sample_Identifier", "", "RNA-Seq_Sample_001", "Unique identifier for each biological sample", "Must be unique; use an alphanumeric/underscore format (e.g., regex: ^[A-Za-z0-9_]+$)", "N/A"],
    ["Sample Information", "Organism", "", "Pseudomonas aeruginosa", "Species name from which the sample is derived", "Must conform to NCBI Taxonomy names", "https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html"],
    ["Sample Information", "Tissue", "", "Pancreas", "Specific tissue", "Use controlled terms from ontologies such as Uberon or Cell Ontology", "http://www.ontobee.org/"],
    ["Sample Information", "Cell_Type", "", "HeLa cells", "Specific cell line, or cell type of origin", "Use controlled terms from ontologies such as Uberon or Cell Ontology", "http://www.ontobee.org/"],
    ["Sample Information", "Strain_Identifier", "", "JCM 14847", "Strain information (if applicable)", "Use recognized strain IDs (e.g., from ATCC, DSMZ)", "https://www.atcc.org/"],
    ["Sample Information", "Collection_Date", "", "2025-03-24", "Date on which the sample was collected", "Must follow ISO 8601 format (YYYY-MM-DD)", "https://www.iso.org/iso-8601-date-and-time-format.html"],
    ["Experimental Conditions", "Treatment_Conditions", "", "Anti-Grem1 antibody treatment", "Description of the experimental treatment(s) or conditions applied", "If available, select from a controlled list (e.g., EFO terms)", "https://www.ebi.ac.uk/efo/"],
    ["Experimental Conditions", "Treatment_Duration", "", "10 weeks", "Duration for which the treatment was applied", "Numeric value with a unit; standardize unit selection (e.g., hours, days, weeks)", "N/A"],
    ["Experimental Conditions", "Concentration", "", "30 mg/kg", "Concentration or dosage of the treatment agent", "Numeric value with unit; ensure consistency across samples", "N/A"],
    ["Sequencing Details", "Library_Preparation_Method", "", "TruSeq Stranded mRNA Library Prep Kit", "Method or kit used for nucleic acid library preparation", "Use a predefined list from vendor documentation or internal standards", "https://www.illumina.com/"],
    ["Sequencing Details", "Sequencing_Platform", "", "Illumina NovaSeq", "Instrument or platform used for sequencing", "Must be selected from a controlled list (e.g., Illumina, PacBio, Oxford Nanopore)", "https://www.ncbi.nlm.nih.gov/sra/docs/submitdesign/"],
    ["Sequencing Details", "Sequencing_Depth", "", "50 million reads", "Number of reads obtained or overall depth of sequencing", "Free numeric text; standard units should be used", "N/A"],
    ["Sequencing Details", "Single_or_Paired-End", "", "Paired-end", "Specifies whether the sequencing was single-end or paired-end", "Allowed values: Single-end, Paired-end", "N/A"],
    ["Sequencing Details", "Instrument_Model", "", "Illumina HiSeq 2000", "Specific model of the sequencing instrument", "Use a controlled list (as provided by the manufacturer)", "https://www.illumina.com/systems/sequencing-platforms.html"],
    ["Sequencing Details", "Quality_Control_Metrics", "", "RIN > 8, Q30 > 85%", "Key quality metrics such as RNA integrity number and quality scores", "Free numeric values; standardize units and thresholds", "N/A"],
    ["Bioinformatics Analysis", "Read_Alignment_Algorithm", "", "HISAT2", "Software used for aligning reads to the reference genome", "Choose from a controlled list (e.g., HISAT2, STAR, Bowtie2)", "https://github.com/ewels/MultiQC"],
    ["Bioinformatics Analysis", "Genome_Reference", "", "Pseudomonas aeruginosa PAO1", "Reference genome or assembly used for analysis", "Must match identifiers from recognized genome databases", "https://www.ncbi.nlm.nih.gov/assembly/"],
    ["Bioinformatics Analysis", "Differential_Expression_Analysis", "", "DESeq2 with FDR < 0.05", "Software and statistical thresholds used for differential expression analysis", "Choose from accepted tools (e.g., DESeq2, edgeR, limma)", "https://bioconductor.org/packages/release/bioc/html/DESeq2.html"],
    ["Bioinformatics Analysis", "Gene_Annotation", "", "Ensembl, KEGG", "Databases or methods used for gene functional annotation", "Use controlled database names; if possible, select from a predefined list", "https://www.ensembl.org/"],
    ["Ethical & Legal", "Ethical_Approval", "", "IRB #12345", "Details regarding ethics committee or IRB approval", "Free text; may follow a predefined institutional format", "N/A"],
    ["Ethical & Legal", "Data_Sharing_Policy", "", "Data available under CC-BY 4.0", "Conditions or restrictions related to data sharing", "Must be selected from a controlled list of licensing options", "https://creativecommons.org/licenses/by/4.0/"],
    ["Contact Information", "Contributor/Contact_Name", "", "Jane Doe", "Primary contact or contributor for the study", "Free text; consider enforcing a 'Last Name, First Name' format", "N/A"],
    ["Contact Information", "Contact_Email", "", "jane.doe@example.com", "Email address of the study contact", "Must follow standard email format", "https://html.spec.whatwg.org/multipage/input.html#valid-e-mail-address"],
    ["Contact Information", "Contact_Affiliation", "", "Example University", "Affiliation or institution of the study contact", "Free text; optionally choose from a controlled list if available", "https://grid.ac/"]
]

# One solid fill per category, built once and reused for every matching row.
category_fills = {
    category: PatternFill(start_color=hex_code, end_color=hex_code, fill_type="solid")
    for category, hex_code in color_map.items()
}

# Write each metadata row, then format the row that was just appended.
for row_values in metadata_rows:
    metadata_ws.append(row_values)
    appended_cells = metadata_ws[metadata_ws.max_row]

    # Only the Category cell (Column A) is coloured, and only for known categories.
    category_cell = appended_cells[0]
    category_fill = category_fills.get(category_cell.value)
    if category_fill is not None:
        category_cell.fill = category_fill

    # Wrap text in every cell of the row.
    for row_cell in appended_cells:
        row_cell.alignment = default_alignment

# Give every populated column of the MetaDataTemplate sheet the same width.
for column_cells in metadata_ws.columns:
    metadata_ws.column_dimensions[column_cells[0].column_letter].width = col_width




# 2. Create the Database sheet with controlled vocabulary values

In [45]:

# Second sheet: one column per metadata field; controlled-vocabulary values
# are later written beneath the matching header.
db_ws = wb.create_sheet(title="Database")

db_header = [
    "Title", "Summary/Abstract", "Experimental Design",
    "Sample Identifier", "Organism", "Tissue", "Cell_Type",
    "Strain Identifier", "Collection Date",
    "Treatment Conditions", "Treatment Duration", "Concentration",
    "Library Preparation Method", "Sequencing Platform", "Sequencing Depth",
    "Single or Paired-End", "Instrument Model", "Quality Control Metrics",
    "Read Alignment Algorithm", "Genome Reference",
    "Differential Expression Analysis", "Gene Annotation",
    "Ethical Approval", "Data Sharing Policy",
    "Contributor/Contact Name", "Contact Email", "Contact Affiliation",
]
db_ws.append(db_header)

# Style each header cell and set the width of its column in a single pass.
for header_cell in db_ws[1]:
    header_cell.fill = header_fill
    header_cell.font = header_font
    header_cell.alignment = default_alignment
    db_ws.column_dimensions[header_cell.column_letter].width = col_width





## 2.1 Download data and fill the Database sheet with values

### 2.1.1 Example

In [46]:
from openpyxl.utils import get_column_letter

def add_controlled_vocabulary(ws, values, start_row, col, range_name, sheet_name="Database", alignment=default_alignment):
    """
    Write a list of controlled vocabulary values into a specific column of a worksheet,
    and create a named range for these values in the worksheet's own workbook.

    Parameters:
      ws         : Worksheet to write values into.
      values     : Non-empty iterable of controlled vocabulary values.
      start_row  : Row number where writing starts.
      col        : Column index (1-indexed) where values will be written.
      range_name : The name for the defined range to be created.
      sheet_name : The sheet name used in the defined range reference (default is "Database").
      alignment  : Alignment to apply to each cell (default is default_alignment).

    Raises:
      ValueError : If `values` is empty, because the resulting range would run
                   backwards (end row above start row) and cover the wrong cells.
    """
    # Local import keeps this function self-contained within its notebook cell.
    from openpyxl.utils import quote_sheetname

    # Materialise once so generators and other one-shot iterables are supported.
    values = list(values)
    if not values:
        raise ValueError(f"No values supplied for named range '{range_name}'.")

    for row_index, value in enumerate(values, start=start_row):
        ws.cell(row=row_index, column=col, value=value).alignment = alignment

    end_row = start_row + len(values) - 1
    col_letter = get_column_letter(col)
    # Quote the sheet name so titles containing spaces or punctuation still
    # produce a valid reference.
    cell_range = f"{quote_sheetname(sheet_name)}!${col_letter}${start_row}:${col_letter}${end_row}"

    # Register the name on the workbook that owns `ws` rather than on a
    # notebook-level global, so the function works for any workbook.
    ws.parent.defined_names.add(DefinedName(range_name, attr_text=cell_range))

# --- Usage Examples ---

# Controlled vocabularies for the Database sheet.
#
# Each list is written beneath the Database column whose header matches, and is
# exposed as a workbook-level named range. The column index is looked up from
# `db_header` rather than hard-coded, so a vocabulary cannot end up under a
# neighbouring header if the header list changes.

organism_values = [
    "Homo sapiens", "Mus musculus", "Rattus norvegicus", "Sus scrofa",
    "Pseudomonas aeruginosa", "Staphylococcus aureus", "Escherichia coli",
    "Mycobacterium tuberculosis", "Influenza A virus", "SARS-CoV-2",
    "Candida albicans", "Plasmodium falciparum", "Toxoplasma gondii"
]

tissue_values = [
    "Lung", "Liver", "Kidney", "Spleen", "Intestine", "Blood",
    "Lymph Node", "Brain", "Skin", "Pancreas", "Bone Marrow", "Heart"
]

cell_type_values = [
    "HeLa cells", "Macrophage", "Epithelial cell", "T cell",
    "B cell", "Dendritic cell", "Neutrophil", "Fibroblast"
]

spe_values = ["Single-end", "Paired-end"]

strain_identifier_values = [
    "E. coli K-12",                 # Common laboratory strain of E. coli
    "E. coli O157:H7",              # Pathogenic strain of E. coli
    "Staphylococcus aureus USA300", # Clinically relevant S. aureus strain
    "Pseudomonas aeruginosa PAO1",    # Widely used P. aeruginosa strain
    "Mycobacterium tuberculosis H37Rv",  # Standard reference strain for TB research
    "Salmonella Typhimurium SL1344",     # Common strain for Salmonella studies
    "Candida albicans SC5314",      # Standard strain for C. albicans infection research
    "C57BL/6"                     # Common inbred mouse strain used as a host model
]

library_prep_values = [
    "Illumina TruSeq Stranded mRNA",
    "Illumina Nextera XT",
    "NEBNext Ultra II DNA Library Prep",
    "SMARTer Stranded Total RNA-Seq",
    "10x Genomics Chromium Single Cell"
]

sequencing_platform_values = [
    "Illumina NovaSeq 6000",
    "Illumina HiSeq X",
    "Illumina NextSeq 500",
    "Illumina MiSeq",
    "PacBio Sequel II",
    "Oxford Nanopore MinION",
    "Ion Torrent Proton",
    "BGISEQ-500"
]

instrument_model_values = [
    "Illumina NovaSeq 6000",
    "Illumina HiSeq 2500",
    "Illumina MiSeq",
    "PacBio Sequel II",
    "Oxford Nanopore MinION Mk1B",
    "Oxford Nanopore PromethION",
    "Ion Torrent Proton",
    "Ion Torrent S5 XL",
    "BGISEQ-500",
    "MGISEQ-2000"
]

read_align_values = [
    "BWA-MEM",
    "Bowtie2",
    "STAR",
    "HISAT2",
    "minimap2",
    "Novoalign",
    "TopHat2"
]

genome_reference_values = [
    "Homo sapiens GRCh38",
    "Mus musculus GRCm39",
    "Rattus norvegicus Rnor_6.0",
    "Sus scrofa Sscrofa11.1",
    "Pseudomonas aeruginosa PAO1",
    "Staphylococcus aureus N315",
    "Escherichia coli K-12 MG1655",
    "Mycobacterium tuberculosis H37Rv",
    "Influenza A virus (H1N1)",
    "SARS-CoV-2 Wuhan-Hu-1",
    "Candida albicans SC5314",
    "Plasmodium falciparum 3D7",
    "Toxoplasma gondii ME49"
]

gene_annotation_values = [
    "Ensembl Gene",
    "NCBI Gene",
    "RefSeq Gene",
    "GENCODE"
]

data_sharing_values = ["CC BY", "CC BY-SA", "ODC PDDL", "ODbL"]

# (Database header, vocabulary, named range). The header text must match an
# entry of `db_header` exactly; `list.index` raises ValueError otherwise, so a
# misspelt header fails loudly instead of writing into the wrong column.
controlled_vocabularies = [
    ("Organism", organism_values, "organismList"),
    ("Tissue", tissue_values, "TissueList"),
    ("Cell_Type", cell_type_values, "CellTypeList"),
    ("Single or Paired-End", spe_values, "SPEList"),
    ("Strain Identifier", strain_identifier_values, "StrainIdentifierList"),
    ("Library Preparation Method", library_prep_values, "LibraryPreparationMethodList"),
    ("Sequencing Platform", sequencing_platform_values, "SequencingPlatformList"),
    ("Instrument Model", instrument_model_values, "InstrumentModelList"),
    ("Read Alignment Algorithm", read_align_values, "ReadAlignmentAlgorithmList"),
    ("Genome Reference", genome_reference_values, "GenomeReferenceList"),
    ("Gene Annotation", gene_annotation_values, "GeneAnnotationList"),
    ("Data Sharing Policy", data_sharing_values, "DataSharingPolicyList"),
]

for header_name, vocabulary, vocabulary_range_name in controlled_vocabularies:
    add_controlled_vocabulary(
        db_ws,
        vocabulary,
        start_row=2,  # row 1 holds the header
        col=db_header.index(header_name) + 1,  # worksheet columns are 1-indexed
        range_name=vocabulary_range_name,
    )




# 3. Add Data Validation (Drop-down Menus) to the MetaDataTemplate sheet using named ranges

In [47]:
def find_row_by_key(ws, key, key_column=2, start_row=2, end_row=None):
    """
    Locate the first row whose cell in `key_column` holds exactly `key`.

    Parameters:
      ws         : Worksheet to scan (anything providing `max_row` and `iter_rows`).
      key        : Value to look for; compared with `==` against each cell's value.
      key_column : 1-indexed column to inspect (default 2, i.e. Column B).
      start_row  : First row to inspect (default 2, which skips a header row).
      end_row    : Last row to inspect; falls back to `ws.max_row` when None.

    Returns:
      The row number of the first match, or None when no row matches.
    """
    last_row = ws.max_row if end_row is None else end_row
    key_index = key_column - 1  # row tuples are 0-indexed
    scanned_rows = ws.iter_rows(min_row=start_row, max_row=last_row, values_only=False)
    matching_row_numbers = (
        cells[0].row for cells in scanned_rows if cells[key_index].value == key
    )
    return next(matching_row_numbers, None)

def add_dropdown_validation(ws, field_name, named_range, target_column="C"):
    """
    Add a drop-down list validation to a cell in the worksheet for a given field.

    Parameters:
      ws           : The openpyxl worksheet object where the validation will be added.
      field_name   : The key to search for (looked up with find_row_by_key's defaults,
                     i.e. in Column B starting at row 2).
      named_range  : The name of the defined range in the workbook containing the list values.
      target_column: The column letter where the drop-down will be applied (default is "C").

    The row holding `field_name` is located and a list validation referring to
    `named_range` is attached to the cell in `target_column` of that row. If the
    field is not found, nothing is added and a warning is emitted so that a
    misspelt key does not go unnoticed.
    """
    row = find_row_by_key(ws, field_name)
    if row is None:
        import warnings

        warnings.warn(
            f"Field '{field_name}' not found in the worksheet; "
            f"no drop-down added for '{named_range}'.",
            stacklevel=2,
        )
        return

    validation = DataValidation(
        type="list",
        formula1=f"={named_range}",
        allow_blank=True,
        showDropDown=False,
        # Set explicitly: without the error alert, values outside the list are
        # accepted silently. NOTE(review): newer openpyxl releases appear to
        # default this to False — confirm against the installed version.
        showErrorMessage=True,
    )
    ws.add_data_validation(validation)
    validation.add(f"{target_column}{row}")

# MetaDataTemplate key -> named range supplying its drop-down values.
# Stored as a list of (field, named_range) pairs, in insertion order.
fields_validations = list({
    "Organism": "organismList",
    "Tissue": "TissueList",
    "Cell_Type": "CellTypeList",
    "Single_or_Paired-End": "SPEList",
    "Strain_Identifier": "StrainIdentifierList",
    "Library_Preparation_Method": "LibraryPreparationMethodList",
    "Sequencing_Platform": "SequencingPlatformList",
    "Instrument_Model": "InstrumentModelList",
    "Read_Alignment_Algorithm": "ReadAlignmentAlgorithmList",
    "Genome_Reference": "GenomeReferenceList",
    "Gene_Annotation": "GeneAnnotationList",
    "Data_Sharing_Policy": "DataSharingPolicyList",
}.items())

# Attach one drop-down per pair to the MetaDataTemplate sheet.
for field_and_range in fields_validations:
    add_dropdown_validation(metadata_ws, *field_and_range)




# For Collection_Date in MetaDataTemplate, restrict the Value cell to real
# dates and display them in ISO 8601 form (YYYY-MM-DD).
collection_date_row = find_row_by_key(metadata_ws, "Collection_Date")
if collection_date_row:
    cell_coord = f"C{collection_date_row}"
    date_validation = DataValidation(
        type="date",
        operator="between",
        # The bounds must evaluate to dates. A quoted literal such as
        # "1900-01-01" is a text string to Excel, not a date, so DATE() is used.
        formula1="DATE(1900,1,1)",
        formula2="DATE(2100,12,31)",
        allow_blank=True,
        # Set explicitly so the `error` text below is actually shown and
        # out-of-range or non-date input is rejected.
        showErrorMessage=True,
        error="Please enter a date in YYYY-MM-DD format."
    )
    metadata_ws.add_data_validation(date_validation)
    date_validation.add(cell_coord)
    # Date validation checks the value, not how it is typed or shown; the
    # number format makes an accepted date display as YYYY-MM-DD.
    metadata_ws[cell_coord].number_format = "yyyy-mm-dd"


# 4. Save the workbook to a file

In [48]:
# Target file name; it is a relative path and carries a fixed date stamp.
output_filename = "combined_metadata_template_2025-03-26.xlsx"
# Write the workbook (both sheets, named ranges and validations) to that file.
wb.save(output_filename)
# Report the name of the file that was just written.
print(f"Excel file with 'MetaDataTemplate' and 'Database' sheets created as '{output_filename}'")

Excel file with 'MetaDataTemplate' and 'Database' sheets created as 'combined_metadata_template_2025-03-26.xlsx'
