In [1]:
import openpyxl
from openpyxl import Workbook
from openpyxl.worksheet.datavalidation import DataValidation

In [3]:

# ===============================
# Workbook setup
# ===============================
wb = Workbook()

# -------------------------------
# 1. Metadata Template Sequencing sheet
# -------------------------------
# The workbook's active sheet is reused as the template sheet and renamed.
metadata_ws = wb.active
metadata_ws.title = "Metadata Template Sequencing"

# Column headings of the template; appended as the first row of the sheet.
metadata_header = [
    "Category",
    "Key",
    "Description",
    "Example Value",
    "Controlled Vocabulary / Value Restrictions",
    "Resource Link(s)",
]
metadata_ws.append(metadata_header)

# Metadata rows, one per metadata key. Every row has six entries, in this order:
#   category, key, description, example value,
#   controlled vocabulary / value restrictions, resource link(s).
# Rows are grouped by category and listed in the order they are written out.
rows = [
    # --- Study Information ---
    ["Study Information", "Title",
     "A concise title describing the overall study",
     "Analysis of Intestinal Crypt, Villus, and Polyp Cells",
     "Free text; consider limiting length (e.g., ≤250 characters)",
     "N/A"],

    ["Study Information", "Summary / Abstract",
     "Detailed study summary and objectives",
     "This study investigates transcriptomic changes under infection conditions.",
     "Free text; follow standard writing guidelines",
     "N/A"],

    ["Study Information", "Experimental Design",
     "Overview of study design including factors, controls, and replicates",
     "RNA-seq of single cells comparing infected vs. control conditions.",
     "Free text; consider using standardized descriptors if available",
     "https://www.ebi.ac.uk/arrayexpress/help/"],

    # --- Sample Information ---
    ["Sample Information", "Sample Identifier",
     "Unique identifier for each biological sample",
     "RNA-Seq_Sample_001",
     "Must be unique; use an alphanumeric/underscore format (e.g., regex: ^[A-Za-z0-9_]+$)",
     "N/A"],

    ["Sample Information", "Organism",
     "Species name from which the sample is derived",
     "Pseudomonas aeruginosa",
     "Must conform to NCBI Taxonomy names",
     "https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html"],

    ["Sample Information", "Tissue / Cell Type",
     "Specific tissue, cell line, or cell type of origin",
     "HeLa cells",
     "Use controlled terms from ontologies such as Uberon or Cell Ontology",
     "http://www.ontobee.org/"],

    ["Sample Information", "Strain Identifier",
     "Strain information (if applicable)",
     "JCM 14847",
     "Use recognized strain IDs (e.g., from ATCC, DSMZ)",
     "https://www.atcc.org/"],

    ["Sample Information", "Collection Date",
     "Date on which the sample was collected",
     "2025-03-24",
     "Must follow ISO 8601 format (YYYY-MM-DD)",
     "https://www.iso.org/iso-8601-date-and-time-format.html"],

    # --- Experimental Conditions ---
    ["Experimental Conditions", "Treatment Conditions",
     "Description of the experimental treatment(s) or conditions applied",
     "Anti-Grem1 antibody treatment",
     "If available, select from a controlled list (e.g., EFO terms)",
     "https://www.ebi.ac.uk/efo/"],

    ["Experimental Conditions", "Treatment Duration",
     "Duration for which the treatment was applied",
     "10 weeks",
     "Numeric value with a unit; standardize unit selection (e.g., hours, days, weeks)",
     "N/A"],

    ["Experimental Conditions", "Concentration",
     "Concentration or dosage of the treatment agent",
     "30 mg/kg",
     "Numeric value with unit; ensure consistency across samples",
     "N/A"],

    # --- Sequencing Details ---
    ["Sequencing Details", "Library Preparation Method",
     "Method or kit used for nucleic acid library preparation",
     "TruSeq Stranded mRNA Library Prep Kit",
     "Use a predefined list from vendor documentation or internal standards",
     "https://www.illumina.com/"],

    ["Sequencing Details", "Sequencing Platform",
     "Instrument or platform used for sequencing",
     "Illumina NovaSeq",
     "Must be selected from a controlled list (e.g., Illumina, PacBio, Oxford Nanopore)",
     "https://www.ncbi.nlm.nih.gov/sra/docs/submitdesign/"],

    ["Sequencing Details", "Sequencing Depth",
     "Number of reads obtained or overall depth of sequencing",
     "50 million reads",
     "Free numeric text; standard units should be used",
     "N/A"],

    ["Sequencing Details", "Single or Paired-End",
     "Specifies whether the sequencing was single-end or paired-end",
     "Paired-end",
     "Allowed values: Single-end, Paired-end",
     "N/A"],

    ["Sequencing Details", "Instrument Model",
     "Specific model of the sequencing instrument",
     "Illumina HiSeq 2000",
     "Use a controlled list (as provided by the manufacturer)",
     "https://www.illumina.com/systems/sequencing-platforms.html"],

    ["Sequencing Details", "Quality Control Metrics",
     "Key quality metrics such as RNA integrity number and quality scores",
     "RIN > 8, Q30 > 85%",
     "Free numeric values; standardize units and thresholds",
     "N/A"],

    # --- Bioinformatics Analysis ---
    # The resource link below used to point at the MultiQC repository, which is
    # a QC report aggregator rather than a read aligner; it now points at the
    # home page of HISAT2, the aligner given as this row's example value.
    ["Bioinformatics Analysis", "Read Alignment Algorithm",
     "Software used for aligning reads to the reference genome",
     "HISAT2",
     "Choose from a controlled list (e.g., HISAT2, STAR, Bowtie2)",
     "https://daehwankimlab.github.io/hisat2/"],

    ["Bioinformatics Analysis", "Genome Reference",
     "Reference genome or assembly used for analysis",
     "Pseudomonas aeruginosa PAO1",
     "Must match identifiers from recognized genome databases",
     "https://www.ncbi.nlm.nih.gov/assembly/"],

    ["Bioinformatics Analysis", "Differential Expression Analysis",
     "Software and statistical thresholds used for differential expression analysis",
     "DESeq2 with FDR < 0.05",
     "Choose from accepted tools (e.g., DESeq2, edgeR, limma)",
     "https://bioconductor.org/packages/release/bioc/html/DESeq2.html"],

    ["Bioinformatics Analysis", "Gene Annotation",
     "Databases or methods used for gene functional annotation",
     "Ensembl, KEGG",
     "Use controlled database names; if possible, select from a predefined list",
     "https://www.ensembl.org/"],

    # --- Ethical & Legal ---
    ["Ethical & Legal", "Ethical Approval",
     "Details regarding ethics committee or IRB approval",
     "IRB #12345",
     "Free text; may follow a predefined institutional format",
     "N/A"],

    ["Ethical & Legal", "Data Sharing Policy",
     "Conditions or restrictions related to data sharing",
     "Data available under CC-BY 4.0",
     "Must be selected from a controlled list of licensing options",
     "https://creativecommons.org/licenses/by/4.0/"],

    # --- Contact Information ---
    ["Contact Information", "Contributor / Contact Name",
     "Primary contact or contributor for the study",
     "Jane Doe",
     "Free text; consider enforcing a 'Last Name, First Name' format",
     "N/A"],

    ["Contact Information", "Contact Email",
     "Email address of the study contact",
     "jane.doe@example.com",
     "Must follow standard email format",
     "https://html.spec.whatwg.org/multipage/input.html#valid-e-mail-address"],

    ["Contact Information", "Contact Affiliation",
     "Affiliation or institution of the study contact",
     "Example University",
     "Free text; optionally choose from a controlled list if available",
     "https://grid.ac/"],
]

# Write every metadata entry as its own row, directly beneath the header row.
for metadata_row in rows:
    metadata_ws.append(metadata_row)

# -------------------------------
# 2. Database sheet: lookup lists of controlled vocabulary values
# -------------------------------
db_ws = wb.create_sheet(title="Database")

# Header row of the Database sheet: one column per metadata key.
db_header = [
    "Title", "Summary / Abstract", "Experimental Design",
    "Sample Identifier", "Organism", "Tissue / Cell Type",
    "Strain Identifier", "Collection Date",
    "Treatment Conditions", "Treatment Duration", "Concentration",
    "Library Preparation Method", "Sequencing Platform", "Sequencing Depth",
    "Single or Paired-End", "Instrument Model", "Quality Control Metrics",
    "Read Alignment Algorithm", "Genome Reference",
    "Differential Expression Analysis", "Gene Annotation",
    "Ethical Approval", "Data Sharing Policy",
    "Contributor / Contact Name", "Contact Email", "Contact Affiliation",
]
db_ws.append(db_header)

# Demonstration vocabularies; only these two columns are populated.
organism_values = ["Homo sapiens", "Pseudomonas aeruginosa", "Mus musculus"]
spe_values = ["Single-end", "Paired-end"]

# Terms are written downwards starting just below the header row.
start_row = 2

for column_name, allowed_terms in (
    ("Organism", organism_values),            # 5th header -> column E
    ("Single or Paired-End", spe_values),     # 15th header -> column O
):
    # Worksheet columns are 1-based, list positions are 0-based.
    column_number = db_header.index(column_name) + 1
    for offset, term in enumerate(allowed_terms):
        db_ws.cell(row=start_row + offset, column=column_number, value=term)

# -------------------------------
# 3. Add Data Validation (Drop-down Menus) to the Metadata Template sheet
# -------------------------------


def _find_key_row(ws, key):
    """Return the row number of the first data row whose column-B cell equals `key`.

    Rows are scanned from row 2 (the header row is skipped) through
    `ws.max_row`. Returns None when no row carries the key.
    """
    for (key_cell,) in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=2, max_col=2):
        if key_cell.value == key:
            return key_cell.row
    return None


def _add_list_dropdown(ws, cell_coord, formula):
    """Attach a list-type validation to `cell_coord` on `ws` and return it.

    `formula` is passed through unchanged as `formula1`: either a range
    reference or a quoted, comma-separated list of literal options.
    """
    validation = DataValidation(
        type="list",
        formula1=formula,
        allow_blank=False,
        # openpyxl exposes this flag under the alias `hide_drop_down`: a true
        # value suppresses the in-cell arrow. It was previously True, which
        # hid the very drop-down this section is meant to provide.
        showDropDown=False,
    )
    ws.add_data_validation(validation)
    validation.add(cell_coord)
    return validation


# Drop-down for "Organism": applied to that key's "Example Value" cell (column D),
# backed by the organism list in the Database sheet (E2:E4).
# If the key is missing from the sheet the validation is skipped and stays None.
organism_validation = None
organism_row = _find_key_row(metadata_ws, "Organism")
if organism_row is not None:
    organism_validation = _add_list_dropdown(
        metadata_ws, f"D{organism_row}", '=Database!$E$2:$E$4'
    )

# Drop-down for "Single or Paired-End": same pattern, backed by Database column O (O2:O3).
spe_validation = None
single_paired_row = _find_key_row(metadata_ws, "Single or Paired-End")
if single_paired_row is not None:
    spe_validation = _add_list_dropdown(
        metadata_ws, f"D{single_paired_row}", '=Database!$O$2:$O$3'
    )

# Simple stand-alone drop-down example with inline options.
# It used to be placed in A1, where writing the placeholder text overwrote the
# "Category" column header. It now lives in H1, outside the six-column table
# (A-F), so the header row stays intact.
demo_cell = "H1"
dropdown = _add_list_dropdown(metadata_ws, demo_cell, '"Option1,Option2,Option3"')
metadata_ws[demo_cell] = "Select an Option"

# ===============================
# Write the workbook to disk
# ===============================
# Relative file name, so the file is created relative to the current working directory.
output_filename = "combined_metadata_template.xlsx"
wb.save(output_filename)

confirmation = f"Excel file with Metadata Template and Database created: '{output_filename}'"
print(confirmation)


Excel file with Metadata Template and Database created: 'combined_metadata_template.xlsx'
