# Create ISA objects for the EATRIS-Plus multi-omics data set of a Czech population cohort

See example: https://isatools.readthedocs.io/en/latest/example-createSimpleISAtab.html

Define ontology references

In [1]:
from isatools.model import *
# Registry of the ontology sources cited by the annotations in this notebook.
# Keys are short prefixes so later cells can look a source up by name,
# e.g. ``ontologies["OBI"]``.
ontologies = {
    "CHEBI": OntologySource(
        name="CHEBI - Chemical Entities of Biological Interest",
        file="http://purl.obolibrary.org/obo/chebi.owl",
        description="A structured classification of molecular entities of biological interest focusing on 'small' chemical compounds.",
    ),
    # CRO extends the CASRAI Contributor Roles Taxonomy (CRediT) and replaces
    # the former Contribution Ontology.
    "CRO": OntologySource(
        name="CRO - Contributor Role Ontology",
        file="http://purl.obolibrary.org/obo/cro.owl",
        description="A classification of the diverse roles performed in the work leading to a published research output in the sciences. Its purpose to provide transparency in contributions to scholarly published work, to enable improved systems of attribution, credit, and accountability.",
    ),
    "EDAM": OntologySource(
        name="EDAM - EMBRACE Data and Methods",
        file="http://edamontology.org/EDAM.owl",
        description="EDAM (EMBRACE Data and Methods) is an ontology of common bioinformatics operations, topics, types of data including identifiers, and formats. EDAM comprises common concepts (shared within the bioinformatics community) that apply to semantic annotation of resources.",
    ),
    "EFO": OntologySource(
        name="EFO - Experimental Factor Ontology",
        file="http://www.ebi.ac.uk/efo/efo.owl",
        description="The Experimental Factor Ontology (EFO) provides a systematic description of many experimental variables available in EBI databases, and for external projects such as the NHGRI GWAS catalogue. It combines parts of several biological ontologies, such as anatomy, disease and chemical compounds. The scope of EFO is to support the annotation, analysis and visualization of data handled by many groups at the EBI and as the core ontology for OpenTargets.org",
    ),
    "GENEPIO": OntologySource(
        name="GENEPIO - Genomic Epidemiology Ontology",
        file="http://purl.obolibrary.org/obo/genepio.owl",
        description="The Genomic Epidemiology Ontology (GenEpiO) covers vocabulary necessary to identify, document and research foodborne pathogens and associated outbreaks.",
    ),
    "MMO": OntologySource(
        name="MMO - Measurement method ontology",
        file="http://purl.obolibrary.org/obo/mmo.owl",
        description="A representation of the variety of methods used to make clinical and phenotype measurements.",
    ),
    "NCBITAXON": OntologySource(
        name="NCBI organismal classification",
        file="http://purl.obolibrary.org/obo/ncbitaxon.owl",
        description="An ontology representation of the NCBI organismal taxonomy",
    ),
    "NCIT": OntologySource(
        name="NCI Thesaurus OBO Edition",
        file="http://purl.obolibrary.org/obo/ncit.owl",
        description="The NCIt OBO Edition project aims to increase integration of the NCIt with OBO Library ontologies. NCIt is a reference terminology that includes broad coverage of the cancer domain, including cancer related diseases, findings and abnormalities. NCIt OBO Edition releases should be considered experimental.",
    ),
    "OBI": OntologySource(
        name="OBI - Ontology for Biomedical Investigations",
        file="http://purl.obolibrary.org/obo/obi.owl",
        description="An integrated ontology for the description of life-science and clinical investigations",
    ),
    "OMIABIS": OntologySource(
        name="Ontologized MIABIS",
        file="http://purl.obolibrary.org/obo/omiabis.owl",
        description="An ontological version of MIABIS (Minimum Information About BIobank data Sharing)",
    ),
    "PRIDE": OntologySource(
        name="PRIDE Controlled Vocabulary",
        file="http://purl.obolibrary.org/obo/pride_cv.obo",
        description="The PRIDE PRoteomics IDEntifications (PRIDE) database is a centralized, standards compliant, public data repository for proteomics data, including protein and peptide identifications, post-translational modifications and supporting spectral evidence.",
    ),
    "STATO": OntologySource(
        name="STATO: the statistical methods ontology",
        file="http://purl.obolibrary.org/obo/stato.owl",
        description="STATO is the statistical methods ontology. It contains concepts and properties related to statistical methods, probability distributions and other concepts related to statistical analysis, including relationships to study designs and plots.",
    ),
    "UBERON": OntologySource(
        name="Uber-anatomy ontology",
        file="http://purl.obolibrary.org/obo/uberon.owl",
        description="Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data.",
    ),
}

Create investigation

In [2]:
# Top-level ISA investigation for the EATRIS-Plus project.
# Identifier and both dates are empty strings here; studies are attached
# afterwards through ``investigation.studies.append(...)``.
investigation = Investigation(
    filename="i_investigation.txt",
    identifier="",
    title="EATRIS-Plus - Flagship in Personalised Medicine",
    description="EATRIS-Plus project aims to support the long-term sustainability of the European Research Infrastructure for Translational Medicine (EATRIS) by delivering innovative scientific tools to the research community, strengthening the EATRIS financial model, and reinforcing EATRIS’ leadership in the European Research Area in the field of Personalised Medicine (PM).",
    submission_date="",
    public_release_date="",
    # Declare every ontology source from the registry; list() is the direct
    # way to copy the dict's values into a new list.
    ontology_source_references=list(ontologies.values()),
    publications=None,
    contacts=[
        Person(
            last_name="Keidong",
            first_name="Eliis",
            # mid_initials="",
            affiliation="EATRIS",
            roles=[
                OntologyAnnotation(
                    term="project management role",
                    term_source=ontologies["CRO"],
                    term_accession="http://purl.obolibrary.org/obo/CRO_0000065")])],
    studies=None,
    comments=None)

Create study

In [3]:
# Contact person listed for the cohort study.
study_contacts = [
    Person(
        last_name="Hajduch",
        first_name="Marian",
        # mid_initials="",
        affiliation="Institute of Molecular and Translational Medicine (IMTM), Palacký University Olomouc",
    ),
]

# Ontology terms describing the study design.
study_design_descriptors = [
    OntologyAnnotation(
        term="Multi-omics study",
        term_source=ontologies["PRIDE"],
        term_accession="http://purl.obolibrary.org/obo/PRIDE_0000461",
    ),
    OntologyAnnotation(
        term="population based study design",
        term_source=ontologies["OMIABIS"],
        term_accession="http://purl.obolibrary.org/obo/OMIABIS_0001022",
    ),
]

# The study itself. Protocols, sources, samples, processes and assays are
# passed as None here and filled in by the cells that follow.
cohort_study = Study(
    filename="s_study.txt",
    identifier="",
    title="Multi-omics data of a Czech population cohort",
    description="Multi-omics data of a Czech population cohort",
    submission_date="",
    public_release_date="",
    contacts=study_contacts,
    design_descriptors=study_design_descriptors,
    factors=None,
    protocols=None,
    assays=None,
    sources=None,
    samples=None,
    process_sequence=None,
    other_material=None,
    characteristic_categories=None,
    comments=None,
    units=None,
)

# Attach the study to the investigation.
investigation.studies.append(cohort_study)

Define sample collection protocol

In [4]:
# Protocol parameters, keyed by name. Only the sample collection protocol
# takes one: the anatomical entity the sample was taken from.
anatomical_entity_parameter = ProtocolParameter(
    parameter_name=OntologyAnnotation(
        term="anatomical entity",
        term_source=ontologies["UBERON"],
        term_accession="http://purl.obolibrary.org/obo/UBERON_0001062",
    )
)
protocol_params = {"anatomical entity": anatomical_entity_parameter}

# All protocols used by the study and its assays, keyed by a short handle
# that the later cells use when creating processes.
protocols = {
    "sample_collection": Protocol(
        name="sample_collection_protocol",
        # No ontology term has been chosen for this protocol type yet.
        protocol_type=OntologyAnnotation(
            term="sample collection",
            term_source=None,  # ontologies[""],
            term_accession=None,
        ),
        parameters=[protocol_params["anatomical entity"]],
    ),
    "dna_extraction": Protocol(
        name="DNA extraction",
        protocol_type=OntologyAnnotation(
            term="DNA extraction",
            term_source=ontologies["OBI"],
            term_accession="http://purl.obolibrary.org/obo/OBI_0000257",
        ),
    ),
    "rna_extraction": Protocol(
        name="RNA extraction",
        protocol_type=OntologyAnnotation(
            term="RNA extraction",
            term_source=ontologies["OBI"],
            term_accession="http://purl.obolibrary.org/obo/OBI_0666666",
        ),
    ),
    "WGS": Protocol(
        name="Whole Genome Sequencing",
        protocol_type=OntologyAnnotation(
            term="Whole Genome Sequencing",
            term_source=ontologies["NCIT"],
            term_accession="http://purl.obolibrary.org/obo/NCIT_C101294",
        ),
    ),
    "RNAseq": Protocol(
        name="mRNA Sequencing",
        protocol_type=OntologyAnnotation(
            term="mRNA Sequencing",
            term_source=ontologies["NCIT"],
            term_accession="http://purl.obolibrary.org/obo/NCIT_C129432",
        ),
    ),
    "microRNAseq": Protocol(
        name="MicroRNA Sequencing",
        protocol_type=OntologyAnnotation(
            term="MicroRNA Sequencing",
            term_source=ontologies["NCIT"],
            term_accession="http://purl.obolibrary.org/obo/NCIT_C156057",
        ),
    ),
}

# Register every protocol on the study, in definition order.
cohort_study.protocols.extend(protocols.values())

Define sources and derived samples

In [5]:
# Populate the study with three placeholder individuals (numbered 1..3),
# each yielding one sample through a sample collection process.
for source_idx in range(1, 4):
    # Source (= individual), annotated with its organism.
    organism = Characteristic(
        category=OntologyAnnotation(
            term="Organism",
            term_source=ontologies["OBI"],
            term_accession="http://purl.obolibrary.org/obo/OBI_0100026",
        ),
        value=OntologyAnnotation(
            term="Homo sapiens",
            term_source=ontologies["NCBITAXON"],
            term_accession="http://purl.obolibrary.org/obo/NCBITaxon_9606",
        ),
    )
    source_name = "individual_{}".format(source_idx)
    source = Source(name=source_name, characteristics=[organism])
    cohort_study.sources.append(source)

    # Sample derived from that individual.
    sample_name = "sample_{}".format(source_idx)
    sample = Sample(name=sample_name, derives_from=source)
    cohort_study.samples.append(sample)

    # Process linking source to sample; the anatomical entity parameter is
    # set to "blood".
    collected_from = ParameterValue(
        category=protocol_params["anatomical entity"],  # ProtocolParameter
        value=OntologyAnnotation(
            term="blood",
            term_source=ontologies["UBERON"],
            term_accession="http://purl.obolibrary.org/obo/UBERON_0000178",
        ),
    )
    sample_collection_process = Process(
        name="samplecollection_{}".format(source_idx),
        executes_protocol=protocols["sample_collection"],
        parameter_values=[collected_from],
        inputs=[source],
        outputs=[sample],
    )
    cohort_study.process_sequence.append(sample_collection_process)

Define assays

In [6]:
# Genomics assay: whole genome sequencing on an Illumina platform.
genomics_assay = Assay(
    filename="a_assay_genomics_imtm.txt",
    measurement_type=OntologyAnnotation(
        term="DNA Sequence",
        term_source=ontologies["NCIT"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C13299",
    ),
    technology_type=OntologyAnnotation(
        term="Whole Genome Sequencing",
        term_source=ontologies["NCIT"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C101294",
    ),
    technology_platform=OntologyAnnotation(
        term="Illumina platform",
        term_source=ontologies["GENEPIO"],
        term_accession="http://purl.obolibrary.org/obo/GENEPIO_0001923",
    ),
)

# Transcriptomics assay: mRNA sequencing on an Illumina NovaSeq 6000.
# Workflow description:
# https://eatris.sharepoint.com/:w:/r/sites/eatrisplusgroup/_layouts/15/Doc.aspx?sourcedoc=%7B4966EA29-2AE7-459A-8F58-A3E5962B6988%7D&file=RNAseq%20workflow.docx&action=default&mobileredirect=true
rnaseq_assay = Assay(
    filename="a_assay_rnaseq_fimm.txt",
    measurement_type=OntologyAnnotation(
        term="Binary format",
        term_source=ontologies["EDAM"],
        term_accession="http://edamontology.org/format_2333",
    ),
    technology_type=OntologyAnnotation(
        term="mRNA Sequencing",
        term_source=ontologies["NCIT"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C129432",
    ),
    technology_platform=OntologyAnnotation(
        term="Illumina NovaSeq 6000",
        term_source=ontologies["OBI"],
        term_accession="http://purl.obolibrary.org/obo/OBI_0002630",
    ),
)

# Assays keyed by a short handle; later cells attach samples, materials,
# data files and processes to them through these keys.
assays = {
    "genomics_imtm": genomics_assay,
    "rnaseq_fimm": rnaseq_assay,
}

# Disabled assay sketches, kept for reference (not part of ``assays``):
#    "dnamethylation_uu": Assay(
#        filename = "a_assay_dnamethylation_uu.txt",
#        measurement_type = OntologyAnnotation(
#            term = "Methylation Beta Value",
#            term_source = ontologies["NCIT"],
#            term_accession = "http://purl.obolibrary.org/obo/NCIT_C164051"),
#        technology_type = OntologyAnnotation(
#            term = "DNA methylation profiling by array assay",
#            term_source = ontologies["OBI"],
#            term_accession = "http://purl.obolibrary.org/obo/OBI_0001332"),
#        technology_platform = OntologyAnnotation(
#            term = "Illumina Infinium MethylationEPIC BeadChip",
#            term_source = ontologies["OBI"],
#            term_accession = "http://purl.obolibrary.org/obo/OBI_0002131")),
#    "mirnaseq_fimm": Assay(
#        filename = "a_assay_mirnaseq_fimm.txt"),
#    "mirnaseq_sermas": Assay(
#        filename = "a_assay_mirnaseq_sermas.txt"),
#    "proteomics_imtm": Assay(
#        filename = "a_assay_proteomics_imtm.txt"),
#    "metabolomics_acylcarnitines_rumc": Assay(
#        filename = "a_assay_metabolomics_acylcarnitines_rumc.txt"),
#    "metabolomics_aminoacids_mumc": Assay(
#        filename = "a_assay_metabolomics_aminoacids_mumc.txt"),
#    "metabolomics_fattyacids_mumc": Assay(
#        filename = "a_assay_metabolomics_fattyacids_mumc.txt"),

Data file types (used as the `label` of a `DataFile` in the next cell) are:
'Raw Data File',
'Derived Data File',
'Image File',
'Acquisition Parameter Data File',
'Derived Spectral Data File',
'Protein Assignment File',
'Raw Spectral Data File',
'Peptide Assignment File',
'Array Data File',
'Derived Array Data File',
'Post Translational Modification Assignment File',
'Derived Array Data Matrix File',
'Free Induction Decay Data File',
'Metabolite Assignment File',
'Array Data Matrix File'

In [7]:
# For every study sample, model the DNA and RNA extraction steps and the
# sequencing runs that follow them, then register the resulting materials,
# data files and processes on the matching assay.
# The index from enumerate() was never used, so iterate the samples directly.
for sample in cohort_study.samples:
    # DNA extraction: sample -> DNA extract
    dna = Material(
        name = "DNA_{0}".format(sample.name),
        type_ = "Extract Name")
    dna_extraction_process = Process(
        name = "DNA_extraction_{0}".format(sample.name),
        executes_protocol = protocols["dna_extraction"],
        inputs = [sample],
        outputs = [dna])
    # RNA extraction: sample -> RNA extract
    rna = Material(
        name = "RNA_{0}".format(sample.name),
        type_ = "Extract Name")
    rna_extraction_process = Process(
        name = "RNA_extraction_{0}".format(sample.name),
        executes_protocol = protocols["rna_extraction"],
        inputs = [sample],
        outputs = [rna])
    # genomics measurement: DNA extract -> WGS raw data file
    # NOTE(review): isatools documents DataFile.generated_from as a list of
    # Sample objects, but an extract Material is passed here - confirm this
    # is intended.
    wgs_raw_file = DataFile(
        filename = "WGS_rawdata_{0}".format(dna.name),
        label = "Raw Data File",
        generated_from = [dna])
    wgs_process = Process(
        name = "WGS_{0}".format(dna.name),
        executes_protocol = protocols["WGS"],
        inputs = [dna],
        outputs = [wgs_raw_file])
    # chain extraction -> sequencing
    plink(dna_extraction_process, wgs_process)
    assays["genomics_imtm"].samples.append(sample)
    assays["genomics_imtm"].data_files.append(wgs_raw_file)
    assays["genomics_imtm"].other_material.append(dna)
    assays["genomics_imtm"].process_sequence.append(dna_extraction_process)
    assays["genomics_imtm"].process_sequence.append(wgs_process)
    # transcriptomics - RNAseq: RNA extract -> binary base call file
    rnaseq_binarybasecall_file = DataFile(
        filename = "RNAseq_BCL_{0}".format(rna.name),
        label = "Raw Data File",
        generated_from = [rna])
    rnaseq_process = Process(
        name = "RNAseq_{0}".format(rna.name),
        executes_protocol = protocols["RNAseq"],
        inputs = [rna],
        outputs = [rnaseq_binarybasecall_file])
    # chain extraction -> sequencing
    plink(rna_extraction_process, rnaseq_process)
    assays["rnaseq_fimm"].samples.append(sample)
    assays["rnaseq_fimm"].data_files.append(rnaseq_binarybasecall_file)
    assays["rnaseq_fimm"].other_material.append(rna)
    assays["rnaseq_fimm"].process_sequence.append(rna_extraction_process)
    assays["rnaseq_fimm"].process_sequence.append(rnaseq_process)
    # microRNAseq - FIMM (not modelled yet)

In [8]:
# Register every assay on the study, in definition order.
cohort_study.assays.extend(assays.values())

Write ISA-Tab files

In [9]:
# write to ISA-Tab
import os

from isatools import isatab

out_dir = "."
# exist_ok=True creates the directory only when it is missing, without the
# separate isdir() check (and the gap between checking and creating).
os.makedirs(out_dir, exist_ok=True)
# Serialise the investigation, with its study and assays, as ISA-Tab files.
isatab.dump(investigation, out_dir)

isatools.model.Investigation(identifier='', filename='i_investigation.txt', title='EATRIS-Plus - Flagship in Personalised Medicine', submission_date='', public_release_date='', ontology_source_references=[isatools.model.OntologySource(name='CHEBI - Chemical Entities of Biological Interest', file='http://purl.obolibrary.org/obo/chebi.owl', version='', description='A structured classification of molecular entities of biological interest focusing on 'small' chemical compounds.', comments=[]), isatools.model.OntologySource(name='CRO - Contributor Role Ontology', file='http://purl.obolibrary.org/obo/cro.owl', version='', description='A classification of the diverse roles performed in the work leading to a published research output in the sciences. Its purpose to provide transparency in contributions to scholarly published work, to enable improved systems of attribution, credit, and accountability.', comments=[]), isatools.model.OntologySource(name='EDAM - EMBRACE Data and Methods', file='ht

In [10]:
# write to ISA-JSON
# see example: https://isa-tools.org/isa-api/content/examples/example-createSimpleISAJSON.html
import json

from isatools.isajson import ISAJSONEncoder

# Target file sits next to the ISA-Tab output.
isa_json_path = os.path.join(out_dir, "isa.json")

# Encoder settings: ISA-aware encoder, sorted keys, 4-space indentation.
isa_json_options = {
    "cls": ISAJSONEncoder,
    "sort_keys": True,
    "indent": 4,
    "separators": (',', ': '),
}

with open(isa_json_path, "w") as out_file:
    json.dump(investigation, out_file, **isa_json_options)

In [11]:
#from isatools.convert import json2isatab
#from isatools import isajson
#isajson.validate(open('isa.json'))
#with open("isa.json") as file_pointer:
#    json2isatab.convert(file_pointer, './ISA/')