# ETL: Loading data into Beacon-compliant JSON format


In [1]:
import duckdb
from functools import lru_cache
from itertools import chain
import json

In [2]:
# Load CSV files into DuckDB
con = duckdb.connect(database='./metadata.db')

# Table name -> CSV file it is built from. Every column is read as VARCHAR, and a
# table that already exists in the database file is left untouched.
csv_sources = {
    "datasets": "./data/dataset.csv",
    "individuals": "./data/individuals.csv",
    "biosamples": "./data/biosamples.csv",
    "runs": "./data/runs.csv",
    "analyses": "./data/analyses.csv",
    "dict": "./data/data_dictionary.csv",
    "diseases": "./data/individual_diseases.csv",
}
for table_name, csv_path in csv_sources.items():
    con.execute(
        f"CREATE TABLE IF NOT EXISTS {table_name} AS "
        f"SELECT * FROM read_csv('{csv_path}', ALL_VARCHAR=TRUE)"
    )

# Show which tables are now available
con.execute("SHOW TABLES").df()


Unnamed: 0,name
0,analyses
1,biosamples
2,datasets
3,dict
4,diseases
5,individuals
6,runs


In [3]:
# Pull the whole datasets table into a pandas DataFrame and display it
datasets_df = (
    con
    .execute("SELECT * FROM datasets")
    .df()
)
datasets_df

Unnamed: 0,id,createDateTime,dataUseConditions,dataUseConditionsVersions,description,externalUrl,info,name,updateDateTime,version
0,UNQ_1,2021-03-21T02:37:00-08:00,DUO:0000042,17-07-2016,Simulation set 1.,http://example.org/wiki/Main_Page,{},Dataset with fake data,2022-08-05T17:21:00+01:00,v1.1


In [4]:
@lru_cache(maxsize=1000)
def fetch_term(term):
    """Look up one term in the `dict` table and return its row as a dict.

    Results are memoised with `lru_cache`, so the returned dict is shared
    between calls and should be treated as read-only.

    Args:
        term: Identifier to match against the `id` column of the `dict` table.
            An empty (falsy) value short-circuits without querying.

    Returns:
        The first matching row converted with `Series.to_dict()`, or a
        placeholder with empty `id`, `label` and `ontology` for an empty term.

    Raises:
        IndexError: If no row in `dict` has the requested id.
    """
    if not term:
        return {"id": "", "label": "", "ontology": ""}
    # Bind the value as a query parameter instead of formatting it into the SQL text,
    # so identifiers containing quotes cannot break or alter the statement.
    result = con.execute("SELECT * FROM dict WHERE id = ?", [term]).df()
    if result.empty:
        raise IndexError(f"term {term!r} not found in the data dictionary")
    return result.iloc[0].to_dict()

In [5]:
# Build the dataset document from the datasets table.
# `dataset` is reassigned on every iteration, so with several rows only the
# last one is kept (the table shown above holds a single row).
for _, dataset_row in datasets_df.iterrows():
    # Null cells are blanked so the string handling below never sees a non-string.
    data = dataset_row.fillna("").to_dict()

    # Conditions and their versions are parallel comma-separated lists.
    duo_ids = data["dataUseConditions"].split(",") if data["dataUseConditions"] else []
    duo_versions = (
        data["dataUseConditionsVersions"].split(",")
        if data["dataUseConditionsVersions"]
        else []
    )

    dataset = {
        "id": data["id"],
        "createDateTime": data["createDateTime"],
        "dataUseConditions": {
            "duoDataUse": [
                {
                    "id": cond,
                    "label": fetch_term(cond)["label"],
                    "version": ver
                } for (cond, ver) in zip(duo_ids, duo_versions)
            ]
        },
        # The remaining fields are read from the row instead of being hard-coded,
        # so the document follows the CSV content.
        "description": data["description"],
        "externalUrl": data["externalUrl"],
        # `info` is stored as text (all columns are VARCHAR); parse it back into an object.
        "info": json.loads(data["info"]) if data["info"] else {},
        "name": data["name"],
        "updateDateTime": data["updateDateTime"],
        "version": data["version"]
    }

print(json.dumps(dataset, indent=2))

{
  "id": "UNQ_1",
  "createDateTime": "2021-03-21T02:37:00-08:00",
  "dataUseConditions": {
    "duoDataUse": [
      {
        "id": "DUO:0000042",
        "label": "general research use",
        "version": "17-07-2016"
      }
    ]
  },
  "description": "Simulation set 1.",
  "externalUrl": "http://example.org/wiki/Main_Page",
  "info": {},
  "name": "Dataset with fake data",
  "updateDateTime": "2022-08-05T17:21:00+01:00",
  "version": "v1.1"
}


In [6]:
# Pull the whole individuals table into a pandas DataFrame and display it
individuals_df = (
    con
    .execute("SELECT * FROM individuals")
    .df()
)
individuals_df

Unnamed: 0,id,ethnicity_id,geographic_origin_id,karyotypic_sex,sex_id,interventions_or_procedures
0,UNQ_1-1,SNOMED:52075006,SNOMED:223688001,XXY,SNOMED:407378000,"NCIT:C79426,NCIT:C64264"
1,UNQ_1-2,SNOMED:12556008,SNOMED:223688001,XXYY,SNOMED:407378000,"NCIT:C79426,NCIT:C64264"
2,UNQ_1-3,SNOMED:113170005,SNOMED:223688001,XXX,SNOMED:407374003,"NCIT:C79426,NCIT:C64263"
3,UNQ_1-4,SNOMED:10432001,SNOMED:223600005,XYY,SNOMED:407377005,"NCIT:C79426,NCIT:C64264"
4,UNQ_1-5,SNOMED:12556008,SNOMED:223498002,XXXX,SNOMED:407374003,"NCIT:C64263,NCIT:C64264"
5,UNQ_1-6,SNOMED:17789004,SNOMED:223713009,XX,SNOMED:248152002,NCIT:C93025
6,UNQ_1-7,SNOMED:77502007,SNOMED:223498002,XXXY,SNOMED:407377005,NCIT:C79426
7,UNQ_1-8,SNOMED:89026003,SNOMED:223498002,XX,SNOMED:407378000,NCIT:C64263
8,UNQ_1-9,SNOMED:10292001,SNOMED:223498002,XXXX,SNOMED:407377005,
9,UNQ_1-10,SNOMED:76460008,SNOMED:223688001,XXXY,SNOMED:248153007,NCIT:C64264


In [7]:
# One row per individual, with that individual's disease codes joined by commas;
# the frame is indexed by individual_id so codes can be looked up by id.
diseases_query = (
    "SELECT individual_id, GROUP_CONCAT(disease, ',') diseases "
    "FROM diseases GROUP BY individual_id"
)
diseases_df = con.execute(diseases_query).df().set_index("individual_id")
diseases_df

Unnamed: 0_level_0,diseases
individual_id,Unnamed: 1_level_1
UNQ_1-4,SNOMED:26929004
UNQ_1-9,SNOMED:359642000
UNQ_1-6,"SNOMED:23853001,SNOMED:722600006,SNOMED:80690008"
UNQ_1-10,"SNOMED:734099007,SNOMED:56265001,SNOMED:254955001"
UNQ_1-7,SNOMED:734099007
UNQ_1-3,"SNOMED:734099007,SNOMED:359642000"


In [8]:
def get_disease_codes(individual_id):
    """Return the list of disease codes recorded for one individual.

    Reads the first column of the `diseases_df` row indexed by
    `individual_id` and splits it on commas. Individuals missing from the
    index, or whose stored string is empty, yield an empty list.
    """
    if individual_id in diseases_df.index:
        joined_codes = diseases_df.loc[individual_id].iloc[0]
        if joined_codes:
            return joined_codes.split(",")
    return []

In [9]:
# Build one individual document per row of the individuals table.
individuals = []

for _, individual_row in individuals_df.iterrows():
    # fillna() returns a new Series; the row handed out by iterrows() is not
    # modified in place, which pandas documents as unsafe.
    data = individual_row.fillna("").to_dict()

    # Comma-separated procedure codes; a blank cell means no procedures.
    procedure_codes = (
        data["interventions_or_procedures"].split(",")
        if data["interventions_or_procedures"]
        else []
    )

    individual = {
        "id": data["id"],
        "ethnicity": {
            "id": data["ethnicity_id"],
            "label": fetch_term(data["ethnicity_id"])["label"]
        },
        "geographicOrigin": {
            "id": data["geographic_origin_id"],
            "label": fetch_term(data["geographic_origin_id"])["label"]
        },
        # Diseases live in a separate table and are looked up by individual id.
        "diseases": [
            {
                "diseaseCode": {
                    "id": code,
                    "label": fetch_term(code)["label"]
                }
            }
            for code in get_disease_codes(data["id"])
        ],
        "interventionsOrProcedures": [
            {
                "procedureCode": {
                    "id": proc,
                    "label": fetch_term(proc)["label"]
                }
            }
            for proc in procedure_codes
        ],
        "karyotypicSex": data["karyotypic_sex"],
        "sex": {
            "id": data["sex_id"],
            "label": fetch_term(data["sex_id"])["label"]
        }
    }
    individuals.append(individual)

print(json.dumps(individuals, indent=2))

[
  {
    "id": "UNQ_1-1",
    "ethnicity": {
      "id": "SNOMED:52075006",
      "label": "Congolese"
    },
    "geographicOrigin": {
      "id": "SNOMED:223688001",
      "label": "United States of America"
    },
    "diseases": [],
    "interventionsOrProcedures": [
      {
        "procedureCode": {
          "id": "NCIT:C79426",
          "label": "Cancer Diagnostic or Therapeutic Procedure"
        }
      },
      {
        "procedureCode": {
          "id": "NCIT:C64264",
          "label": "Imaging Biomarker Analysis"
        }
      }
    ],
    "karyotypicSex": "XXY",
    "sex": {
      "id": "SNOMED:407378000",
      "label": "Surgically transgendered transsexual, male-to-female"
    }
  },
  {
    "id": "UNQ_1-2",
    "ethnicity": {
      "id": "SNOMED:12556008",
      "label": "Tamils"
    },
    "geographicOrigin": {
      "id": "SNOMED:223688001",
      "label": "United States of America"
    },
    "diseases": [],
    "interventionsOrProcedures": [
      {
        "

In [10]:
# Pull the whole biosamples table into a pandas DataFrame and display it
biosamples_df = (
    con
    .execute("SELECT * FROM biosamples")
    .df()
)
biosamples_df

Unnamed: 0,id,individual_id,biosample_status_id,collection_date,collection_moment,histological_diagnosis_id,obtention_procedure_id,pathological_tnm_finding,sample_origin_detail_id,sample_origin_type_id,tumor_progression_id,notes
0,UNQ_1-1,UNQ_1-1,SNOMED:365641003,2019-04-23,P32Y6M1D,SNOMED:719046005,NCIT:C157179,NCIT:C48725,SNOMED:258497007,SNOMED:31675002,NCIT:C84509,
1,UNQ_1-2,UNQ_1-2,SNOMED:702782002,2022-04-23,P32Y6M1D,SNOMED:771439009,,NCIT:C48699,SNOMED:734336008,SNOMED:31675002,,
2,UNQ_1-3,UNQ_1-3,SNOMED:702782002,2021-04-23,P32Y6M1D,SNOMED:771439009,,NCIT:C48725,,SNOMED:702451000,NCIT:C4813,
3,UNQ_1-4,UNQ_1-4,SNOMED:365641003,2021-04-23,P7D,SNOMED:771439009,NCIT:C157179,NCIT:C48725,SNOMED:258603007,SNOMED:782814004,NCIT:C84509,
4,UNQ_1-5,UNQ_1-5,SNOMED:310294002,2022-04-23,P7D,SNOMED:362965005,,NCIT:C48725,SNOMED:258500001,SNOMED:782814004,,
5,UNQ_1-6,UNQ_1-6,SNOMED:276447000,2018-04-23,P32Y6M1D,SNOMED:719046005,NCIT:C15189,,,SNOMED:782814004,NCIT:C84509,
6,UNQ_1-7,UNQ_1-7,SNOMED:310294002,2021-04-23,P32Y6M1D,SNOMED:237592006,,NCIT:C48699,SNOMED:734336008,SNOMED:31675002,NCIT:C84509,
7,UNQ_1-8,UNQ_1-8,SNOMED:702782002,2015-04-23,P32Y6M1D,SNOMED:237592006,,,SNOMED:385338007,SNOMED:422236008,,
8,UNQ_1-9,UNQ_1-9,SNOMED:310293008,2018-04-23,P32Y6M1D,SNOMED:771439009,,NCIT:C48709,,SNOMED:31675002,,
9,UNQ_1-10,UNQ_1-10,SNOMED:365641003,2022-04-23,P7D,SNOMED:719046005,,NCIT:C48709,,SNOMED:422236008,NCIT:C84509,


In [11]:
def _biosample_term(code):
    """Return the {"id", "label"} pair for `code`, with the label taken from fetch_term."""
    return {"id": code, "label": fetch_term(code)["label"]}


# Build one biosample document per row of the biosamples table.
biosamples = []

for _, biosample_row in biosamples_df.iterrows():
    # fillna() returns a new Series; the row handed out by iterrows() is not
    # modified in place, which pandas documents as unsafe.
    data = biosample_row.fillna("").to_dict()

    biosample = {
        "id": data["id"],
        "individualId": data["individual_id"],
        "biosampleStatus": _biosample_term(data["biosample_status_id"]),
        "collectionDate": data["collection_date"],
        "collectionMoment": data["collection_moment"],
        "histologicalDiagnosis": _biosample_term(data["histological_diagnosis_id"]),
        "obtentionProcedure": {
            "procedureCode": _biosample_term(data["obtention_procedure_id"])
        },
        # A blank cell yields an empty list rather than a single term with an empty id.
        # NOTE(review): splitting on commas mirrors interventions_or_procedures;
        # confirm the column is meant to hold several findings.
        "pathologicalTnmFinding": [
            _biosample_term(code)
            for code in data["pathological_tnm_finding"].split(",")
            if code
        ],
        "sampleOriginDetail": _biosample_term(data["sample_origin_detail_id"]),
        "sampleOriginType": _biosample_term(data["sample_origin_type_id"]),
        "tumorProgression": _biosample_term(data["tumor_progression_id"]),
        "info": {},
        # Taken from the notes column instead of a hard-coded empty string.
        "notes": data["notes"]
    }
    biosamples.append(biosample)

print(json.dumps(biosamples, indent=2))

[
  {
    "id": "UNQ_1-1",
    "individualId": "UNQ_1-1",
    "biosampleStatus": {
      "id": "SNOMED:365641003",
      "label": "Minor blood groups - finding"
    },
    "collectionDate": "2019-04-23",
    "collectionMoment": "P32Y6M1D",
    "histologicalDiagnosis": {
      "id": "SNOMED:719046005",
      "label": "12q14 microdeletion syndrome"
    },
    "obtentionProcedure": {
      "procedureCode": {
        "id": "NCIT:C157179",
        "label": "FGFR1 Mutation Analysis"
      }
    },
    "pathologicalTnmFinding": [
      {
        "id": "NCIT:C48725",
        "label": "T2a Stage Finding"
      }
    ],
    "sampleOriginDetail": {
      "id": "SNOMED:258497007",
      "label": "Abscess swab"
    },
    "sampleOriginType": {
      "id": "SNOMED:31675002",
      "label": "Capillary blood"
    },
    "tumorProgression": {
      "id": "NCIT:C84509",
      "label": "Primary Malignant Neoplasm"
    },
    "info": {},
    "notes": ""
  },
  {
    "id": "UNQ_1-2",
    "individualId": "U

In [12]:
# Pull the whole runs table into a pandas DataFrame and display it
runs_df = (
    con
    .execute("SELECT * FROM runs")
    .df()
)
runs_df

Unnamed: 0,id,biosample_id,individual_id,library_layout,library_selection,library_source,library_strategy,platform,platform_model,run_date
0,UNQ_1-1,UNQ_1-1,UNQ_1-1,PAIRED,RANDOM,GENEPIO:0001969,WGS,PacBio,OBI:0002012,2021-10-18
1,UNQ_1-2,UNQ_1-2,UNQ_1-2,PAIRED,RANDOM,GENEPIO:0001966,WGS,Illumina,OBI:0002048,2021-10-18
2,UNQ_1-3,UNQ_1-3,UNQ_1-3,PAIRED,RANDOM,GENEPIO:0001966,WGS,NanoPore,OBI:0002750,2021-10-18
3,UNQ_1-4,UNQ_1-4,UNQ_1-4,PAIRED,RANDOM,GENEPIO:0001969,WGS,NanoPore,OBI:0002750,2021-10-18
4,UNQ_1-5,UNQ_1-5,UNQ_1-5,PAIRED,RANDOM,GENEPIO:0001969,WGS,PacBio,OBI:0002012,2018-01-01
5,UNQ_1-6,UNQ_1-6,UNQ_1-6,PAIRED,RANDOM,GENEPIO:0001969,WGS,PacBio,OBI:0002012,2018-01-01
6,UNQ_1-7,UNQ_1-7,UNQ_1-7,PAIRED,RANDOM,GENEPIO:0001969,WGS,Illumina,OBI:0002048,2021-10-18
7,UNQ_1-8,UNQ_1-8,UNQ_1-8,PAIRED,RANDOM,GENEPIO:0001966,WGS,NanoPore,OBI:0002750,2021-10-18
8,UNQ_1-9,UNQ_1-9,UNQ_1-9,PAIRED,RANDOM,GENEPIO:0001966,WGS,Illumina,OBI:0002048,2018-01-01
9,UNQ_1-10,UNQ_1-10,UNQ_1-10,PAIRED,RANDOM,GENEPIO:0001969,WGS,Illumina,OBI:0002048,2022-08-08


In [13]:
# Build one run document per row of the runs table.
runs = []

for _, run_row in runs_df.iterrows():
    # fillna() returns a new Series; the row handed out by iterrows() is not
    # modified in place, which pandas documents as unsafe.
    data = run_row.fillna("").to_dict()

    run = {
        "id": data["id"],
        "biosampleId": data["biosample_id"],
        "individualId": data["individual_id"],
        "libraryLayout": data["library_layout"],
        "librarySelection": data["library_selection"],
        # Only library source and platform model are resolved through the data
        # dictionary; the other fields are copied as plain strings.
        "librarySource": {
            "id": data["library_source"],
            "label": fetch_term(data["library_source"])["label"]
        },
        "libraryStrategy": data["library_strategy"],
        "platform": data["platform"],
        "platformModel": {
            "id": data["platform_model"],
            "label": fetch_term(data["platform_model"])["label"]
        },
        "runDate": data["run_date"],
    }
    runs.append(run)

print(json.dumps(runs, indent=2))

[
  {
    "id": "UNQ_1-1",
    "biosampleId": "UNQ_1-1",
    "individualId": "UNQ_1-1",
    "libraryLayout": "PAIRED",
    "librarySelection": "RANDOM",
    "librarySource": {
      "id": "GENEPIO:0001969",
      "label": "other library source"
    },
    "libraryStrategy": "WGS",
    "platform": "PacBio",
    "platformModel": {
      "id": "OBI:0002012",
      "label": "PacBio RS II"
    },
    "runDate": "2021-10-18"
  },
  {
    "id": "UNQ_1-2",
    "biosampleId": "UNQ_1-2",
    "individualId": "UNQ_1-2",
    "libraryLayout": "PAIRED",
    "librarySelection": "RANDOM",
    "librarySource": {
      "id": "GENEPIO:0001966",
      "label": "genomic source"
    },
    "libraryStrategy": "WGS",
    "platform": "Illumina",
    "platformModel": {
      "id": "OBI:0002048",
      "label": "Illumina HiSeq 3000"
    },
    "runDate": "2021-10-18"
  },
  {
    "id": "UNQ_1-3",
    "biosampleId": "UNQ_1-3",
    "individualId": "UNQ_1-3",
    "libraryLayout": "PAIRED",
    "librarySelection": "R

In [14]:
# Pull the whole analyses table into a pandas DataFrame and display it
analyses_df = (
    con
    .execute("SELECT * FROM analyses")
    .df()
)
analyses_df

Unnamed: 0,id,individual_id,biosample_id,run_id,aligner,analysis_date,pipeline_name,pipeline_ref,variant_caller,vcf_sample_id
0,UNQ_1-1,UNQ_1-1,UNQ_1-1,UNQ_1-1,bwa-0.7.8,2020-2-15,pipeline 5,Example,SoapSNP,HG00096
1,UNQ_1-2,UNQ_1-2,UNQ_1-2,UNQ_1-2,minimap2,2019-3-17,pipeline 1,Example,GATK4.0,HG00097
2,UNQ_1-3,UNQ_1-3,UNQ_1-3,UNQ_1-3,minimap2,2018-10-2,pipeline 5,Example,GATK4.0,HG00099
3,UNQ_1-4,UNQ_1-4,UNQ_1-4,UNQ_1-4,bwa-0.7.8,2018-11-9,pipeline 5,Example,kmer2snp,HG00100
4,UNQ_1-5,UNQ_1-5,UNQ_1-5,UNQ_1-5,bowtie,2019-5-27,pipeline 3,Example,GATK4.0,HG00101
5,UNQ_1-6,UNQ_1-6,UNQ_1-6,UNQ_1-6,bwa-0.7.8,2021-11-22,pipeline 1,Example,SoapSNP,HG00102
6,UNQ_1-7,UNQ_1-7,UNQ_1-7,UNQ_1-7,bowtie,2018-1-8,pipeline 1,Example,SoapSNP,HG00103
7,UNQ_1-8,UNQ_1-8,UNQ_1-8,UNQ_1-8,minimap2,2022-3-6,pipeline 1,Example,GATK4.0,HG00105
8,UNQ_1-9,UNQ_1-9,UNQ_1-9,UNQ_1-9,bowtie,2021-2-17,pipeline 2,Example,SoapSNP,HG00106
9,UNQ_1-10,UNQ_1-10,UNQ_1-10,UNQ_1-10,bwa-0.7.8,2019-8-13,pipeline 1,Example,SoapSNP,HG00107


In [15]:
# Output key -> source column. Every analysis field is copied as-is, in this order.
analysis_columns = {
    "id": "id",
    "individualId": "individual_id",
    "biosampleId": "biosample_id",
    "runId": "run_id",
    "aligner": "aligner",
    "analysisDate": "analysis_date",
    "pipelineName": "pipeline_name",
    "pipelineRef": "pipeline_ref",
    "variantCaller": "variant_caller",
    "vcfSampleId": "vcf_sample_id",
}

# Build one analysis document per row of the analyses table.
analyses = []

for _, analysis_row in analyses_df.iterrows():
    # fillna() returns a new Series; the row handed out by iterrows() is not
    # modified in place, which pandas documents as unsafe.
    data = analysis_row.fillna("").to_dict()
    analysis = {key: data[column] for key, column in analysis_columns.items()}
    analyses.append(analysis)

print(json.dumps(analyses, indent=2))

[
  {
    "id": "UNQ_1-1",
    "individualId": "UNQ_1-1",
    "biosampleId": "UNQ_1-1",
    "runId": "UNQ_1-1",
    "aligner": "bwa-0.7.8",
    "analysisDate": "2020-2-15",
    "pipelineName": "pipeline 5",
    "pipelineRef": "Example",
    "variantCaller": "SoapSNP",
    "vcfSampleId": "HG00096"
  },
  {
    "id": "UNQ_1-2",
    "individualId": "UNQ_1-2",
    "biosampleId": "UNQ_1-2",
    "runId": "UNQ_1-2",
    "aligner": "minimap2",
    "analysisDate": "2019-3-17",
    "pipelineName": "pipeline 1",
    "pipelineRef": "Example",
    "variantCaller": "GATK4.0",
    "vcfSampleId": "HG00097"
  },
  {
    "id": "UNQ_1-3",
    "individualId": "UNQ_1-3",
    "biosampleId": "UNQ_1-3",
    "runId": "UNQ_1-3",
    "aligner": "minimap2",
    "analysisDate": "2018-10-2",
    "pipelineName": "pipeline 5",
    "pipelineRef": "Example",
    "variantCaller": "GATK4.0",
    "vcfSampleId": "HG00099"
  },
  {
    "id": "UNQ_1-4",
    "individualId": "UNQ_1-4",
    "biosampleId": "UNQ_1-4",
    "runId"

In [16]:
# Assemble the final submission document from the pieces built in the cells above.
submission = {
    "dataset": dataset,
    "assemblyId": "GRCH38",
    "individuals": individuals,
    "biosamples": biosamples,
    "runs": runs,
    "analyses": analyses
}

print(json.dumps(submission, indent=2))

# The context manager closes (and thereby flushes) the file even if serialisation
# fails; previously the handle passed to json.dump was never closed.
with open("submission.json", "w+") as submission_file:
    json.dump(submission, submission_file, indent=2)


{
  "dataset": {
    "id": "UNQ_1",
    "createDateTime": "2021-03-21T02:37:00-08:00",
    "dataUseConditions": {
      "duoDataUse": [
        {
          "id": "DUO:0000042",
          "label": "general research use",
          "version": "17-07-2016"
        }
      ]
    },
    "description": "Simulation set 1.",
    "externalUrl": "http://example.org/wiki/Main_Page",
    "info": {},
    "name": "Dataset with fake data",
    "updateDateTime": "2022-08-05T17:21:00+01:00",
    "version": "v1.1"
  },
  "assemblyId": "GRCH38",
  "individuals": [
    {
      "id": "UNQ_1-1",
      "ethnicity": {
        "id": "SNOMED:52075006",
        "label": "Congolese"
      },
      "geographicOrigin": {
        "id": "SNOMED:223688001",
        "label": "United States of America"
      },
      "diseases": [],
      "interventionsOrProcedures": [
        {
          "procedureCode": {
            "id": "NCIT:C79426",
            "label": "Cancer Diagnostic or Therapeutic Procedure"
          }
   