# GDI Example dataset

The GDI project has its own specific metadata based on DCAT and HealthDCAT-AP.
Here is an example using the following six fields:

1. Dataset Title
2. Dataset Description
3. Number of participants
4. Relevant phenotypes (covid status, sex, age?, smoking status?)
5. DUO Codes?
6. Ancestry / Population?

The first two are part of the Dublin Core terms; the third is defined by HealthDCAT-AP.

Let's first import some stuff and define the HealthDCAT-AP namespace.

⚠️ The HealthDCAT-AP namespace is not formally defined yet, so we use a placeholder namespace.

In [None]:
from typing import List, Union

from pydantic import ConfigDict, Field
from rdflib import DCAT, DCTERMS, Namespace, URIRef
from rdflib.namespace import DefinedNamespace

from sempyro.dcat import DCATDataset
from sempyro.rdf_model import LiteralField


# Define HealthDCAT-AP namespace with some properties
class HEALTHDCAT(DefinedNamespace):
    """Placeholder HealthDCAT-AP vocabulary with the terms used in this example.

    Every annotated name below is a term of the namespace. Accessing one, e.g.
    ``HEALTHDCAT.minTypicalAge``, yields the term's IRI under ``_NS``; that is
    why the annotations are ``URIRef`` and not the datatype of the property
    values. The value datatypes noted in the comments are the ones the
    ``GDIDataset`` model in this notebook assigns to each term.
    """

    minTypicalAge: URIRef  # values: xsd:nonNegativeInteger
    maxTypicalAge: URIRef  # values: xsd:nonNegativeInteger
    numberOfUniqueIndividuals: URIRef  # values: xsd:nonNegativeInteger
    numberOfRecords: URIRef  # values: xsd:nonNegativeInteger
    populationCoverage: URIRef  # values: RDFS literals (plain or language-tagged)

    # FIXME: This is a placeholder until official HealthDCAT-AP namespace is defined
    _NS = Namespace("http://example.com/ns/healthdcat#")

Second, we define a Dataset class for GDI MS8. This is based on the DCAT-AP dataset class, but with
a few additional properties we borrow from HealthDCAT-AP. In this case, the number of participants,
the number of records and the minimum/maximum typical age are mandatory properties, and the
population coverage description is an optional one.

In [2]:
class GDIDataset(DCATDataset):
    # Dataset model for GDI MS8: everything inherited from DCATDataset plus five
    # properties taken from the (placeholder) HEALTHDCAT namespace.
    #
    # The four integer fields are declared without a default, so they must be
    # supplied when the model is instantiated; population_coverage defaults to
    # None and may be omitted.
    #
    # NOTE(review): `rdf_term` / `rdf_type` are extra Field arguments, presumably
    # consumed by sempyro when the model is turned into an RDF graph -- confirm
    # against the sempyro documentation.

    # Schema-level metadata; "$IRI" declares instances as dcat:Dataset.
    model_config = ConfigDict(
                              json_schema_extra={
                                  "$ontology": "https://healthdcat-ap.github.io/",
                                  "$namespace": str(HEALTHDCAT),
                                  "$IRI": DCAT.Dataset,
                                  "$prefix": "healthdcatap"
                              }
                              )
    # Lower bound of the typical age range (required).
    min_typical_age: int = Field(
        description=" Minimum typical age of the population within the dataset",
        rdf_term=HEALTHDCAT.minTypicalAge,
        rdf_type="xsd:nonNegativeInteger",
    )
    # Upper bound of the typical age range (required).
    max_typical_age: int = Field(
        description="Maximum typical age of the population within the dataset",
        rdf_term=HEALTHDCAT.maxTypicalAge,
        rdf_type="xsd:nonNegativeInteger",
    )
    # Number of participants (required).
    no_unique_individuals: int = Field(
        description="Number of participants in study",
        rdf_term=HEALTHDCAT.numberOfUniqueIndividuals,
        rdf_type="xsd:nonNegativeInteger",
    )
    # Number of records in the dataset (required).
    no_records: int = Field(
        description="Size of the dataset in terms of the number of records.",
        rdf_term=HEALTHDCAT.numberOfRecords,
        rdf_type="xsd:nonNegativeInteger",
    )
    # Free-text description of the covered population (optional, defaults to None).
    # Each entry is either a plain string or a LiteralField.
    population_coverage: List[Union[str, LiteralField]] = Field(
        default=None,
        description="A definition of the population within the dataset",
        rdf_term=HEALTHDCAT.populationCoverage,
        rdf_type="rdfs_literal",
    )

Read the storage accession IDs of the files and define a helper that builds the download URL used by each distribution.

In [3]:
# Locations and identifiers shared by the cells below.
STORAGE_IDS_FILE = "/home/ubuntu/fdp/storage-stuff/file_accession_id_sorted.txt"
DATASET_ID = "GDID-becadf5a-a1b2"
STORAGE_DOWNLOAD_URL = "https://download.gdi.biodata.pt"
MEDIA_TYPE_VCF = "https://www.iana.org/assignments/media-types/application/vcf"
MEDIA_TYPE_CSV = "https://www.iana.org/assignments/media-types/text/csv"

# Read the listing and drop blank lines. The encoding is stated explicitly so
# the result does not depend on the locale of the machine running the notebook.
with open(STORAGE_IDS_FILE, "r", encoding="utf-8") as f:
    lines = [raw for raw in f.read().split("\n") if raw.strip() != ""]

# format: (filename, accession_id)
storage_ids = []

for line in lines:
    # vim replaced some tabs for 5 spaces, so a run of 5 spaces counts as a tab.
    # Fields are stripped so stray whitespace cannot end up inside an accession
    # ID (and therefore inside the URLs built from it).
    fields = [field.strip() for field in line.strip().replace(5 * " ", "\t").split("\t")]
    if len(fields) != 2:
        # Same exception type as a failed tuple unpacking, but naming the culprit.
        raise ValueError(
            f"Expected '<filename><TAB><accession_id>' in {STORAGE_IDS_FILE}, got: {line!r}"
        )
    filename, accession_id = fields
    storage_ids.append((filename, accession_id))
    
def get_file_access_url(storage_id):
    """Return the download URL for one ``(filename, accession_id)`` entry.

    Only the second element (the accession ID) ends up in the URL, which is
    rooted at ``STORAGE_DOWNLOAD_URL``.

    Raises:
        ValueError: if ``storage_id`` does not have exactly two elements.
    """
    if len(storage_id) == 2:
        file_id = storage_id[1]
        return f"{STORAGE_DOWNLOAD_URL}/files/{file_id}"

    raise ValueError("Expecting tuple from storage_ids")

Now we are ready to define the dataset. We can do that as a Python dictionary.

⚠️ As DCAT supports multilingual values, literals must usually be defined as a list. Using the
`LiteralField` class, you can specify a language for each string.

In [None]:
import datetime

from sempyro.dcat.dcat_distribution import DCATDistribution
from sempyro.foaf.agent import Agent
from sempyro.vcard.vcard import VCard

def get_dataset_definition(list_of_distribution_URIs):
    """Return the keyword arguments describing the GDI MS8 example dataset.

    Args:
        list_of_distribution_URIs: URIs of the dataset's distributions; stored
            unchanged under the ``distribution`` key.

    Returns:
        A dict meant to be unpacked into ``GDIDataset(**...)``.
    """
    ror_id = "https://ror.org/02q7abn51"  # BioData.pt, used for contact, creator and publisher
    utc = datetime.timezone.utc

    return dict(
        contact_point=[
            VCard(
                hasEmail=["mailto:cto@biodata.pt"],
                full_name=["BioData.pt Chief Technology Officer"],
                hasUID=ror_id,
            )
        ],
        creator=[Agent(name=["BioData.pt"], identifier=ror_id)],
        description=[
            "This dataset is being used as part of the GDI Milestone 8, containing VCFs and "
            "phenotypic data in CSV format about 41514 samples. "
            "The dataset consists only of synthetic data."
        ],
        distribution=list_of_distribution_URIs,
        release_date=datetime.datetime(2024, 7, 7, 11, 11, 11, tzinfo=utc),
        keyword=["COVID"],
        identifier=[DATASET_ID],
        update_date=datetime.datetime(2024, 11, 4, 10, 20, 5, tzinfo=utc),
        publisher=[Agent(name=["BioData.pt"], identifier=ror_id)],
        theme=[URIRef("http://publications.europa.eu/resource/authority/data-theme/HEAL")],
        title=["COVID-19 GWAS and Allele Frequency Lookup Dataset with Population 12 (Italian) subset 1"],
        license=URIRef("https://creativecommons.org/licenses/by-sa/4.0/"),
        no_unique_individuals=41514,
        no_records=18382376,
        population_coverage=["This test dataset covers no real population."],
        min_typical_age=18,
        max_typical_age=64,
    )

def get_distribution_definition(media_type_url, chromosome, access_url) -> DCATDistribution:
    """Build the DCAT distribution describing one chromosome's VCF file.

    Args:
        media_type_url: Media type URL stored as ``media_type``.
        chromosome: Chromosome label interpolated into the title and description.
        access_url: Download URL of the file; stored as a one-element list.

    Returns:
        A ``DCATDistribution`` carrying title, description, access URL, media
        type and the CC BY-SA 4.0 licence.
    """
    # Adjacent string literals are joined verbatim, so the separating space must
    # be part of the text (the previous wording rendered as "COVID-19.cases").
    description = (
        "VCF file containing GWAS and allele frequency lookup data of synthetic COVID-19 "
        f"cases and controls for GDI MS8 demonstration. VCF for chromosome {chromosome} "
        "in population 12 (ITA), sub1"
    )

    distribution_definition = {
        "title": [f"GWAS and Allele Frequency Lookup Data Distribution for GDI MS8 - Population 12 (ITA), subset 1, Chromosome {chromosome}."],
        "description": [description],
        "access_url": [access_url],
        "media_type": media_type_url,
        "license": URIRef("https://creativecommons.org/licenses/by-sa/4.0/"),
    }

    return DCATDistribution(**distribution_definition)

def get_distribution_uri(accession_id):
    """Return the FDP distribution URI for ``accession_id`` as a plain string."""
    return "https://fdp.gdi.biodata.pt/gdi/distribution/{}".format(accession_id)

def get_distribution_subject(accession_id):
    """Return the distribution URI for ``accession_id`` wrapped in a ``URIRef``."""
    return URIRef(get_distribution_uri(accession_id))

# Subject IRI of the dataset; used further down as the subject of the
# dcterms:isPartOf triple that links the dataset to its parent catalog.
dataset_subject = URIRef("https://fdp.gdi.biodata.pt/gdi/dataset")

In [None]:
# Assemble the dataset metadata and instantiate the model

# Each distribution URI is derived from the accession ID, i.e. the second item
# of a (filename, accession_id) storage entry.
dist_accession_ids = [entry[1] for entry in storage_ids]
dist_uris = list(map(get_distribution_uri, dist_accession_ids))

dataset_definition = get_dataset_definition(dist_uris)
example_dataset = GDIDataset(**dataset_definition)

print(example_dataset)


Create the distribution subjects, definitions and graphs, one per VCF file.

In [None]:

# Three parallel lists, filled one entry per storage file.
dist_subjects, dist_definitions, dist_graphs = [], [], []

count = 0
for count, ids in enumerate(storage_ids, start=1):
    # The chromosome label comes from the 1-based position in the listing:
    # position 23 is labelled "X", every other position is used as the number.
    chromosome = "X" if count == 23 else str(count)

    filename, accession_id = ids[0], ids[1]

    # subject IRI of this distribution
    new_dist_sub = get_distribution_subject(accession_id=accession_id)
    dist_subjects.append(new_dist_sub)

    # distribution model pointing at the file's download URL
    file_url = get_file_access_url(ids)
    new_dist_def = get_distribution_definition(
        media_type_url=MEDIA_TYPE_VCF,
        chromosome=chromosome,
        access_url=file_url,
    )
    dist_definitions.append(new_dist_def)

    # RDF graph of the distribution under its subject
    new_dist_graph = new_dist_def.to_graph(new_dist_sub)
    dist_graphs.append(new_dist_graph)
    

Set the credentials, connect to the FAIR Data Point and build the dataset graph.


In [None]:
fdp_parent_catalog = "https://fdp.gdi.biodata.pt/catalog/88e146bb-0fb9-4f45-ad57-26f928750773"
fdp_baseurl = "https://fdp.gdi.biodata.pt"
fdp_user = "albert.einstein@example.com"
# The password is read from a local file so it never appears in the notebook;
# surrounding whitespace (e.g. the trailing newline) is removed.
with open("/home/ubuntu/fdp/passwd.txt", "r", encoding="utf-8") as f:
    passwd = f.read().strip()
fdp_pass = passwd

import fairclient.fdpclient

# Log in to the FAIR Data Point
fdpclient = fairclient.fdpclient.FDPClient(base_url=fdp_baseurl, username=fdp_user, password=fdp_pass)


# Build the graph under `dataset_subject`, the same subject the isPartOf triple
# below is attached to, instead of repeating the URI literal here; the two can
# then never drift apart.
example_dataset_graph = example_dataset.to_graph(dataset_subject)
# Add a reference to the parent catalog to make the FDP happy
example_dataset_graph.add((dataset_subject, DCTERMS.isPartOf, URIRef(fdp_parent_catalog)))




Publish the dataset and its distributions to the FAIR Data Point.

In [None]:
# publish dataset
# `new_dataset` holds whatever create_and_publish returns; the distribution cell
# below interpolates it into a URIRef as the object of dcterms:isPartOf.
new_dataset = fdpclient.create_and_publish("dataset", example_dataset_graph)
print(new_dataset)

In [None]:
# Publish every chromosome distribution and collect what the FDP returns for each.
dist_fdp_ids = []

# dist_subjects, dist_definitions and dist_graphs were filled in lockstep, so
# they are walked together rather than indexed by position.
for dist_sub, dist_def, dist_graph in zip(dist_subjects, dist_definitions, dist_graphs):
    # Attach the distribution to its parent dataset before publishing it
    dist_graph.add((dist_sub, DCTERMS.isPartOf, URIRef(f"{new_dataset}")))
    print(f"dist_sub = {dist_sub}")
    print(f"dist_def = {dist_def}")
    print(f"dist_graph = {dist_graph}")
    dist_fdp_id = fdpclient.create_and_publish(resource_type="distribution", metadata=dist_graph)
    dist_fdp_ids.append(dist_fdp_id)
    print(dist_fdp_id)



## Phenotype distribution

In [7]:
# hardcoded dataset URI
# Overrides `new_dataset` with a fixed value -- presumably the URI of the dataset
# that was already published, so the phenotype distribution below can be attached
# without re-running the publish cell (confirm before reuse).
new_dataset = "https://fdp.gdi.biodata.pt/dataset/661352e6-b08f-42a4-9192-160b3f9e5043"


In [None]:
pheno_accession_id = "GDIF-faca7026-a1b5-11ef"
# get_file_access_url expects a (filename, accession_id) pair; only the
# accession ID ends up in the resulting URL.
access_url = get_file_access_url(("Pop12Sub1_pheno.csv.c4gh", pheno_accession_id))

# Metadata of the CSV distribution holding the phenotypes
pheno_dist_definition = dict(
    title=["GWAS and Allele Frequency Lookup Data Distribution for GDI MS8 - Population 12 (ITA), subset 1, Phenotypes of the individuals."],
    description=["CSV file containing phenotypes in population 12 (ITA), sub1"],
    access_url=[access_url],
    media_type=MEDIA_TYPE_CSV,
    license=URIRef("https://creativecommons.org/licenses/by-sa/4.0/"),
)

pheno_sub = get_distribution_subject(pheno_accession_id)
pheno_dist = DCATDistribution(**pheno_dist_definition)

# Graph of the distribution, linked to the dataset it belongs to
pheno_graph = pheno_dist.to_graph(pheno_sub)
parent_dataset = URIRef(f"{new_dataset}")
pheno_graph.add((pheno_sub, DCTERMS.isPartOf, parent_dataset))





In [None]:
# publish phenotypic data distribution
# Sends the phenotype graph built in the previous cell to the FDP as a
# "distribution" resource and prints the value the client returns.
pheno_fdp_id = fdpclient.create_and_publish(resource_type="distribution", metadata=pheno_graph)
print(pheno_fdp_id)