* For a given BCO-DMO project (starting with project 826178), I need the list of datasets for that project.

* For each dataset, I need the following:

- Name of dataset

- DOI

- the data itself (the CSV file)

* Each dataset has a set of parameters, and I can pull the parameter short names from the data file. For each parameter I need:

- short name, unit, description, data type, and format (these are listed as field information on the website)



In [None]:
%%capture
!pip install frictionless
!pip install frictionless[excel]
!pip install -q sparqlwrapper


In [None]:
import pandas as pd
import requests
import os
import json

from datetime import datetime, timedelta, timezone

from SPARQLWrapper import SPARQLWrapper, POST, JSON

from frictionless import describe, Package

In [None]:
"""CONSTANTS"""

# URL of the BCO-DMO SPARQL service that the notebook's queries are sent to.
SPARQL_ENDPOINT = "https://lod.bco-dmo.org/sparql"

# Linked-data URI of the project whose datasets are looked up.
PROJECT_URI = "http://lod.bco-dmo.org/id/project/826178"

In [None]:
""" HELPER FUNCTIONS """
def rfc3339_datetime_str():
    """
    Return the current UTC time as an RFC3339-compliant string.

    The timezone-aware "now" is rendered with ``isoformat()`` and its
    ``+00:00`` offset is swapped for the ``Z`` designator.
    """
    utc_now = datetime.now(tz=timezone.utc)
    iso_stamp = utc_now.isoformat()
    return iso_stamp.replace("+00:00", "Z")

def get_sparql_dataframe(service, query):
    """
    Run a SPARQL query and return its result set as a Pandas data frame.

    service: address of the SPARQL endpoint handed to SPARQLWrapper.
    query: text of the SPARQL query to execute.

    The frame has one column per variable listed in the result head, in
    that order, and one row per binding. A variable that has no binding in
    a given row is stored as None.
    """
    client = SPARQLWrapper(service)
    client.setQuery(query)
    client.setReturnFormat(JSON)

    # Parse the JSON results document straight from the HTTP response.
    payload = json.load(client.query().response)
    variables = payload['head']['vars']

    records = [
        [binding.get(name, {}).get('value') for name in variables]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(records, columns=variables)



In [None]:
"""Get the Projects Datasets"""

# Lists every dataset attached to PROJECT_URI, one row per dataset.
# Title and DOI are wrapped in OPTIONAL, so either column can come back
# as None when the dataset has no such value.
# Results are ordered by dataset URI only: this query binds no other
# variable that would be useful as a sort key.
PROJECT_DATASETS_QUERY = """
SELECT ?dataset ?title ?doi
WHERE {
  VALUES ?project { <""" + PROJECT_URI + """>}
  ?project a odo:Project .
  ?project odo:hasDataset ?dataset .
  ?dataset a odo:Dataset .
  OPTIONAL { ?dataset odo:datasetTitle ?title }
  OPTIONAL { ?dataset bibo:doi ?doi }
}
ORDER BY ?dataset"""

# Fetch the dataset list once; later cells iterate over `metadata`.
metadata = get_sparql_dataframe(SPARQL_ENDPOINT, PROJECT_DATASETS_QUERY)
metadata.head(10)

Unnamed: 0,dataset,title,doi
0,http://lod.bco-dmo.org/id/dataset/853440,ZooSCAN images of zooplankton collected during...,10.26008/1912/bco-dmo.853440.1
1,http://lod.bco-dmo.org/id/dataset/854077,ZooSCAN biovolume to biomass from imaged zoopl...,10.26008/1912/bco-dmo.854077.1
2,http://lod.bco-dmo.org/id/dataset/857891,ZooSCAN output from of imaged zooplankton coll...,10.26008/1912/bco-dmo.857891.1
3,http://lod.bco-dmo.org/id/dataset/861266,BIOS-SCOPE survey biogeochemical data as colle...,10.26008/1912/bco-dmo.861266.1
4,http://lod.bco-dmo.org/id/dataset/920443,Chemical analyses of size-fractionated particl...,10.26008/1912/bco-dmo.920443.1
5,http://lod.bco-dmo.org/id/dataset/964684,Size fractionated Amino Acid data collected in...,10.26008/1912/bco-dmo.964684.1
6,http://lod.bco-dmo.org/id/dataset/964801,Size fractionated carbohydrate data collected ...,10.26008/1912/bco-dmo.964801.1
7,http://lod.bco-dmo.org/id/dataset/964826,Size Fractionated Particulate Organic Carbon (...,10.26008/1912/bco-dmo.964826.1


In [None]:
"""QUERIES for dataset data files and dataset parameters"""

# Both templates contain the literal placeholder '{dataset_uri}', which the
# caller substitutes with str.replace(). They are deliberately NOT used
# with str.format(), because SPARQL's own braces would clash with it.

# Files attached to a dataset through odo:dataFile, with their descriptor
# details. The primary-file flag is read into ?primary_flag and exposed as
# the string column ?is_primary_data_file; a SELECT expression may not
# assign to a variable that is already bound in the pattern, so the two
# need different names. The column is unbound when the flag is absent.
DATASET_FILES_QUERY = """
SELECT ?url (STR(?primary_flag) AS ?is_primary_data_file) ?bytesize ?type ?mimetype ?type_abbreviation ?type_name
WHERE {
  VALUES ?dataset { <{dataset_uri}>}
  ?dataset a odo:Dataset .
  ?dataset odo:dataFile ?data_file .
  OPTIONAL { ?data_file odo:isPrimaryDataFile ?primary_flag }
  ?data_file odo:usesFileDescriptor [
    odo:bytesize ?bytesize ;
    odo:downloadUrl ?url ;
    odo:fileType ?type ;
  ] .
  ?type skos:prefLabel ?type_name .
  OPTIONAL { ?type odo:mimetype ?mimetype }
  OPTIONAL { ?type skos:altLabel ?type_abbreviation }
}
ORDER BY ?data_file"""

# Parameters (fields) a dataset stores values for. Only the supplied name
# is required; definition, units, datatype and format are OPTIONAL and may
# therefore be missing from any given row.
DATASET_PARAMS_QUERY = """
SELECT ?supplied_name ?supplied_definition ?datatype ?units ?format
WHERE {
  VALUES ?dataset { <{dataset_uri}>}
  ?dataset a odo:Dataset .
  ?dataset odo:storesValuesFor ?dataset_param .
  ?dataset_param skos:prefLabel ?supplied_name .
    OPTIONAL { ?dataset_param skos:definition ?supplied_definition }
    OPTIONAL { ?dataset_param odo:hasUnitOfMeasure/rdf:value ?units }
    OPTIONAL { ?dataset_param odo:datatype/odo:frictionlessdataDatatype ?datatype }
    OPTIONAL { ?dataset_param odo:valueFormat ?format }
}
ORDER BY ?dataset_param"""

In [None]:
""" Create a Frictionless Data Package """

# Optional columns of the parameter query, mapped to the custom key each
# value is stored under. Dict order fixes the key order in the output.
_OPTIONAL_PARAM_KEYS = {
    'supplied_definition': 'bcodmo:description',
    'datatype': 'bcodmo:datatype',
    'units': 'bcodmo:units',
    'format': 'bcodmo:valueFormat',
}

# String forms that count as "this is the primary data file". The query
# returns the flag through STR(); 'true' and '1' are the two lexical forms
# of a true xsd:boolean. NOTE(review): assumes the flag is an xsd:boolean.
_TRUE_LITERALS = ('true', '1')


def _parameter_schema(parameters):
    """Convert the parameter rows of one dataset into a list of dicts.

    parameters: data frame with the columns selected by DATASET_PARAMS_QUERY.

    Returns one dict per row. 'bcodmo:name' is always set; each optional
    key is added only when the row actually holds a value for it.
    """
    schema = []
    for _, parameter in parameters.iterrows():
        param = {'bcodmo:name': parameter['supplied_name']}
        for column, key in _OPTIONAL_PARAM_KEYS.items():
            value = parameter[column]
            if pd.notna(value):
                param[key] = value
        schema.append(param)
    return schema


biosscope = Package(name='biosscope-bcodmo-datasets', profile='data-package')
biosscope.title = 'Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean Processes and Ecology'
biosscope.description = 'BIOSSCOPE datasets from BCO-DMO'
biosscope.created = rfc3339_datetime_str()
biosscope.sources = []

for _, dataset in metadata.iterrows():
    dataset_uri = dataset['dataset']

    # Save the dataset as a 'source' in the Package. Title and DOI are
    # optional in the project query, so they are left out when missing
    # rather than being written as null values.
    source = {'path': dataset_uri}
    for key in ('title', 'doi'):
        if pd.notna(dataset[key]):
            source[key] = dataset[key]
    biosscope.sources.append(source)

    # Get the BCO-DMO parameters
    parameters = get_sparql_dataframe(
        SPARQL_ENDPOINT,
        DATASET_PARAMS_QUERY.replace('{dataset_uri}', dataset_uri),
    )
    schema = _parameter_schema(parameters)

    # Get the files the dataset links to through odo:dataFile.
    # NOTE(review): presumably supplemental documentation is linked through
    # a different property and is therefore not returned here - confirm.
    files = get_sparql_dataframe(
        SPARQL_ENDPOINT,
        DATASET_FILES_QUERY.replace('{dataset_uri}', dataset_uri),
    )
    for _, data_file in files.iterrows():

        # Use Frictionless to describe the file
        datafile = describe(data_file['url'])

        # Get Table stats
        if datafile.type == 'table':
            datafile.infer(stats=True)

        # Specify which dataset this file belongs to
        datafile.sources = [source]

        # If the file is marked as the primary file for the dataset, attach
        # the parameters to the file. An empty parameter list is skipped so
        # that no empty 'bcodmo:parameters' entry is written.
        if schema and data_file['is_primary_data_file'] in _TRUE_LITERALS:
            datafile.custom['bcodmo:parameters'] = schema

        # Add the file to the package
        biosscope.add_resource(datafile)

# Save the package
print(biosscope.to_json())
biosscope.to_json('datapackage.json')

{
  "name": "biosscope-bcodmo-datasets",
  "title": "Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean Processes and Ecology",
  "description": "BIOSSCOPE datasets from BCO-DMO",
  "profile": "data-package",
  "sources": [
    {
      "path": "http://lod.bco-dmo.org/id/dataset/853440",
      "title": "ZooSCAN images of zooplankton collected during BATS MOCNESS tows during R/V Atlantic Explorer cruises AE1614, AE1712, AE1830, and AE1819 in the vicinity of the Bermuda Atlantic Time-series Study from 2016 to 2018",
      "doi": "10.26008/1912/bco-dmo.853440.1"
    },
    {
      "path": "http://lod.bco-dmo.org/id/dataset/854077",
      "title": "ZooSCAN biovolume to biomass from imaged zooplankton collected during MOCNESS tows during various R/V Atlantic Explorer cruises and small boat deployments in the Sargasso Sea betwen 2016 to 2019",
      "doi": "10.26008/1912/bco-dmo.854077.1"
    },
    {
      "path": "http://lod.bco-dmo.org/id/dataset/857891",
      "title": "Zoo

'{\n  "name": "biosscope-bcodmo-datasets",\n  "title": "Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean Processes and Ecology",\n  "description": "BIOSSCOPE datasets from BCO-DMO",\n  "profile": "data-package",\n  "sources": [\n    {\n      "path": "http://lod.bco-dmo.org/id/dataset/853440",\n      "title": "ZooSCAN images of zooplankton collected during BATS MOCNESS tows during R/V Atlantic Explorer cruises AE1614, AE1712, AE1830, and AE1819 in the vicinity of the Bermuda Atlantic Time-series Study from 2016 to 2018",\n      "doi": "10.26008/1912/bco-dmo.853440.1"\n    },\n    {\n      "path": "http://lod.bco-dmo.org/id/dataset/854077",\n      "title": "ZooSCAN biovolume to biomass from imaged zooplankton collected during MOCNESS tows during various R/V Atlantic Explorer cruises and small boat deployments in the Sargasso Sea betwen 2016 to 2019",\n      "doi": "10.26008/1912/bco-dmo.854077.1"\n    },\n    {\n      "path": "http://lod.bco-dmo.org/id/dataset/857891",\n