## Specification aggregator for bioschemas

The DDE intentionally does not allow existing namespaces to be used by others. This is a feature that ensures that user-generated schemas or user-customized schemas are not confused for existing, registered schemas. Unfortunately, this means that users looking to update an existing bioschemas schema cannot use the bioschemas namespace when creating a schema within the DDE. The aggregator includes a function that will replace the temporary namespace with the bioschemas namespace for the merge.

### What this script does

1. Loads the list of jsonschema specification files to ingest
2. Replaces the temporary namespace in the merged json document
3. Include a check for multiple same classes: 
  * A profile might reference another profile. In order for the profile to work, a dummy class of the referenced profile may need to be created. This dummy profile should be included in the merged file ONLY IF the actual class (which should have a validation section of its own) does not exist
  * Use the existence of a validation to determine which class to keep in the event of the same class coming from different profiles (real or dummy)
4. Includes a check for the `rdfs:subClassOf` value of each class and updates it to match the specification list
  * This is in anticipation of the use of the DDE to update an existing profile
5. Includes a check for multiple same properties:
  * A single property might be used across different bioschemas classes
  * This means that the `"schema:domainIncludes"` property should be updated to reflect ALL of the classes that use it, rather than just the first class that uses it.
  * eg- "bioschemas:output" might be a property in "ComputationalTool" and "ComputationalWorkflow". Since these profiles are developed separately, the one in "ComputationalTool" will include `"schema:domainIncludes": {"@id": "bioschemas:ComputationalTool"}` while the one from ComputationalWorkflow will include `"schema:domainIncludes": {"@id": "bioschemas:ComputationalWorkflow"}`. These will need to be merged into a single property with `"schema:domainIncludes": [{"@id": "bioschemas:ComputationalTool"},{"@id": "bioschemas:ComputationalWorkflow"}]`
6. Include a check for the proper url for bioschemas in @context
7. Automatically include `dct:conformsTo` in all classes defined by the schema
8. Automatically include `schema:schemaVersion` in all classes defined by the schema

Note - As of 2022.02.17 - the script will split out types and profiles which will be saved to different json files and ingested into the DDE under different namespaces.

  

### To do
1. Include a check for properties which already have a list for the domainIncludes
2. Ensure script works even with extra data in table
3. Add schema validation check before allowing it to commit




In [1]:
import json
import requests
import pandas as pd
from pandas import read_csv
import os
import pathlib
from datetime import datetime
from datetime import timedelta
from biothings_schema import Schema
from src.common import *

In [None]:
def check_for_updates(script_path,updateall=False):
    """Return the specification list files that need to be re-merged.

    Args:
        script_path: Directory that holds the five specification list files.
        updateall: When equal to ``True`` every list file is returned without
            looking at modification times.

    Returns:
        A list of file paths, or ``False`` when no file qualifies. Unless
        ``updateall`` is set, a file qualifies when it was modified less than
        three hours before this call started.
    """
    started = datetime.now()
    watched = [
        os.path.join(script_path, listname)
        for listname in (
            'profile_list.txt',
            'draft_profile_list.txt',
            'type_list.txt',
            'draft_type_list.txt',
            'deprecated.txt',
        )
    ]
    if updateall == True:
        recent = watched
    else:
        window = timedelta(hours=3)
        recent = [
            path
            for path in watched
            if started - datetime.fromtimestamp(os.path.getmtime(path)) < window
        ]
    # Callers compare the result against False, so an empty result is
    # reported as False rather than as an empty list.
    if len(recent) == 0:
        return False
    return recent

In [4]:
def _output_filename(listfile):
    """Map a specification list file to the merged JSON file name it feeds.

    Only the file name is inspected, so directory names that happen to
    contain "type", "profile" or "draft" cannot redirect the output.
    The checks are applied in the original order (deprecated, then type,
    then profile) with the last match winning.
    """
    listname = os.path.basename(listfile)
    is_draft = "draft" in listname
    target = None
    if "deprecated" in listname:
        target = 'bioschemasdeprecated.json'
    if "type" in listname:
        target = 'bioschemastypesdrafts.json' if is_draft else 'bioschemastypes.json'
    if "profile" in listname:
        target = 'bioschemasdrafts.json' if is_draft else 'bioschemas.json'
    if target is None:
        raise ValueError("Cannot determine the output file for specification list: " + str(listfile))
    return target


def run_update(script_path,updateall=False):
    """Rebuild the merged bioschemas JSON file for every updated list file.

    Args:
        script_path: Directory holding the specification list files; the
            merged JSON files are written to the same directory.
        updateall: When equal to ``True`` every list file is processed,
            otherwise only the ones reported by ``check_for_updates``.

    Raises:
        ValueError: If a list file name matches none of the known kinds
            (deprecated / type / profile). Previously such a file would have
            overwritten the output of the preceding iteration.
    """
    updatedlist = check_for_updates(script_path, updateall)
    if not updatedlist:
        return
    for eachfile in updatedlist:
        # Resolve the destination first so an unknown list file fails before
        # any specification is downloaded.
        bioschemasfile = os.path.join(script_path, _output_filename(eachfile))
        speclist = read_csv(eachfile,delimiter='\t',header=0)
        bioschemas_json = remove_NaN_fields(merge_specs(speclist))
        # Second pass works on the serialized text to strip NaN sameAs entries.
        cleanstring = remove_NaN_fields(json.dumps(bioschemas_json))
        cleandict = json.loads(cleanstring)
        prettystring = json.dumps(cleandict, indent=2)
        # Building the Schema object is done purely as a check before the
        # file is written; presumably it raises on an invalid schema
        # (the notebook output shows a SchemaValidationError) - confirm
        # against the biothings_schema documentation.
        Schema(cleandict, base_schema=["schema.org","bioschemastypes","bioschemas",
                                       "bioschemasdrafts","bioschemastypesdrafts",
                                       "bioschemasdeprecated"])
        with open(bioschemasfile,'w') as outfile:
            outfile.write(prettystring)
        

In [13]:
def get_raw_url(url):
    """Convert a GitHub file URL into its raw.githubusercontent.com form.

    Args:
        url: A GitHub URL such as
            ``https://github.com/<owner>/<repo>/blob/<branch>/<path>``.

    Returns:
        The raw-content URL. URLs that already point at raw content
        (``raw.githubusercontent.com`` host or a ``/raw/`` path segment) are
        returned unchanged.
    """
    if 'raw.githubusercontent.com' in url or '/raw/' in url:
        return url
    scheme, separator, remainder = url.partition('://')
    if not separator:
        # No scheme present: treat the whole string as host + path.
        scheme, remainder = '', url
    segments = remainder.split('/')
    # Expected layout: host / owner / repo / blob|tree / branch / path...
    if len(segments) >= 5 and segments[0] == 'github.com' and segments[3] in ('blob', 'tree'):
        segments[0] = 'raw.githubusercontent.com'
        # The raw host has no "blob"/"tree" segment; this works for any
        # branch name, not only master and main.
        del segments[3]
        return scheme + separator + '/'.join(segments)
    # Fallback for other layouts: only swap the host, as before.
    return url.replace('github.com','raw.githubusercontent.com')

In [None]:
def rename_namespace(spec_list,eachurl,rawtext):
    """Swap a specification's temporary namespace for its bioschemas one.

    The target namespace is derived from the ``type`` and ``version`` columns
    of the row in ``spec_list`` whose ``url`` equals ``eachurl``.

    Args:
        spec_list: DataFrame with ``url``, ``namespace``, ``version`` and
            ``type`` columns.
        eachurl: URL identifying the specification row.
        rawtext: Raw JSON text of the specification.

    Returns:
        A ``(cleantext, tmpnamespace)`` tuple: the text with every
        ``"@id": "<tmpnamespace>:`` prefix rewritten, and the namespace that
        was recorded for the specification. When the type/version pair is
        not recognised the text is returned unchanged (this previously
        raised UnboundLocalError).
    """
    row = spec_list.loc[spec_list['url']==eachurl].iloc[0]
    tmpnamespace = row['namespace']
    version = row['version']
    spec_kind = row['type']
    # Deprecated wins regardless of type; otherwise the type and the
    # release state pick the namespace.
    if 'DEPRECATED' in version:
        target = 'bioschemasdeprecated'
    elif spec_kind == 'Profile' and 'RELEASE' in version:
        target = 'bioschemas'
    elif spec_kind == 'Profile' and 'DRAFT' in version:
        target = 'bioschemasdrafts'
    elif spec_kind == 'Type' and 'RELEASE' in version:
        target = 'bioschemastypes'
    elif spec_kind == 'Type' and 'DRAFT' in version:
        target = 'bioschemastypesdrafts'
    else:
        target = None
    if target is None or tmpnamespace == target:
        return (rawtext, tmpnamespace)
    cleantext = rawtext.replace('"@id": "'+tmpnamespace+':', '"@id": "'+target+':')
    return (cleantext, tmpnamespace)


def generate_base_context():
    """Build the starting ``@context`` mapping for a merged schema file.

    Returns:
        A new dict mapping each namespace prefix to its base IRI: the
        standard vocabularies first, then the five bioschemas namespaces
        hosted on the Data Discovery Engine, then Dublin Core terms.
    """
    context = {
        "schema": "http://schema.org/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "owl": "http://www.w3.org/2002/07/owl/",
    }
    dde_namespaces = (
        "bioschemas",
        "bioschemasdrafts",
        "bioschemastypes",
        "bioschemastypesdrafts",
        "bioschemasdeprecated",
    )
    for namespace in dde_namespaces:
        context[namespace] = "https://discovery.biothings.io/view/" + namespace + "/"
    context["dct"] = "http://purl.org/dc/terms/"
    return context

def check_context_url(allcontext,spec_json,tmpnamespace):
    """Fold a specification's ``@context`` prefixes into the shared context.

    Args:
        allcontext: The accumulated context dict; it is updated in place.
        spec_json: Parsed specification containing an ``@context`` mapping.
        tmpnamespace: Temporary namespace prefix that must not be copied.

    Returns:
        ``allcontext`` itself, with any previously unknown prefixes added and
        ``@dateModified`` set to the time of this call.
    """
    stamp = datetime.now()
    for prefix, iri in spec_json['@context'].items():
        # Existing prefixes are never overwritten, and the temporary
        # namespace is dropped on purpose.
        if prefix == tmpnamespace or prefix in allcontext:
            continue
        allcontext[prefix] = iri
    allcontext["@dateModified"] = stamp.strftime("%m/%d/%Y, %H:%M:%S")
    return allcontext

def update_subclass(spec_list,eachurl,cleantext):
    """Force a class's ``rdfs:subClassOf`` to the value in the spec list.

    Args:
        spec_list: DataFrame with ``url``, ``subClassOf`` and ``name`` columns.
        eachurl: URL identifying the specification row to use.
        cleantext: JSON text of the specification.

    Returns:
        The parsed specification, where every ``@graph`` entry whose ``@id``
        is the class name under one of the five bioschemas namespaces has its
        ``rdfs:subClassOf`` replaced.
    """
    spec_json = json.loads(cleantext)
    row = spec_list.loc[spec_list['url']==eachurl].iloc[0]
    parent = {"@id": row['subClassOf']}
    classname = row['name']
    prefixes = (
        "bioschemas:",
        "bioschemasdrafts:",
        "bioschemastypes:",
        "bioschemastypesdrafts:",
        "bioschemasdeprecated:",
    )
    for node in spec_json['@graph']:
        node_id = node['@id']
        if any(node_id == prefix + classname for prefix in prefixes):
            node['rdfs:subClassOf'] = parent
    return spec_json

In [None]:
def deletenamespace(x):
    """Return the ``@id`` of a graph entry without its bioschemas namespace.

    Args:
        x: A graph entry (dict) with an ``@id`` such as ``bioschemas:Gene``.

    Returns:
        The identifier with the bioschemas namespace prefix removed. An
        identifier that uses none of the bioschemas namespaces is returned
        unchanged (this previously raised UnboundLocalError).
    """
    oldname = x['@id']
    # Longer namespaces are tested first because "bioschemas" is a
    # substring of all the others.
    namespaces = (
        "bioschemastypesdrafts",
        "bioschemastypes",
        "bioschemasdrafts",
        "bioschemasdeprecated",
        "bioschemas",
    )
    for namespace in namespaces:
        if namespace in oldname:
            return oldname.replace(namespace + ":", "")
    return oldname

In [None]:
def add_conformsTo(spec_list,x):
    """Add a required ``conformsTo`` property to a class's ``$validation``.

    Args:
        spec_list: DataFrame with ``name`` and ``url`` columns; the row whose
            ``name`` equals the class name (without namespace) provides the
            specification URL.
        x: Class entry (dict) that already has a ``$validation`` section with
            a ``properties`` mapping. It is modified in place.

    Returns:
        ``x`` with ``conformsTo`` added to the validation properties and to
        the ``required`` list, and ``conformsDefinition`` added to the
        validation ``definitions``.
    """
    cleanname = deletenamespace(x)
    spec_info = spec_list.loc[spec_list['name'] == cleanname]
    spec_url = spec_info.iloc[0]['url']
    conformsTodict = {
            "description": "This is used to state the Bioschemas profile that the markup relates to. The identifier can be the url for the version of this bioschemas class on github: "+spec_url,
            "$ref": "#/definitions/conformsDefinition"
          }
    conformdef={
                "@type": "CreativeWork",
                "type": "object",
                "properties": {
                  "identifier":{
                    "description": "The url of the version bioschemas profile that was used. For jsonschema, set @id to the identifier",
                    "oneOf": [
                      {
                        "enum": [spec_url]
                      },
                      {
                        "type": "string",
                        "format": "uri"
                      }
                    ]
                  }
                },
                "required": [
                  "identifier"
                ]
        }
    validation = x['$validation']
    validation['properties']['conformsTo'] = conformsTodict
    # "required" is optional in a JSON schema, so create it when missing and
    # avoid listing conformsTo twice.
    required = validation.setdefault('required', [])
    if 'conformsTo' not in required:
        required.append('conformsTo')
    # Same for "definitions": reuse the existing mapping or start a new one.
    validation.setdefault('definitions', {})["conformsDefinition"] = conformdef
    return x

In [None]:
def remove_conformsTo(x):
    """Strip any existing ``conformsTo`` rules from a class's ``$validation``.

    Args:
        x: Class entry (dict) with a ``$validation`` section containing a
            ``properties`` mapping. It is modified in place.

    Returns:
        ``x`` without the ``conformsTo`` property, without ``conformsTo`` in
        the ``required`` list, and without the ``conformsTo`` /
        ``conformsDefinition`` entries under ``definitions``.
    """
    validation = x['$validation']
    properties = validation['properties']
    if 'conformsTo' in properties:
        del properties['conformsTo']
        # "required" is optional; only filter it when it is present.
        if 'required' in validation:
            validation['required'] = [i for i in validation['required'] if i!='conformsTo']
    if 'definitions' in validation:
        definitions = validation['definitions']
        definitions.pop('conformsTo', None)
        definitions.pop('conformsDefinition', None)
    return x

In [None]:
def add_schemaVersion(spec_list,x):
    """Record the specification version URLs on a class entry.

    Args:
        spec_list: DataFrame with ``name``, ``url``, ``type`` and ``version``
            columns; the row whose ``name`` equals the class name (without
            namespace) is used.
        x: Class entry (dict); modified in place.

    Returns:
        ``x`` with ``schema:schemaVersion`` set to a duplicate-free list made
        of any versions already present, the bioschemas.org version URL and
        the specification URL, in that order.
    """
    cleanname = deletenamespace(x)
    spec_info = spec_list.loc[spec_list['name'] == cleanname]
    row = spec_info.iloc[0]
    spec_url = row['url']
    baseurl = "https://bioschemas.org"
    versionurl = baseurl+'/'+row['type'].lower()+'s/'+row['name']+'/'+row['version']
    # Keep versions that the specification already declares. The earlier
    # implementation tested an unassigned variable here and so always
    # dropped them.
    existingversions = x.get("schema:schemaVersion")
    if existingversions is None:
        schemaversions = []
    elif isinstance(existingversions, list):
        schemaversions = list(existingversions)
    else:
        # A single string, possibly a bracketed, comma-separated listing.
        pieces = str(existingversions).strip("[").strip("]").split(",")
        schemaversions = [piece.strip().strip('"\'') for piece in pieces]
        schemaversions = [piece for piece in schemaversions if piece]
    schemaversions.append(versionurl)
    schemaversions.append(spec_url)
    ## Ensure uniqueness of elements while keeping a stable order
    x["schema:schemaVersion"] = list(dict.fromkeys(schemaversions))
    return x

In [None]:
def add_specification_type(spec_list,x):
    """Tag a class entry with the bioschemas.org listing it belongs to.

    Args:
        spec_list: DataFrame with ``name``, ``type`` and ``version`` columns;
            the row whose ``name`` equals the class name (without namespace)
            is used.
        x: Class entry (dict); modified in place.

    Returns:
        ``x`` with ``additional_type`` set to the types/profiles page anchor
        for its status (deprecated, release or draft).

    Raises:
        ValueError: If the row's type is neither ``Type`` nor ``Profile`` or
            its version names no known status.
    """
    cleanname = deletenamespace(x)
    spec_info = spec_list.loc[spec_list['name'] == cleanname]
    row = spec_info.iloc[0]
    # Use the matched row: reading the type from the first row of the whole
    # table mislabels entries when a list mixes Types and Profiles.
    spec_kind = row['type']
    if spec_kind == 'Type':
        baseurl = 'https://bioschemas.org/types#nav-'
    elif spec_kind == 'Profile':
        baseurl = 'https://bioschemas.org/profiles#nav-'
    else:
        raise ValueError("Unknown specification type for " + str(cleanname) + ": " + str(spec_kind))
    version = row['version'].lower()
    for status in ('deprecated', 'release', 'draft'):
        if status in version:
            x['additional_type'] = baseurl + status
            return x
    raise ValueError("Unknown specification status for " + str(cleanname) + ": " + str(row['version']))

In [10]:
def merge_specs(spec_list):
    """Download every listed specification and merge them into one schema.

    Args:
        spec_list: DataFrame with a ``url`` column plus the columns needed by
            the helper functions called here.

    Returns:
        A dict with the merged ``@context`` and an ``@graph`` holding the
        cleaned classes, the cleaned properties and the ``dct:conformsTo``
        property definition, in that order. Specifications that do not
        download with status 200 are reported with ``print`` and skipped.
    """
    allcontext = generate_base_context()
    graphlist, classlist, propertylist = [], [], []
    for eachurl in spec_list['url']:
        rawurl = get_raw_url(eachurl)
        response = requests.get(rawurl)
        if response.status_code != 200:
            print(response.status_code, rawurl)
            continue
        cleantext, tmpnamespace = rename_namespace(spec_list, eachurl, response.text)
        spec_json = json.loads(cleantext)
        allcontext = check_context_url(allcontext, spec_json, tmpnamespace)
        for node in spec_json['@graph']:
            graphlist.append(node)
            # Remember which ids are classes and which are properties.
            if node["@type"] == "rdfs:Class":
                classlist.append(node["@id"])
            if node["@type"] == "rdf:Property":
                propertylist.append(node["@id"])
    cleangraph = list(clean_duplicate_classes(spec_list, graphlist, classlist))
    cleangraph.extend(clean_duplicate_properties(graphlist, propertylist))
    cleangraph.append(define_conformsTo(classlist))
    return {'@context': allcontext, '@graph': cleangraph}

In [None]:
def clean_duplicate_classes(spec_list,graphlist,classlist):
    """Return the class entries of the graph with duplicates resolved.

    When several entries share an ``@id`` (for example a real class and a
    dummy class created so another profile can reference it), only one is
    kept: the first entry that has a ``$validation`` section, or the first
    entry when none has one. Previously every duplicate was kept.

    Args:
        spec_list: DataFrame describing the specifications; passed on to the
            helper functions that annotate each class.
        graphlist: All graph entries collected from the specifications.
        classlist: The ``@id`` of every class found, duplicates included.

    Returns:
        A list of class entries, in graph order, each annotated with its
        specification type, schema versions and (when it has a
        ``$validation`` section) a fresh ``conformsTo`` rule.
    """
    classids = set(classlist)
    # First pass: decide which position wins for each class id.
    chosen = {}
    for position, node in enumerate(graphlist):
        node_id = node["@id"]
        if node_id not in classids:
            continue
        if node_id not in chosen:
            chosen[node_id] = position
        elif "$validation" in node and "$validation" not in graphlist[chosen[node_id]]:
            chosen[node_id] = position
    keep = set(chosen.values())
    # Second pass: annotate only the entries that were kept.
    cleanclassgraph = []
    for position, node in enumerate(graphlist):
        if position not in keep:
            continue
        node = add_specification_type(spec_list,node)
        node = add_schemaVersion(spec_list,node)
        if "$validation" in node:
            node = remove_conformsTo(node)
            node = add_conformsTo(spec_list,node)
        cleanclassgraph.append(node)
    return cleanclassgraph

def _merge_domain_includes(nodes):
    """Collect the distinct ``schema:domainIncludes`` values of the nodes."""
    merged = []
    for node in nodes:
        domains = node.get("schema:domainIncludes")
        if domains is None:
            continue
        # A property may already carry a list of domains; flatten it.
        if not isinstance(domains, list):
            domains = [domains]
        for domain in domains:
            if domain not in merged:
                merged.append(domain)
    return merged


def clean_duplicate_properties(graphlist, propertylist):
    """Return the property entries of the graph with duplicates merged.

    A property defined by several specifications is merged into one entry:
    the definition with the most non-null fields is used as the base and its
    ``schema:domainIncludes`` becomes the list of every distinct domain
    found across the duplicates.

    Args:
        graphlist: All graph entries collected from the specifications.
        propertylist: The ``@id`` of every property found, duplicates
            included. It is not modified.

    Returns:
        A list with the properties that occur once (in graph order) followed
        by one merged entry per duplicated property. ``conformsTo`` and
        ``dct:conformsTo`` are always left out.
    """
    excluded = ('conformsTo', 'dct:conformsTo')
    seen = set()
    duplicates = set()
    for property_id in propertylist:
        if property_id in excluded:
            continue
        if property_id in seen:
            duplicates.add(property_id)
        seen.add(property_id)
    cleanpropsgraph = []
    grouped = {}
    for node in graphlist:
        node_id = node["@id"]
        if node_id in duplicates:
            grouped.setdefault(node_id, []).append(remove_NaN_fields(node))
        elif node_id in seen:
            cleanpropsgraph.append(remove_NaN_fields(node))
    for candidates in grouped.values():
        # The definition with the most filled-in fields serves as the base.
        base = max(candidates, key=lambda node: sum(value is not None for value in node.values()))
        merged = dict(base)
        merged["schema:domainIncludes"] = _merge_domain_includes(candidates)
        cleanpropsgraph.append(merged)
    return cleanpropsgraph

In [None]:
def define_conformsTo(classlist):
    """Build the ``dct:conformsTo`` property definition for the merged graph.

    Args:
        classlist: The ``@id`` of every class in the merged schema; repeated
            ids are allowed.

    Returns:
        The property definition dict, whose ``schema:domainIncludes`` lists
        each class id once, in first-seen order.
    """
    # dict.fromkeys drops repeats while keeping the original order.
    classidlist = [{"@id":x} for x in dict.fromkeys(classlist)]
    conformsTo = {
      "@id": "dct:conformsTo",
      "@type": "rdf:Property",
      "rdfs:comment": "Used to state the Bioschemas profile that the markup relates to. The versioned URL of the profile must be used. Note that we use a CURIE in the table here but the full URL for Dublin Core terms must be used in the markup (http://purl.org/dc/terms/conformsTo), see example.",
      "rdfs:label": "conformsTo",
      "schema:domainIncludes": classidlist,
      "schema:rangeIncludes": [
        {"@id": "schema:CreativeWork"},{"@id": "schema:Text"},{"@id": "schema:Thing"}
      ]
    }
    return conformsTo

In [None]:
def remove_NaN_fields(propdef):
    """Drop empty ``schema:sameAs`` entries from a definition.

    Args:
        propdef: Either a dict (one definition) or a str (serialized JSON).

    Returns:
        For a dict, a copy without ``schema:sameAs`` when that value is
        ``None``. For a str, the text with ``"schema:sameAs": NaN`` members
        removed. For any other input, an empty dict.
    """
    if isinstance(propdef, dict):
        return {
            key: value
            for key, value in propdef.items()
            if key != "schema:sameAs" or value is not None
        }
    if isinstance(propdef, str):
        # Cover the member both after and before a neighbouring member.
        without_trailing = propdef.replace(', "schema:sameAs": NaN','')
        return without_trailing.replace('"schema:sameAs": NaN, ','')
    return {}

## Test script

In [2]:
import json
import requests
import pandas as pd
from pandas import read_csv
import os
import pathlib
from datetime import datetime
from datetime import timedelta
from biothings_schema import Schema
from src.common import *

#### Main
# Run the aggregator for every specification list file (second argument True
# forces an update regardless of file modification times).
# script_path is left empty so the list files and the generated JSON files
# are resolved relative to the current working directory.
script_path = '' #pathlib.Path(__file__).parent.absolute()
run_update(script_path,True)

True ['type_list.txt', 'draft_type_list.txt', 'profile_list.txt', 'draft_profile_list.txt', 'deprecated.txt']




SchemaValidationError: field "codeRepository" in "$validation" is not defined in this class or any of its parent classes

In [None]:
## Validate a generated json
# Loads one specification file from the "drafts" folder under script_path
# (script_path must already be defined by an earlier cell) and builds a
# Schema object from it so that its validation can be inspected.
inputfile = os.path.join(script_path,'drafts','gbhBioSample.json')
with open(inputfile,'rb') as infile:
    definedterm = json.load(infile)

#print(definedterm)
sc = Schema(definedterm)
sc.validation

In [None]:
#### For additional Testing if "run_update" fails
# Inline copy of clean_duplicate_classes / add_specification_type for
# step-by-step debugging. It expects `speclist`, `classlist` and `graphlist`
# to exist already in the notebook namespace.

spec_list = speclist
duplicates = [i for i in set(classlist) if classlist.count(i) > 1]
nondupes = [x for x in classlist if x not in duplicates]
cleanclassgraph = []
if len(duplicates)>0:  ## There are duplicate classes to clean up
    for x in graphlist:
        if x["@id"] in nondupes:
            cleanname = deletenamespace(x)
            spec_info = spec_list.loc[spec_list['name'] == cleanname]
            if spec_list.iloc[0]['type']=='Type':
                baseurl = 'https://bioschemas.org/types#nav-'
            elif spec_list.iloc[0]['type']=='Profile':
                baseurl = 'https://bioschemas.org/profiles#nav-'
            if 'deprecated' in spec_info.iloc[0]['version'].lower():
                typeurl = baseurl+'deprecated'
            elif 'release' in spec_info.iloc[0]['version'].lower():
                typeurl = baseurl+'release'
            elif 'draft' in spec_info.iloc[0]['version'].lower():
                typeurl = baseurl+'draft'
            x['additional_type'] = typeurl
            #x = add_schemaVersion(spec_list,x)
            if "$validation" in x.keys():
                # spec_list is the table defined above; the former name
                # spec_type is not defined anywhere in this cell.
                x = add_conformsTo(spec_list,x)
            cleanclassgraph.append(x)
        for eachclass in duplicates:
            if x["@id"]==eachclass:
                cleanname = deletenamespace(x)
                spec_info = spec_list.loc[spec_list['name'] == cleanname]
                if spec_list.iloc[0]['type']=='Type':
                    baseurl = 'https://bioschemas.org/types#nav-'
                elif spec_list.iloc[0]['type']=='Profile':
                    baseurl = 'https://bioschemas.org/profiles#nav-'
                if 'deprecated' in spec_info.iloc[0]['version'].lower():
                    typeurl = baseurl+'deprecated'
                elif 'release' in spec_info.iloc[0]['version'].lower():
                    typeurl = baseurl+'release'
                elif 'draft' in spec_info.iloc[0]['version'].lower():
                    typeurl = baseurl+'draft'
                x['additional_type'] = typeurl
                #x = add_schemaVersion(spec_list,x)
                if "$validation" in x.keys():
                    x = add_conformsTo(spec_list,x)
                cleanclassgraph.append(x)

else:  ## There are not duplicate classes to clean up
    for x in graphlist:
        if x["@id"] in nondupes:
            cleanname = deletenamespace(x)
            print(cleanname)
            spec_info = spec_list.loc[spec_list['name'] == cleanname]
            print(spec_info)
            if spec_list.iloc[0]['type']=='Type':
                baseurl = 'https://bioschemas.org/types#nav-'
            elif spec_list.iloc[0]['type']=='Profile':
                baseurl = 'https://bioschemas.org/profiles#nav-'
            if 'deprecated' in spec_info.iloc[0]['version'].lower():
                typeurl = baseurl+'deprecated'
            elif 'release' in spec_info.iloc[0]['version'].lower():
                typeurl = baseurl+'release'
            elif 'draft' in spec_info.iloc[0]['version'].lower():
                typeurl = baseurl+'draft'
            x['additional_type'] = typeurl
            #x = add_schemaVersion(spec_list,x)
            if "$validation" in x.keys():
                x = add_conformsTo(spec_list,x)
            cleanclassgraph.append(x)
            


In [12]:
#### For additional Testing if "run_update" fails
# Verbose, top-level walk-through of the update procedure: every step prints
# what it is working on so a failing list file can be spotted.

updateall=True
force_all = (updateall == True)
updatedlist = check_for_updates(script_path,force_all)
if not force_all:
    print(updatedlist)
if updatedlist == False:
    print("no updates pushed")
else:
    for eachfile in updatedlist:
        print("")
        print(eachfile)
        speclist = read_csv(eachfile,delimiter='\t',header=0)
        print(speclist)
        tmpinfo = merge_specs(speclist)
        bioschemas_json = remove_NaN_fields(tmpinfo)
        jsonstring = json.dumps(bioschemas_json)
        cleanstring = remove_NaN_fields(jsonstring)
        cleandict = json.loads(cleanstring)
        prettystring = json.dumps(cleandict, indent=2)
        #### The specification list file name decides where the result is saved
        is_draft = "draft" in eachfile
        if "deprecated" in eachfile:
            #### deprecated specifications
            print("deprecated: ", eachfile)
            bioschemasfile = os.path.join(script_path,'bioschemasdeprecated.json')
        elif "type" in eachfile:
            if is_draft:
                #### draft types
                print("type, draft: ", eachfile)
                bioschemasfile = os.path.join(script_path,'bioschemastypesdrafts.json')
            else:
                #### released types
                print("type, release: ", eachfile)
                bioschemasfile = os.path.join(script_path,'bioschemastypes.json')
        elif "profile" in eachfile:
            if is_draft:
                #### draft profiles
                print("profile, draft: ", eachfile)
                bioschemasfile = os.path.join(script_path,'bioschemasdrafts.json')
            else:
                #### released profiles
                print("profile, release: ", eachfile)
                bioschemasfile = os.path.join(script_path,'bioschemas.json')
        with open(bioschemasfile,'w') as outfile:
            outfile.write(prettystring)


profile_list.txt
     namespace                   name                             subClassOf  \
0   bioschemas      ChemicalSubstance               schema:ChemicalSubstance   
1   bioschemas                   Gene                            schema:Gene   
2   bioschemas                  Taxon                           schema:Taxon   
3   bioschemas        MolecularEntity                 schema:MolecularEntity   
4   bioschemas                Protein                         schema:Protein   
5   bioschemas        FormalParameter        bioschemastypes:FormalParameter   
6   bioschemas      ComputationalTool             schema:SoftwareApplication   
7   bioschemas  ComputationalWorkflow  bioschemastypes:ComputationalWorkflow   
8   bioschemas            DataCatalog                     schema:DataCatalog   
9   bioschemas                Dataset                         schema:Dataset   
10  bioschemas                 Sample                 bioschemastypes:Sample   

       type       ver

404 https://raw.githubusercontent.com/BioSchemas/specifications/master/ComputationalWorkflow/jsonld/ComputationalWorkflow_v1.0-RELEASE-Type.json
404 https://raw.githubusercontent.com/BioSchemas/specifications/master/FormalParameter/jsonld/FormalParameter_v1.0-RELEASE-type.json
404 https://raw.githubusercontent.com/BioSchemas/specifications/master/Taxon/jsonld/Taxon_v0.3-RELEASE-2019_11_18.json
404 https://raw.githubusercontent.com/BioSchemas/specifications/tree/master/BioChemEntity/jsonld/type/BioChemEntity_v0.7-RELEASE.json
404 https://raw.githubusercontent.com/BioSchemas/specifications/tree/master/BioSample/jsonld/type/BioSample_v0.1-RELEASE.json
404 https://raw.githubusercontent.com/BioSchemas/specifications/tree/master/ChemicalSubstance/jsonld/type/ChemicalSubstance_v0.3-RELEASE-2019_09_02.json
404 https://raw.githubusercontent.com/BioSchemas/specifications/tree/master/Gene/jsonld/type/Gene_v0.3-RELEASE-2019_09_02.json
404 https://raw.githubusercontent.com/BioSchemas/specifications

## Test a schema's compatibility with the DDE

To do this, you will need to install the biothings schema tools: pip install git+https://github.com/biothings/biothings_schema.py#egg=biothings_schema


In [6]:
from biothings_schema import Schema

# Load the merged bioschemas.json from script_path (empty string: the
# current working directory) and build a Schema object from it.
script_path = ''
#url = "https://raw.githubusercontent.com/gtsueng/DDE_bioschemas/main/draft_validations/ProteinStructure_v0.5-DRAFT-2018_08_15.json"
#url = "https://raw.githubusercontent.com/BioSchemas/bioschemas-dde/main/bioschemas.json"
bioschemasfile = os.path.join(script_path,'bioschemas.json')
with open(bioschemasfile,'r') as infile:
    # Despite its name, `url` holds the parsed JSON document, not a URL.
    url = json.load(infile)

sc = Schema(url)
# Last expression of the cell: the notebook displays the validation mapping.
sc.validation


{'https://discovery.biothings.io/view/bioschemas/ChemicalSubstance': {'$schema': 'http://json-schema.org/draft-07/schema#',
  'type': 'object',
  'properties': {'potentialUse': {'description': 'Intended use of the BioChemEntity by humans.',
    'oneOf': [{'@type': 'DefinedTerm',
      'type': 'object',
      'properties': {'url': {'type': 'string', 'format': 'uri'},
       'name': {'type': 'string'},
       'termCode': {'type': 'string'},
       'identifier': {'type': 'string'}},
      'required': []},
     {'type': 'array',
      'items': {'@type': 'DefinedTerm',
       'type': 'object',
       'properties': {'url': {'type': 'string', 'format': 'uri'},
        'name': {'type': 'string'},
        'termCode': {'type': 'string'},
        'identifier': {'type': 'string'}},
       'required': []}}],
    'owl:cardinality': 'many'},
   'chemicalComposition': {'description': 'The chemical composition describes the identity and relative ratio of the chemical elements that make up the substance

In [11]:
from biothings_schema import Schema
import os
import json

# Validate a single specification file taken from a "specifications"
# folder located next to the current working directory.
script_path = os.getcwd()
parent_path = os.path.dirname(script_path)
repo_path = os.path.join(parent_path,'specifications')
spec_name = 'LabProtocol'
file_name = 'LabProtocol_v0.8-DRAFT.json'
# Resulting path: <parent>/specifications/<spec_name>/jsonld/<file_name>
file_path = os.path.join(repo_path,spec_name,'jsonld',file_name)
with open(file_path,'r') as infile:
    # Despite its name, `url` holds the parsed JSON document, not a URL.
    url = json.load(infile)

#url = "https://raw.githubusercontent.com/gtsueng/DDE_bioschemas/main/draft_validations/ProteinStructure_v0.5-DRAFT-2018_08_15.json"
#url = "https://raw.githubusercontent.com/BioSchemas/bioschemas-dde/main/bioschemas.json"


# The bioschemas namespaces are passed as base schemas in addition to
# schema.org.
sc = Schema(url, base_schema=["schema.org","bioschemastypes","bioschemas",
                              "bioschemasdrafts","bioschemastypesdrafts",
                              "bioschemasdeprecated"])
# Last expression of the cell: the notebook displays the validation mapping.
sc.validation

## Deprecated functions
(Do not use)

In [None]:
#### Note, while this version of the update script looks cleaner, it fails to perform as expected

def run_update(script_path,updateall=False):
    """Deprecated variant of ``run_update`` without the schema check.

    Merges the specifications of every updated list file and writes the
    result to the JSON file that corresponds to the list file's name.

    Args:
        script_path: Directory holding the list files and receiving the
            merged JSON files.
        updateall: When equal to ``True`` every list file is processed.
    """
    pending = check_for_updates(script_path, updateall == True)
    if pending == False:
        return
    for eachfile in pending:
        speclist = read_csv(eachfile,delimiter='\t',header=0)
        merged = remove_NaN_fields(merge_specs(speclist))
        cleanstring = remove_NaN_fields(json.dumps(merged))
        prettystring = json.dumps(json.loads(cleanstring), indent=2)
        #### The specification list file name decides where the result is saved
        is_draft = "draft" in eachfile
        if "deprecated" in eachfile:
            #### deprecated specifications
            bioschemasfile = os.path.join(script_path,'bioschemasdeprecated.json')
        elif "type" in eachfile:
            #### types, draft or released
            if is_draft:
                bioschemasfile = os.path.join(script_path,'bioschemastypesdrafts.json')
            else:
                bioschemasfile = os.path.join(script_path,'bioschemastypes.json')
        elif "profile" in eachfile:
            #### profiles, draft or released
            if is_draft:
                bioschemasfile = os.path.join(script_path,'bioschemasdrafts.json')
            else:
                bioschemasfile = os.path.join(script_path,'bioschemas.json')
        with open(bioschemasfile,'w') as outfile:
            outfile.write(prettystring)


In [None]:
#### No longer in use

def sort_speclist(spec_list):
    """Split the specification table by type and release state.

    Args:
        spec_list: DataFrame with ``type`` and ``version`` columns.

    Returns:
        A ``(spec_type, spec_profs, draft_type, draft_profs)`` tuple of
        DataFrame copies: released types, released profiles, draft types and
        draft profiles.
    """
    # Element-wise masks must be combined with "&"; Python's "and" tries to
    # reduce a whole Series to one boolean and raises ValueError.
    is_type = spec_list['type']=='Type'
    is_profile = spec_list['type']=='Profile'
    # na=False treats rows without a version as matching neither state.
    is_release = spec_list['version'].str.contains('RELEASE', na=False)
    is_draft = spec_list['version'].str.contains('DRAFT', na=False)
    spec_type = spec_list.loc[is_type & is_release].copy()
    spec_profs = spec_list.loc[is_profile & is_release].copy()
    draft_type = spec_list.loc[is_type & is_draft].copy()
    draft_profs = spec_list.loc[is_profile & is_draft].copy()
    return(spec_type,spec_profs,draft_type,draft_profs)

In [None]:
#### Old method, do not use

def update_specs(script_path):
    """Old entry point: merge released profiles and released types.

    Reads ``specifications_list.txt`` from the current working directory and
    writes ``bioschemas.json`` (released profiles) and
    ``bioschemastypes.json`` (released types) into ``script_path``.

    Args:
        script_path: Directory that receives the two merged JSON files.
    """
    spec_list = read_csv('specifications_list.txt',delimiter='\t',header=0)
    # sort_speclist returns four tables; the two draft tables are not used
    # by this old method.
    spec_type, spec_profs, _draft_type, _draft_profs = sort_speclist(spec_list)
    outputs = (
        (spec_profs, 'bioschemas.json'),
        (spec_type, 'bioschemastypes.json'),
    )
    for specs, filename in outputs:
        merged = remove_NaN_fields(merge_specs(specs))
        # Second pass works on the serialized text.
        cleanstring = remove_NaN_fields(json.dumps(merged))
        with open(os.path.join(script_path,filename),'w') as outfile:
            outfile.write(cleanstring)
        
## main
# Entry point of the old method (see update_specs above). An empty
# script_path makes the output files resolve relative to the current
# working directory.
script_path = ""
#script_path = pathlib.Path(__file__).parent.absolute()
update_specs(script_path)