In [1]:
import sys
!{sys.executable} -m pip install rdflib



## Modify context and remove unnecessary keys from the JSON-LD file

In [122]:
import urllib.request, json

In [123]:
def remove_key (key_to_remove, data):
  """Delete ``key_to_remove`` from ``data`` in place; do nothing when it is absent.

  Args:
    key_to_remove: key to look up in ``data``.
    data: mutable container (a dict in this notebook) to modify.

  Returns:
    None. ``data`` is modified in place.
  """
  if key_to_remove not in data:
    return
  del data[key_to_remove]

In [124]:
def replace_string_in_json(search_value, replace_value, data):
  """Replace ``search_value`` with ``replace_value`` in every string value of a JSON-like structure.

  The structure is walked recursively and modified in place. Both dictionary
  values and list items are rewritten, so strings stored directly inside a
  list (for example ``"@type": ["ns:A", "ns:B"]``) are updated as well.
  Dictionary keys are never changed.

  Args:
    search_value: substring to look for.
    replace_value: text that replaces each occurrence of ``search_value``.
    data: dict or list to update; any other type is ignored.

  Returns:
    None. ``data`` is modified in place.
  """
  if isinstance(data, dict):
    entries = data.items()
  elif isinstance(data, list):
    entries = enumerate(data)
  else:
    # Scalars (str, int, None, ...) cannot be updated in place; the parent container handles them
    return

  for slot, value in entries:
    if isinstance(value, str):
      if search_value in value:
        # Re-assigning an existing key/index does not resize the container, so it is safe while iterating
        data[slot] = value.replace(search_value, replace_value)
    elif isinstance(value, (dict, list)):
      replace_string_in_json(search_value, replace_value, value)

In [148]:
def process_context(types_data, bs_ns, bs_url) :
    """Rewrite the ``@context`` of a loaded JSON-LD document in place.

    The DDE-specific Bioschemas prefixes and the ``schema`` prefix are dropped,
    then the canonical ``bioschemas`` prefix, the ``bs_ns`` prefix (only when it
    is not already present) and an https ``schema`` prefix are added.

    Args:
        types_data: loaded JSON-LD document; must contain an ``@context`` mapping.
        bs_ns: prefix name to guarantee in the context.
        bs_url: URL bound to ``bs_ns`` when that prefix is missing.

    Returns:
        None. ``types_data["@context"]`` is modified in place.
    """
    context = types_data["@context"]

    #DDE bioschemas namespaces, plus schema because it should point to
    #https://schema.org rather than http://schema.org
    prefixes_to_drop = (
        "bioschemas",
        "bioschemasdrafts",
        "bioschemastypes",
        "bioschemastypesdrafts",
        "bioschemasdeprecated",
        "schema",
    )
    for prefix in prefixes_to_drop :
        remove_key(prefix, context)

    #canonical Bioschemas types namespace and corresponding url
    context["bioschemas"] = "https://bioschemas.org/terms/"
    #only bind bs_ns when it is not in the context yet
    context.setdefault(bs_ns, bs_url)

    #add back schema, now pointing to https://schema.org
    context["schema"] = "https://schema.org/"

In [141]:
def process_graph(types_data, bs_ns, dde_ns) :
    """Clean the ``@graph`` of a loaded JSON-LD document in place.

    Args:
        types_data: loaded JSON-LD document; must contain an ``@graph`` list.
        bs_ns: prefix name that replaces ``dde_ns``.
        dde_ns: DDE prefix name to replace.

    Returns:
        None. ``types_data["@graph"]`` is modified in place.
    """
    graph = types_data["@graph"]

    #additional_type is used on the website for navigation but is not needed in the JSON-LD
    for node in graph :
        for unwanted_key in ("additional_type", "schema:additionalType") :
            remove_key(unwanted_key, node)

    #use the new bioschemas namespace instead of the old DDE one, then map any
    #remaining bioschemastypes reference to bioschemas
    renames = ((dde_ns, bs_ns), ("bioschemastypes", "bioschemas"))
    for old_prefix, new_prefix in renames :
        replace_string_in_json(old_prefix, new_prefix, graph)

In [133]:
def process_json(types_data, bs_ns, bs_url, dde_ns) :
    """Clean a loaded JSON-LD document in place: first its ``@context``, then its ``@graph``.

    Args:
        types_data: loaded JSON-LD document with ``@context`` and ``@graph`` entries.
        bs_ns: Bioschemas prefix name to use.
        bs_url: URL bound to ``bs_ns`` in the context when missing.
        dde_ns: DDE prefix name replaced by ``bs_ns`` in the graph.
    """
    process_context(types_data=types_data, bs_ns=bs_ns, bs_url=bs_url)
    process_graph(types_data=types_data, bs_ns=bs_ns, dde_ns=dde_ns)

In [134]:
def process_all_json(origin, destination, bs_ns, bs_url, dde_ns) :
    """Download a JSON-LD document, clean it with ``process_json`` and save it locally.

    Args:
        origin: URL of the JSON document to download.
        destination: path of the file the processed JSON is written to.
        bs_ns: Bioschemas prefix name passed on to ``process_json``.
        bs_url: URL for ``bs_ns`` passed on to ``process_json``.
        dde_ns: DDE prefix name passed on to ``process_json``.
    """
    #fetch and decode the source document
    with urllib.request.urlopen(origin) as response :
        document = json.loads(response.read())

    process_json(document, bs_ns, bs_url, dde_ns)

    #save the modified json to the local file
    with open(destination, 'w') as out_file :
        json.dump(document, out_file)

### Process JSON-LD file for Bioschemas types

In [149]:
# Download the released Bioschemas types file from `origin`, clean it, and
# write the result to `destination`.
origin = "https://github.com/BioSchemas/bioschemas-dde/raw/main/bioschemastypes.json"
destination = "bioschemas_types_temp.json"
# Prefix (and its URL) that replaces the DDE prefix `dde_ns` in the document.
bs_ns = "bioschemas"
bs_url = "https://bioschemas.org/terms/"
dde_ns = "bioschemastypes"
# NOTE(review): dde_url and w3id_url are assigned but not passed to
# process_all_json, and no code visible in this notebook reads them.
dde_url = "https://discovery.biothings.io/view/bioschemastypes/"
w3id_url = "https://w3id.org/bioschemas/terms/"
process_all_json(origin, destination, bs_ns, bs_url, dde_ns)

### Process JSON-LD file for Bioschemas draft types

In [150]:
# Same pipeline for the draft types file. This cell rebinds the module-level
# names set by the released-types cell, so the values in effect depend on
# which of the two cells ran last.
origin = "https://github.com/BioSchemas/bioschemas-dde/raw/main/bioschemastypesdrafts.json"
destination = "bioschemas_draft_types_temp.json"
# Prefix (and its URL) that replaces the DDE prefix `dde_ns` in the document.
bs_ns = "bioschemas_draft"
bs_url = "https://bioschemas.org/draft_terms/"
dde_ns = "bioschemastypesdrafts"
# NOTE(review): dde_url and w3id_url are assigned but not passed to
# process_all_json, and no code visible in this notebook reads them.
dde_url = "https://discovery.biothings.io/view/bioschemastypesdrafts/"
w3id_url = "https://w3id.org/bioschemas/draft_terms/"
process_all_json(origin, destination, bs_ns, bs_url, dde_ns)

## Add equivalence links (owl:equivalentClass / owl:equivalentProperty) to DDE, w3id and schema.org

In [2]:
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDF, RDFS, OWL

In [18]:
def add_equivalence(g, eq_dde_type, eq_w3id_type, eq_dde_type_draft, eq_w3id_type_draft, bs_ns, bs_ns_draft) :
    """Add OWL equivalence triples for Bioschemas classes and properties to ``g``.

    Classes are the subjects of any triple whose object is ``rdfs:Class``;
    properties are the subjects that have a ``schema:domainIncludes`` triple.
    For each such subject the last ``/``-separated segment of its URI is
    appended to the equivalence prefixes:

    * subject contains ``bs_ns``: linked to ``eq_dde_type`` and ``eq_w3id_type``,
      and to ``https://schema.org/`` when the term is listed as already being
      part of schema.org;
    * subject contains ``bs_ns_draft``: linked to ``eq_dde_type_draft`` and
      ``eq_w3id_type_draft``.

    Classes are linked with ``owl:equivalentClass``, properties with
    ``owl:equivalentProperty``.

    Args:
        g: rdflib graph to read subjects from and add triples to (modified in place).
        eq_dde_type: URI prefix of the DDE terms (supports ``+ str``, e.g. a URIRef).
        eq_w3id_type: URI prefix of the w3id terms.
        eq_dde_type_draft: URI prefix of the DDE draft terms.
        eq_w3id_type_draft: URI prefix of the w3id draft terms.
        bs_ns: string identifying subjects in the Bioschemas terms namespace.
        bs_ns_draft: string identifying subjects in the Bioschemas draft terms namespace.

    Returns:
        None.
    """
    #Classes and properties already included in schema.org
    types_in_schema = frozenset(["BioChemEntity", "ChemicalSubstance", "Gene", "MolecularEntity", "Protein", "Taxon"])
    properties_in_schema = frozenset(["bioChemInteraction", "bioChemSimilarity", "biologicalRole", "hasBioChemEntityPart", "hasMolecularFunction", "hasRepresentation", "isEncodedByBioChemEntity", "isInvolvedInBiologicalProcess", "isLocatedInSubcellularLocation", "isPartOfBioChemEntity", "taxonomicRange", "alternativeOf", "encodesBioChemEntity", "expressedIn", "hasBioPolymerSequence", "chemicalComposition", "chemicalRole", "potentialUse", "inChi", "inChiKey", "iupacName", "molecularFormula", "molecularWeight", "monoisotopicMolecularWeight", "smiles", "childTaxon", "parentTaxon", "taxonRank"])
    eq_schema = URIRef("https://schema.org/")

    def link_equivalents(subjects, eq_predicate, names_in_schema) :
        #adds the equivalence triples for each subject found in the released and/or draft namespace
        for s in subjects :
            sufix = str(s).split('/')[-1]
            if bs_ns in s :
                g.add((s, eq_predicate, eq_dde_type + sufix))
                g.add((s, eq_predicate, eq_w3id_type + sufix))
                if sufix in names_in_schema :
                    g.add((s, eq_predicate, eq_schema + sufix))
            if bs_ns_draft in s :
                g.add((s, eq_predicate, eq_dde_type_draft + sufix))
                g.add((s, eq_predicate, eq_w3id_type_draft + sufix))

    #all classes: the subjects are collected into a list first so that no triple
    #is added to the graph while its own generator is still being consumed
    rdfs_class = URIRef("http://www.w3.org/2000/01/rdf-schema#Class")
    eq_class = URIRef("http://www.w3.org/2002/07/owl#equivalentClass")
    class_subjects = list(g.subjects(object=rdfs_class, unique=True))
    link_equivalents(class_subjects, eq_class, types_in_schema)

    #all properties, collected the same way
    schema_domain = URIRef("https://schema.org/domainIncludes")
    eq_prop = URIRef("http://www.w3.org/2002/07/owl#equivalentProperty")
    property_subjects = list(g.subjects(predicate=schema_domain, unique=True))
    link_equivalents(property_subjects, eq_prop, properties_in_schema)


In [14]:
def process_equivalences(g_file, json_file, ttl_file) :
    """Load a graph, add the Bioschemas equivalences and save it as JSON-LD and Turtle.

    Args:
        g_file: path of the graph file to load; no explicit format is passed to ``parse``.
        json_file: destination path for the JSON-LD serialization.
        ttl_file: destination path for the Turtle serialization.
    """
    #bioschemas namespace for types and draft types
    terms_ns = "https://bioschemas.org/terms/"
    draft_terms_ns = "https://bioschemas.org/draft_terms/"

    #equivalence targets for types
    dde_terms = URIRef("https://discovery.biothings.io/view/bioschemastypes/")
    w3id_terms = URIRef("https://w3id.org/bioschemas/terms/")

    #equivalence targets for draft types
    dde_draft_terms = URIRef("https://discovery.biothings.io/view/bioschemastypesdrafts/")
    w3id_draft_terms = URIRef("https://w3id.org/bioschemas/draft_terms/")

    #load the graph and enrich it with the equivalences
    graph = Graph()
    graph.parse(g_file)
    add_equivalence(
        graph,
        eq_dde_type=dde_terms,
        eq_w3id_type=w3id_terms,
        eq_dde_type_draft=dde_draft_terms,
        eq_w3id_type_draft=w3id_draft_terms,
        bs_ns=terms_ns,
        bs_ns_draft=draft_terms_ns,
    )

    #write both serializations: json-ld first, then turtle
    graph.serialize(destination=json_file, format="json-ld", auto_compact=True, indent=2)
    graph.serialize(destination=ttl_file, format="turtle")

In [21]:
# Add equivalences to the processed released-types file (same file name as the
# `destination` written by the released-types cell) and save JSON-LD and Turtle copies.
g_file = "bioschemas_types_temp.json"
json_file="bioschemas_types_temp.jsonld"
ttl_file="bioschemas_types_temp.ttl"
process_equivalences(g_file, json_file, ttl_file)

In [20]:
# Add equivalences to the processed draft-types file (same file name as the
# `destination` written by the draft-types cell) and save JSON-LD and Turtle copies.
g_file = "bioschemas_draft_types_temp.json"
json_file="bioschemas_draft_types_temp.jsonld"
ttl_file="bioschemas_draft_types_temp.ttl"
process_equivalences(g_file, json_file, ttl_file)