# Merging all profiles in a single file

In [38]:
from rdflib import Dataset, Graph, URIRef
from rdflib.namespace import RDF

from jinja2 import Template

import os
from pathlib import Path
import requests
import yaml
from tqdm.notebook import tqdm
import json

In [39]:
class Profile:
    """A Bioschemas profile together with the SHACL shape rendered from it.

    The shape is rendered once, at construction time, and kept in
    ``shacl_shape``; ``gen_SHACL_from_profile`` can be called again to
    re-render it from the current attribute values.

    Attributes:
        shape_name: Name of the profile.
        target_classes: Prefixed class names (e.g. ``sc:Dataset``) the shape
            targets.
        min_props / rec_props / opt_props: Property entries exposing ``id``
            (a prefixed name) and ``desc`` (a description already escaped for
            a double-quoted Turtle literal).
        ref_profile: IRI used as the subject of the generated node shape.
        description: Text emitted as the shape's ``sh:description``.
        latest / deprecated: Flags stored as given; the rendering does not
            read them.
        nb_min / nb_rec / nb_opt: Number of entries in the matching property
            list at construction time.
    """

    def __init__(
        self,
        shape_name,
        target_classes,
        min_props,
        rec_props,
        opt_props,
        ref_profile,
        description,
        latest=True,
        deprecated=False,
    ):
        # Everything the template reads must be in place before rendering.
        self.shape_name = shape_name
        self.description = description
        self.ref_profile = ref_profile
        self.target_classes = target_classes
        self.min_props = min_props
        self.rec_props = rec_props
        self.opt_props = opt_props

        self.latest = latest
        self.deprecated = deprecated

        self.nb_min = len(min_props)
        self.nb_rec = len(rec_props)
        self.nb_opt = len(opt_props)

        self.shacl_shape = self.gen_SHACL_from_profile()

    # --- plain accessors -------------------------------------------------

    def get_name(self):
        """Return the profile name."""
        return self.shape_name

    def get_target(self):
        """Return the targeted classes."""
        return self.target_classes

    def get_required(self):
        """Return the required (minimum) properties."""
        return self.min_props

    def get_recommended(self):
        """Return the recommended properties."""
        return self.rec_props

    def get_optional(self):
        """Return the optional properties."""
        return self.opt_props

    def get_ref_profile(self):
        """Return the IRI identifying the profile."""
        return self.ref_profile

    def get_shacl_shape(self):
        """Return the SHACL shape rendered at construction time."""
        return self.shacl_shape

    def get_is_deprecated(self):
        """Return the ``deprecated`` flag."""
        return self.deprecated

    def get_latest_profile(self):
        """Return the ``latest`` flag."""
        return self.latest

    # --- SHACL rendering -------------------------------------------------

    def gen_SHACL_from_profile(self):
        """Render the profile as a SHACL node shape in Turtle and return it.

        Required properties become ``sh:minCount 1`` constraints with
        severity ``sh:Violation``, recommended ones the same with
        ``sh:Warning``, and optional ones only carry a path and description.
        Every ``sc:`` term is paired with its ``scs:`` twin so that both the
        http and https schema.org namespaces are matched.
        """
        shape_template = """
            @prefix fc: <https://fair-checker.france-bioinformatique.fr#> .
            @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
            @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
            @prefix sc: <http://schema.org/> .
            @prefix scs: <https://schema.org/> .
            @prefix bsc: <https://discovery.biothings.io/view/bioschemas/> .
            @prefix dct: <http://purl.org/dc/terms/> .
            @prefix sh: <http://www.w3.org/ns/shacl#> .
            @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
            @prefix edam: <http://edamontology.org/> .
            @prefix biotools: <https://bio.tools/ontology/> .
            @prefix bioschemasdrafts: <https://discovery.biothings.io/view/bioschemasdrafts/> .
            @prefix bioschemastypes: <https://discovery.biothings.io/view/bioschemastypes/> .
            @prefix bioschemastypesdrafts: <https://discovery.biothings.io/view/bioschemastypesdrafts/> .
            @prefix bh2022GH: <https://discovery.biothings.io/view/bh2022GH/> .
            @prefix dwc: <http://rs.tdwg.org/dwc/terms/> .
            @prefix ppeo: <http://purl.org/ppeo/PPEO.owl#> .

            <{{ref_profile}}>
                a sh:NodeShape ;
                sh:description "{{description}}" ;

                {% for c in target_classes %}
                sh:targetClass {{c}}, {{c.replace("sc:", "scs:")}} ;
                {% endfor %}

                {% for min_prop in min_props %}
                sh:property [
                    {% if min_prop.id.startswith("sc:") %}
                    sh:path [sh:alternativePath({{min_prop.id}} {{min_prop.id.replace("sc:", "scs:")}})] ;
                    {% else %}
                    sh:path {{min_prop.id}} ;
                    {% endif %}
                    sh:minCount 1 ;
                    sh:severity sh:Violation ; 
                    sh:description "{{min_prop.desc}}" 
                ] ;
                {% endfor %}

                {% for rec_prop in rec_props %}
                sh:property [
                    {% if rec_prop.id.startswith("sc:") %}
                    sh:path [sh:alternativePath({{rec_prop.id}} {{rec_prop.id.replace("sc:", "scs:")}})] ;
                    {% else %}
                    sh:path {{rec_prop.id}} ;
                    {% endif %}
                    sh:minCount 1 ;
                    sh:severity sh:Warning ;
                    sh:description "{{rec_prop.desc}}" 
                ] ;
                {% endfor %}
                
                {% for opt_prop in opt_props %}
                sh:property [
                    {% if opt_prop.id.startswith("sc:") %}
                    sh:path [sh:alternativePath({{opt_prop.id}} {{opt_prop.id.replace("sc:", "scs:")}})] ;
                    {% else %}
                    sh:path {{opt_prop.id}} ;
                    {% endif %}
                    sh:description "{{opt_prop.desc}}" 
                ] ;
                {% endfor %}
.
        """

        # shape_name is handed to the renderer although the template above
        # has no placeholder for it.
        return Template(shape_template).render(
            shape_name=self.shape_name,
            target_classes=self.target_classes,
            min_props=self.min_props,
            rec_props=self.rec_props,
            opt_props=self.opt_props,
            ref_profile=self.ref_profile,
            description=self.description,
        )

In [40]:
def request_profile_versions(timeout=30):
    """Download and parse the Bioschemas ``profile_versions.yaml`` file.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The parsed YAML document, as produced by ``yaml.safe_load``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status. Raising here avoids
            parsing an HTTP error page as if it were the YAML file.
    """
    response = requests.get(
        "https://raw.githubusercontent.com/BioSchemas/bioschemas.github.io/master/_data/profile_versions.yaml",
        timeout=timeout,
    )
    response.raise_for_status()
    return yaml.safe_load(response.text)


def _clean_description(text):
    """Flatten *text* to one line and make it fit a double-quoted Turtle literal.

    Whitespace runs (including newlines and tabs) collapse to single spaces,
    double quotes are backslash-escaped, and the literal sequences ``\\cr``
    and `` \\ `` (space, backslash, space) are removed.
    """
    text = " ".join(text.split())
    text = text.replace('"', '\\"')
    # NOTE(review): other stray backslashes are left untouched and could
    # still yield an invalid Turtle escape - confirm against the real data.
    text = text.replace("\\cr", "")
    text = text.replace(" \\ ", "")
    return text


def _index_custom_properties(graph):
    """Map the label of each ``rdf:Property`` node in *graph* to its prefixed ids.

    The ``bioschemas:`` prefix is rewritten to ``bsc:``. Nodes whose label or
    ``@id`` is not a string are ignored. Several nodes may share one label,
    hence the list values (kept in graph order).
    """
    index = {}
    for node in graph:
        if node.get("@type") != "rdf:Property":
            continue
        label = node.get("rdfs:label")
        node_id = node.get("@id")
        if isinstance(label, str) and isinstance(node_id, str):
            index.setdefault(label, []).append(
                node_id.replace("bioschemas:", "bsc:")
            )
    return index


def _print_if_unreachable(url, timeout):
    """Print *url* when a HEAD request to it fails or does not answer 200."""
    try:
        status_code = requests.head(url, timeout=timeout).status_code
    except requests.exceptions.RequestException:
        status_code = None
    if status_code != 200:
        print(url)


def profile_file_parser(url_profile, timeout=30):
    """Parse a Bioschemas DDE JSON-LD file into a dictionary of profiles.

    Every ``rdfs:Class`` node of the file's ``@graph`` becomes one profile
    entry holding its name, id, description, target class, reference profile
    URL and its required / recommended / optional properties (each as
    ``{"id": ..., "desc": ...}``). Properties declared in the same file as
    ``rdf:Property`` keep their own id (``bioschemas:`` rewritten to
    ``bsc:``); all others are assumed to be schema.org terms and get the
    ``sc:`` prefix.

    Args:
        url_profile: URL of the JSON-LD file to download.
        timeout: Seconds to wait on each HTTP request issued here.

    Returns:
        A dict mapping each profile's ``ref_profile`` URL to its entry. An
        empty dict is returned when the file cannot be downloaded (any
        status other than 200).
    """
    response = requests.get(url_profile, timeout=timeout)
    if response.status_code != 200:
        print(f"Could not fetch {url_profile} (status: {response.status_code})")
        return {}

    graph = response.json()["@graph"]

    # Both lookups are identical for every node: build them once up front
    # instead of once per node / once per property.
    profiles_versions = request_profile_versions()
    custom_property_ids = _index_custom_properties(graph)

    # DDE prefixes that actually denote schema.org types.
    type_prefix_map = {
        "bioschemastypes:": "sc:",
        "schema:": "sc:",
    }
    raw_file_base = (
        "https://raw.githubusercontent.com/BioSchemas/specifications/master/"
    )

    profiles_dict = {}
    for element in graph:
        if element.get("@type") != "rdfs:Class":
            continue

        name = element["rdfs:label"]
        profile_dict = {
            "name": "",
            "target_classes": [],
            "file": "",
            "required": [],
            "recommended": [],
            "optional": [],
            "id": "",
            "ref_profile": "",
        }
        profile_dict["id"] = element["@id"].replace("bioschemas", "bsc")
        profile_dict["name"] = name
        profile_dict["description"] = _clean_description(
            element.get("rdfs:comment") or ""
        )

        sc_type = element["rdfs:subClassOf"]["@id"]
        for dde_prefix, schema_prefix in type_prefix_map.items():
            sc_type = sc_type.replace(dde_prefix, schema_prefix)
        profile_dict["target_classes"].append(sc_type)

        if "schema:schemaVersion" in element:
            version_urls = element["schema:schemaVersion"]
            if isinstance(version_urls, str):
                # A lone string would otherwise be walked character by
                # character.
                version_urls = [version_urls]
            for url in version_urls:
                if "https://bioschemas.org" in url:
                    profile_dict["ref_profile"] = url
                    _print_if_unreachable(url, timeout)
                else:
                    # Point at the raw file of the specifications repository.
                    raw_file_url = raw_file_base + url.split("/master/")[-1]
                    profile_dict["file"] = raw_file_url
                    print(raw_file_url)
                profile_dict["version"] = url
        else:
            # No explicit version: use the latest release, or failing that
            # the latest publication, listed for this profile.
            version_info = profiles_versions[name]
            latest_version = (
                version_info["latest_release"]
                or version_info["latest_publication"]
            )
            profile_dict["latest_version"] = latest_version
            profile_dict["ref_profile"] = (
                "https://bioschemas.org/profiles/" + name + "/" + latest_version
            )

        validation = element.get("$validation")
        if validation:
            property_specs = validation.get("properties", {})
            for importance in ("required", "recommended", "optional"):
                for prop_name in validation.get(importance, []):
                    # A property without a recorded description gets an
                    # empty one instead of aborting the whole parse.
                    desc = _clean_description(
                        property_specs.get(prop_name, {}).get("description")
                        or ""
                    )
                    matching_ids = custom_property_ids.get(prop_name)
                    if matching_ids:
                        profile_dict[importance].extend(
                            {"id": prop_id, "desc": desc}
                            for prop_id in matching_ids
                        )
                    else:
                        profile_dict[importance].append(
                            {"id": "sc:" + prop_name, "desc": desc}
                        )

        profiles_dict[profile_dict["ref_profile"]] = profile_dict

    return profiles_dict


def get_profiles_from_dde():
    """Collect the profiles of the Bioschemas DDE files, one per profile name.

    The files are parsed in the order listed below; when a profile name shows
    up more than once, only the first occurrence is kept.

    Returns:
        A dict mapping each kept profile's key (as returned by
        ``profile_file_parser``) to its profile entry.
    """
    # The bioschemastypes.json and bioschemastypesdrafts.json files of the
    # same repository are not loaded.
    dde_files = [
        "https://raw.githubusercontent.com/BioSchemas/bioschemas-dde/main/bioschemas.json",
        "https://raw.githubusercontent.com/BioSchemas/bioschemas-dde/main/bioschemasdrafts.json",
    ]

    merged = {}
    seen_names = []
    for dde_file in tqdm(dde_files):
        parsed = profile_file_parser(dde_file)

        for ref_key in tqdm(parsed):
            candidate = parsed[ref_key]
            if candidate["name"] in seen_names:
                continue
            merged[ref_key] = candidate
            seen_names.append(candidate["name"])
    return merged

def load_profiles():
    """Return the Bioschemas profiles, using ``profiles/bs_profiles.json`` as cache.

    The ``profiles`` directory is created when missing. If the cache file
    exists it is read and returned; otherwise the profiles are fetched with
    ``get_profiles_from_dde`` and written to the cache file first.
    """
    cache_path = Path("profiles/bs_profiles.json")
    cache_path.parent.mkdir(exist_ok=True, parents=True)

    if os.path.exists(cache_path):
        print("Reading Bioschemas profiles from local file")
        with open(cache_path, "r") as cache_file:
            return json.load(cache_file)

    print("Updating Bioschemas profiles from github")
    fetched = get_profiles_from_dde()
    with open(cache_path, "w") as cache_file:
        json.dump(fetched, cache_file)
    print("Profiles updated")
    return fetched

In [41]:
# Profiles keyed by reference-profile URL: read from profiles/bs_profiles.json
# when that file exists, otherwise fetched and written to that file first.
profiles = load_profiles()

Reading Bioschemas profiles from local file


In [45]:
# Build one SHACL shape per profile, write each shape as Turtle and JSON-LD
# under profiles/, and merge all of them into a single graph that is
# written out in both formats as well.
all_shapes_g = Graph()
for profile_key, profile_data in tqdm(profiles.items()):
    shape_name = profile_data["name"]
    profile = Profile(
        shape_name=shape_name,
        description=profile_data["description"],
        target_classes=profile_data["target_classes"],
        min_props=profile_data["required"],
        rec_props=profile_data["recommended"],
        opt_props=profile_data["optional"],
        ref_profile=profile_data["ref_profile"],
    )
    # Profile.__init__ has already rendered the shape; reuse it instead of
    # rendering the template a second time.
    shape = profile.get_shacl_shape()

    shape_g = Graph().parse(data=shape, format="turtle")
    all_shapes_g += shape_g

    # The version is the last path segment of the reference profile URL.
    shape_version = profile_data["ref_profile"].rpartition("/")[-1]
    shape_base = f"profiles/{shape_name}-{shape_version}-shacl"
    shape_g.serialize(destination=f"{shape_base}.ttl", format="turtle")
    shape_g.serialize(destination=f"{shape_base}.jsonld", format="json-ld")

all_shapes_g.serialize(destination="profiles/bioschemas_profiles_shacl.ttl", format="turtle")
all_shapes_g.serialize(destination="profiles/bioschemas_profiles_shacl.jsonld", format="json-ld")

  0%|          | 0/32 [00:00<?, ?it/s]

<Graph identifier=N14c5ec2686d541ac8556c105cbcea5d6 (<class 'rdflib.graph.Graph'>)>

In [43]:
def check_url_status(url):
    """Report whether a GET on *url* answers with HTTP 200.

    The request is given 5 seconds. The outcome is printed, and any
    ``requests`` error is reported the same way rather than propagated.

    Returns:
        True when the status code is 200, False for any other status or when
        the request fails.
    """
    try:
        status = requests.get(url, timeout=5).status_code
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return False

    reachable = status == 200
    if reachable:
        print(f"URL {url} is accessible (Status: 200 OK)")
    else:
        print(f"URL {url} returned status: {status}")
    return reachable
    
    
def check_profiles_URIS():
    """Check the bioschemas.org URL of every profile listed in the versions file.

    For each entry the latest-publication URL is checked, then the
    latest-release URL when the entry has a release. The result of each
    check is printed by ``check_url_status``; nothing is returned.
    """
    profiles_root = "https://bioschemas.org/profiles"

    for entry in request_profile_versions().values():
        name = entry["name"]
        publication = entry["latest_publication"]
        check_url_status(f"{profiles_root}/{name}/{publication}")

        release = entry["latest_release"]
        if release:
            check_url_status(f"{profiles_root}/{name}/{release}")