In [1]:
from typing import List
import pandas as pd
import requests
from cache_decorator import Cache
from tqdm.auto import tqdm
import json
import yaml
from downloaders import BaseDownloader
from glob import glob
from bs4 import BeautifulSoup
from userinput import userinput
import compress_json
import string

In [126]:
with open("../graph/src/url_utilities/mod.rs", "r") as f:
    file_content = f.read()

for file_name in glob("../graph/src/url_utilities/*.rs"):
    if "/mod.rs" in file_name:
        continue
    module_name = file_name.split("/")[-1].split(".")[0]
    if module_name in file_content:
        continue
    print("mod {};".format(module_name))
    print("pub use {}::*;".format(module_name))

mod c_elegans_phenotype_url_utilities;
pub use c_elegans_phenotype_url_utilities::*;


In [15]:
all_urls = {
    "yaml": [
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/biocaddie-context.yaml",
        "https://raw.githubusercontent.com/monarch-initiative/dipper/master/dipper/curie_map.yaml",
        "https://archive.monarchinitiative.org/latest/translationtable/curie_map.yaml",
    ],
    "prefixes_yaml": [
        "https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.yaml"
    ],
    "context_json": [
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/commons_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/globi_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/go_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/go_obo_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/idot_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/idot_nr_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/minerva_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/monarch_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/obo_context.jsonld",
        "https://raw.githubusercontent.com/prefixcommons/biocontext/master/registry/semweb_context.jsonld"
    ]
}

In [16]:
kgcovid_node_list_url = "https://github.com/justaddcoffee/madam_curie/blob/main/merged-kg_nodes.tsv.gz?raw=true"
monarch_node_list = "https://archive.monarchinitiative.org/202012/kgx/sri-reference-kg_nodes.tsv.gz"

In [17]:
# BaseDownloader(verbose=2, auto_extract=False).download(monarch_node_list)

In [18]:
kg_covid_nodes = pd.read_csv("downloads/merged-kg_nodes.tsv.gz", sep="\t", usecols=[0], dtype=str)
monarch_nodes = pd.read_csv("downloads/sri-reference-kg_nodes.tsv.gz", sep="\t", usecols=[0], dtype=str)
all_nodes = pd.concat([
    kg_covid_nodes,
    monarch_nodes
])

In [19]:
all_nodes = all_nodes.astype(str)

In [20]:
@Cache(cache_dir="cached_webpages")
def download(url:str)->str:
    """Retrieve content of webpage at given URL."""
    return requests.get(url)

def download_text(url:str)->str:
    """Retrieve content of webpage at given URL."""
    return download(url).text

In [21]:
from typing import Dict

def parse_yaml(body: str) -> Dict[str, str]:
    """Parse a YAML file."""
    return yaml.safe_load(body)

In [22]:
from typing import Dict

def parse_prefixes_yaml(body: str) -> Dict[str, str]:
    """Parse a YAML file."""
    return yaml.safe_load(body)['prefixes']

In [23]:
def parse_context_json(body: str) -> Dict[str, str]:
    """Parse JSON containing context."""
    return json.loads(body)["@context"]

In [24]:
def parse_body(body: str, body_type:str)->Dict[str, str]:
    """Dispatches body parsing according to body type."""
    return {
        "yaml": parse_yaml,
        "context_json": parse_context_json,
        "prefixes_yaml": parse_prefixes_yaml
    }[body_type](body)

In [25]:
already_supported = {
    key.upper() for key in (
        "DOI",
        "PMID",
        "BIOGRID",
        "biolink",
        "CHEBI",
        "Coriell",
        "DrugCentral",
        "ENSEMBL",
        "FlyBase",
        "GO",
        "JAX",
        "MMRRC",
        "MGI",
        "ClinVarVariant",
        "NCBIGene",
        "MESH",
        "NCBITaxon",
        "OMIM",
        "PMID",
        "RGD",
        "REACT",
        "SO",
        "ttd.drug",
        "UniProtKB",
        "WD",
        "WormBase",
        "ZFIN"
    )
}

In [100]:
body = {}
for body_type, urls in tqdm(all_urls.items(), desc="Parsing different body types"):
    for url in tqdm(urls, desc="Retrieving urls", leave=False):
        body.update(parse_body(
            download_text(url),
            body_type
        ))

body = {
    key.upper(): value
    for key, value in body.items()
    if key and key.upper() not in already_supported
}

Parsing different body types:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving urls:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving urls:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving urls:   0%|          | 0/10 [00:00<?, ?it/s]

In [27]:
sources = {
    node.split(":", maxsplit=1)[0].upper()
    for node in tqdm(all_nodes.id)
    if not node.startswith(":")
}

  0%|          | 0/17149476 [00:00<?, ?it/s]

In [28]:
nodes_per_source = {source:[] for source in sources}
id_per_source = {source:[] for source in sources}

for node in tqdm(all_nodes.id, leave=False):
    if node.startswith(":") or ":" not in node:
        continue
    node_source, node_id = node.split(":", maxsplit=1)
    id_per_source[node_source.upper()].append(node_id)
    nodes_per_source[node_source.upper()].append(node)

nodes_per_source = {
    source: node_names
    for source, node_names in tqdm(nodes_per_source.items())
    if node_names
}
id_per_source = {
    source: node_names
    for source, node_names in tqdm(id_per_source.items())
    if node_names
}

  0%|          | 0/17149476 [00:00<?, ?it/s]

  0%|          | 0/3425 [00:00<?, ?it/s]

  0%|          | 0/3425 [00:00<?, ?it/s]

In [29]:
def detect_common_root(node_names: List[str]):
    """Return the possible common prefix of all the given nodes."""
    if len(node_names) < 2:
        return None
    found_prefix = False
    # The starting prefix is the first character of the first
    # node in the list
    current_prefix = node_names[0][0]
    if current_prefix.isnumeric():
        return None
    while True:
        if all(
            node_name.startswith(current_prefix)
            for node_name in node_names
        ):
            # We can increase the tentative prefix
            found_prefix = True
            if len(current_prefix) == len(node_names[0]):
                return current_prefix
            if node_names[0][len(current_prefix)].isnumeric():
                return current_prefix
            current_prefix = node_names[0][:len(current_prefix) + 1]
        else:
            if found_prefix:
                return current_prefix
            else:
                return None

In [30]:
def detect_common_numeric_id_length(node_names: List[str]):
    """Return the possible common prefix of all the given nodes."""
    if len(node_names) < 2:
        return None
    first_node_len = len(node_names[0])
    if any(
        len(node_name) != first_node_len
        for node_name in node_names
    ):
        return None
    common_root = detect_common_root(node_names)
    if common_root is None:
        common_root = ""
    if all(
        node_name[len(common_root):].isnumeric()
        for node_name in node_names
    ):
        return first_node_len - len(common_root)
    return None

In [31]:
def detect_common_base_length(node_names: List[str]):
    """Return the possible common prefix of all the given nodes."""
    if len(node_names) < 2:
        return None
    first_node_len = len(node_names[0])
    if any(
        len(node_name) != first_node_len
        for node_name in node_names
    ):
        return None
    return first_node_len

In [109]:
def to_code_friendly_library_name(library_name: str)->str:
    for term in (",", ";", ":", "/"):
        library_name = library_name.replace(term, "")
    return library_name.lower().replace(" ", "_")

In [33]:
common_base_lengths_nodes = {
    source: root
    for source, root in {
        source: detect_common_base_length(node_names)
        for source, node_names in tqdm(nodes_per_source.items())
    }.items()
    if root
}

  0%|          | 0/194 [00:00<?, ?it/s]

In [34]:
common_base_lengths_ids = {
    source: root
    for source, root in {
        source: detect_common_base_length(node_names)
        for source, node_names in tqdm(id_per_source.items())
    }.items()
    if root
}

  0%|          | 0/194 [00:00<?, ?it/s]

In [35]:
common_roots_ids = {
    source: root.upper()
    for source, root in {
        source: detect_common_root(node_names)
        for source, node_names in tqdm(id_per_source.items())
    }.items()
    if root
}

  0%|          | 0/194 [00:00<?, ?it/s]

In [36]:
common_ids_length = {
    source: common_id_len
    for source, common_id_len in {
        source: detect_common_numeric_id_length(node_names)
        for source, node_names in tqdm(id_per_source.items())
    }.items()
    if common_id_len
}

  0%|          | 0/194 [00:00<?, ?it/s]

In [37]:
validation_function_body = """
#[automatically_generated_function]
/// Returns whether the given node name respects the {library_name} nodes pattern.
///
/// # Arguments
/// * `node_name`: &str - Node name to check pattern with.
///
/// # Example
/// To validate a node you can use:
/// ```rust
/// # use graph::*;
/// let this_library_node_name = "{example_node_name}";
/// let not_this_library_node_name = "PizzaQuattroStagioni";
/// assert!(is_valid_{code_friendly_library_name}_node_name(this_library_node_name));
/// assert!(!is_valid_{code_friendly_library_name}_node_name(not_this_library_node_name));
/// ```
pub fn is_valid_{code_friendly_library_name}_node_name(node_name: &str) -> bool {{
    is_valid_node_name_from_seeds(
        node_name,
        {library_prefix},
        {base_curie_length},
        Some(":"),
        {id_acronym},
        {id_length},
        {numeric_part_length}
    )
    .is_ok()
}}
"""

In [42]:
full_library_names = compress_json.local_load("full_library_names.json.gz")

In [47]:
additional_nodes_per_source = compress_json.local_load("additional_nodes_per_source.json.gz")
nodes_per_source.update(additional_nodes_per_source)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lucacappelletti/github/ensmallen/notebooks_and_scripts/additional_nodes_per_source.json.gz'

In [54]:
def is_already_present(prefix: str) -> bool:
    for path in glob("../graph/src/url_utilities/*.rs"):
        with open(path, "r") as f:
            if f"\"{prefix}:" in f.read().upper():
                return True
    return False

In [55]:
def generate_validate_node_name(library_prefix: str):
    def validate_node_name(node_name_candidate: str) -> bool:
        return node_name_candidate.startswith(library_prefix + ":")
    return validate_node_name

In [105]:
black_list_prefix = set([
    "GDC.CASE",
    "GDC.FILE",
    "APB",
    "PATO-PROPERTY",
    "BMRDB"
])

In [86]:
obo_ontologies = {
    ontology["id"].upper(): ontology["title"]
    for ontology in parse_yaml(download_text("http://www.obofoundry.org/registry/ontologies.yml"))["ontologies"]
}

In [103]:
from typing import Optional

def optionify(term: Optional[str])->str:
    return "None" if term is None else f"Some({term})"

In [120]:
counter = 0

body = {
    library_prefix: url
    for library_prefix, url in body.items()
    if not (
        #is_already_present(library_prefix) or
        library_prefix in black_list_prefix# or
        #nodes_per_source.get(library_prefix) is None
    )
}

library_names = []

for library_prefix, url in tqdm(body.items()):
    #if is_already_present(library_prefix):
    #    continue
    if library_prefix in black_list_prefix:
        continue
    node_name = None
    node_names = nodes_per_source.get(library_prefix)
    if node_names:
        node_name = node_names[0]
    
    if not node_name:
        continue
        print(url)
        node_name = userinput(
            name=f"{library_prefix}_node_name",
            label=f"Please insert the node name for {library_prefix}",
            validator=generate_validate_node_name(library_prefix)
        )
        nodes_per_source[library_prefix] = [node_name]
        additional_nodes_per_source[library_prefix] = [node_name]
        compress_json.local_dump(additional_nodes_per_source, "additional_nodes_per_source.json.gz")
    
    library_name = full_library_names.get(library_prefix)
    
    if not library_name:
        library_name = obo_ontologies.get(library_prefix)
    
    if not library_name:
        print(url, node_name)
        library_name = userinput(
            name=f"{library_prefix}_library_name",
            label=f"Please insert the library name for {library_prefix}",
        )
        full_library_names[library_prefix] = library_name
        compress_json.local_dump(full_library_names, "full_library_names.json.gz")
        
    code_friendly_name = to_code_friendly_library_name(library_name)
    parameters = dict(
        library_name=library_name,
        library_prefix= optionify(None if library_prefix is None else f"&[\"{library_prefix}\"]"),
        base_curie_length=optionify(common_base_lengths_nodes.get(library_prefix)),
        id_acronym=optionify(common_roots_ids.get(library_prefix)),
        id_length=optionify(common_base_lengths_ids.get(library_prefix)),
        numeric_part_length=optionify(common_ids_length.get(library_prefix)),
        code_friendly_library_name=code_friendly_name,
        url=url,
        example_node_name=node_name
    )
    
    formatted = full_file_body.format(**parameters)
    library_names.append(
        piece_of_get_node_source_url_from_node_name.format(**parameters)
    )
    
    #with open(f"../graph/src/url_utilities/{code_friendly_name}_url_utilities.rs", "w") as f:
    #    f.write(formatted)

print(f"Generated {counter} functions")

  0%|          | 0/1020 [00:00<?, ?it/s]

Generated 0 functions


In [118]:
piece_of_get_node_source_url_from_node_name = """
if is_valid_{code_friendly_library_name}_node_name(node_name) {{
    return Ok(unsafe {{ format_{code_friendly_library_name}_url_from_node_name(node_name) }});
}}
"""

In [119]:
piece_of_get_node_repository_from_node_name = """
if is_valid_{code_friendly_library_name}_node_name(node_name) {{
    return Ok("{library_name}");
}}
"""

In [44]:
{
    node
    for node in tqdm(nodes.id)
    if "CORD" in node
}

  0%|          | 0/460730 [00:00<?, ?it/s]

{'CORD:0fb1c9d614eb4029db693c30a6af748c825fd6d7',
 'CORD:PMC7128372',
 'CORD:62a21a7986566a3c746c462c1cf142ce2bc072e1',
 'CORD:17dba3d819679a6a4d8118ec3bf0c49fc53ed4a3',
 'CORD:131902a67f5e00ae13b9239427325dfd53413349',
 'CORD:ddc66d5f9cd63772748db0b1de5f890eb25531b2',
 'CORD:dec75f95aef9acb532d86be6093c59a59ef3eb41',
 'CORD:PMC7094597',
 'CORD:PMC7338960',
 'CORD:7fc2ccb839b1abe6ba707ac84c1077eb282dcd0c',
 'CORD:99a6e13eb12cf2da182eb299b724360a3c3c8a37',
 'CORD:ee106840a4b1667303e4b84d25795de3617bd7ef',
 'CORD:PMC6914129',
 'CORD:819e7acc5ff5ebc39db48a4d4fc27806421a6232',
 'CORD:ef7da732693b891252f300767d661e8ff7013db3',
 'CORD:d76a837b2a11992c6a9efc8ecad21b67eccd4763',
 'CORD:PMC7187825',
 'CORD:PMC7157949',
 'CORD:abe0c8d252338f5b1071c70faf60aef694aeb2ec',
 'CORD:aa86d2456df121fae6294596799da9a6cd88d050',
 'CORD:a3f9d6933e737f7f0a7bd27a54a9bb4882eca041',
 'CORD:bbe1b85f2ddd462aa41a504c8c736d3b7655bc00',
 'CORD:94ccf97f5df377bf432fa3a831e38f11985965bd',
 'CORD:38918b16fd885fb86e2ecec

In [127]:
len("CORD:PMC7187825")

15

In [128]:
len("PMC7187825")

10