## Fair Scraper

In [5]:
"""
Scrapes information from FAIRsharing.org
"""
import os
import requests
from lxml import html
import pandas as pd
import json
#from ncats_translator_dqa import config

In [11]:
import sys
sys.path.insert(0, '/Users/pedrohserrano/NCATS-Translator-DQA') 
import ncats_translator_dqa as dqa

#### fair_scraper(url)
FAIRsharing.org for some basic information.

    Scrapes FAIRsharing.org for some basic information, including title, scope and data types, terminology artifacts,
    and conditions of use.

    :param url: String url to page to scrape
    :return: FAIRPrelimStats object

- Which url

In [46]:
url = 'https://fairsharing.org/biodbcore-000015'

In [47]:
# output message
#if config.verbose:
#    print('Scraping: ' + url)

In [48]:
# load the page
page = requests.get(url)

In [49]:
# parse the HTML
html_content = html.fromstring(page.content)

- Get the title

In [50]:
title = html_content.xpath('//div[@class="title-text"]/h2/text()[last()]')

In [51]:
title = title[0].strip()

In [52]:
title = [title]
title

['ChEMBL: a large-scale bioactivity database for drug discovery']

- Get the tags

In [53]:
# Find the listed items under Scope and data types
# <li class="bio-tag domain">
#     <span class="bio-icon-tag"style="padding-right: 5px"></span>
#     Approved drug
# </li>
sad = html_content.xpath('//li[@class="bio-tag domain"]/text()[last()]')

In [54]:
sad = [x.strip() for x in sad]
sad

['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule']

- Get the terminology artifacts

In [55]:
# Find the list items under Terminology Artifacts
# <p><span class="heavier">Terminology Artifacts</span></p>
# <ul class="record-list-link">
# 	<li class="small"><a href="/bsg-s000039" target="_blank">Chemical Entities of Biological Interest</a></li>
# 	<li class="small"><a href="/bsg-s000136" target="_blank">PSI Molecular Interaction Controlled Vocabulary</a></li>
# </ul>
ta = html_content.xpath('//span[text()="Terminology Artifacts"]/../../ul/li/a/text()')
ta = [x.strip() for x in ta]

In [56]:
ta

['Chemical Entities of Biological Interest',
 'PSI Molecular Interaction Controlled Vocabulary']

- Get the license

In [57]:
# Get license
# <div class="standard-unit">
    # <p class="section-title"><span class="heavier">Conditions of Use</span></p>
lic_groups = html_content.xpath('//span[text()="Conditions of Use"]/../../span[@class="section-header"]')

lic_info = []
for lic_group in lic_groups:
    applies_to = lic_group.xpath('text()') # Get the "Applies to" text and fix weird whitespace
    applies_to = ' '.join(applies_to[0].split())
    licenses = lic_group.xpath('following-sibling::ul[1]/li/span//text()')     # Get the licenses
    licenses = [x.strip() for x in licenses]
    lic_info.append((applies_to, licenses))     # Add the license information as a tuple

In [58]:
lic_info

[('Applies to: Data use',
  ['Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)'])]

In [59]:
lic_strings = []
sep = '; '

In [60]:
for lic in lic_info:
    lic_strings.append(lic[0] + " = {" + sep.join(lic[1]) + "}")
    lic_string = sep.join(lic_strings)

In [61]:
licence = [lic_string]
licence

['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']

- FAIR Scrapper elements  
url, title, sad, ta, lic_info

#### fair_table(fpss, file_output)
Writes a list of preliminary statistics from multiple FAIRsharing.org urls to a CSV file

    :param fpss: List of FAIRPrelimStats
    :param file_output: Path to output file to write to (String)
    :return:

In [62]:
fpss = [[url], title, sad, ta, licence]
fpss

[['https://fairsharing.org/biodbcore-000015'],
 ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 ['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule'],
 ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']]

In [63]:
num_fpss = len(fpss)
num_fpss

5

In [64]:
titles = ['url', 'title', 'scope and data types', 'terminology artifacts', 'license']

In [65]:
summary = {key: value for (key, value) in zip(titles, fpss)}

In [66]:
summary

{'license': ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}'],
 'scope and data types': ['Approved drug',
  'Biomedical Science',
  'Peptide',
  'Small molecule'],
 'terminology artifacts': ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 'title': ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 'url': ['https://fairsharing.org/biodbcore-000015']}

In [67]:
def writeToJSONFile(path, fileName, data):
    filePathNameWExt = './' + path + '/' + fileName + '.json'
    with open(filePathNameWExt, 'w') as fp:
        json.dump(data, fp)

In [66]:
writeToJSONFile('./','metrics',summary)

In [59]:
#df[str(titles[3])] = fpss[3]

In [69]:
# Make sure the output directory exists
#directory = os.path.split(file_output)[0]
#if not os.path.exists(directory):
#    os.mkdir(directory)

# Write the results
#df.to_csv(file_output, sep='\t')

#if config.verbose:
#    print('Tabulated results: ' + file_output)

## Prelim Stats RDF

In [4]:
from rdflib import Graph, Literal, URIRef, Namespace, RDF
from rdflib.namespace import DCTERMS, XSD

In [3]:
#!conda install -c conda-forge rdflib -y
#do it once

Write out dataset data quality metrics in RDF using W3C data vocabulary.

            :param dataset_id: ID to be used in URI for this data set (String)
            :param fps: FAIRsharing preliminary stats (FAIRPrelimStats) [optional]
            :param down_url: Download URL of dataset (String) [optional]
            :param byte_size: Size of dataset in bytes [optional]
            :return: None

Converting preliminary statistics to W3C DQV

In [21]:
# Define namespaces
ns_local = Namespace("http://ncats.nih.gov/")
ns_dcat = Namespace("http://www.w3.org/ns/dcat#")
ns_dqv = Namespace("http://www.w3.org/ns/dqv#")

In [9]:
g = Graph()

In [10]:
!pwd

/Users/pedrohserrano/NCATS-Translator-DQA/notebooks


In [14]:
# Read in the pre-defined turtle file from resources
#file_predefined = os.path.join(config.resource_path, "dqv_definitions.ttl")
file_predefined = os.path.join('/Users/pedrohserrano/NCATS-Translator-DQA/notebooks', "dqv_definitions.ttl")

In [22]:
file_predefined

'/Users/pedrohserrano/NCATS-Translator-DQA/notebooks/dqv_definitions.ttl'

In [23]:
!touch  dqv_definitions.ttl

In [24]:
!ls -la

total 64
drwxr-xr-x  6 pedrohserrano  staff    192 Feb  8 12:23 [34m.[m[m
drwxr-xr-x  8 pedrohserrano  staff    256 Feb  7 11:21 [34m..[m[m
drwxr-xr-x  4 pedrohserrano  staff    128 Feb  7 17:27 [34m.ipynb_checkpoints[m[m
-rw-r--r--  1 pedrohserrano  staff      0 Feb  8 12:23 dqv_definitions.ttl
-rw-r--r--  1 pedrohserrano  staff    454 Feb  7 17:27 metrics.json
-rw-r--r--  1 pedrohserrano  staff  25160 Feb  8 12:22 testing.ipynb


In [26]:
g.parse(file_predefined, format="ttl")

<Graph identifier=N434ca763648e4704abe7d6f72d4bf550 (<class 'rdflib.graph.Graph'>)>

In [31]:
# Create new resources for the data set and distribution
dataset_id = 'fairsharing.org/biodbcore-000015'
dataset = ns_local[dataset_id]
distribution = ns_local[dataset_id + 'Distribution']

In [34]:
distribution

rdflib.term.URIRef('http://ncats.nih.gov/fairsharing.org/biodbcore-000015Distribution')

In [35]:
# Add information about the data set
g.add((dataset, RDF.type, ns_dcat.Dataset))
g.add((dataset, ns_dcat.distribution, distribution))

In [39]:
# Add information about the distribution
g.add((distribution, RDF.type, ns_dcat.Distribution))
g.add((distribution, ns_dcat.mediaType, Literal("application/rdf")))

In [42]:
# Set the download URL
url = 'https://fairsharing.org/biodbcore-000015' #URL to the data set download
#self.add_download_url(down_url)
#Adds dcat:downloadURL to distribution
#        :param url: URL to the data set download (String)
if len(url) > 0:
    g.add((distribution, ns_dcat.downloadURL, URIRef(url)))

In [43]:
# Set the byte size
byte_size = -1
#Adds dcat:byteSize to distribution
#:param byte_size: Size of the data set in bytes (Float)
if byte_size >= 0:
    g.add((distribution, ns_dcat.byteSize, Literal(str(byte_size), datatype=XSD.decimal)))

In [44]:
# Measurement count, counter
n_measurements = 0

In [None]:
# Add licensing information
fps = fpss
#Adds preliminary statistics scraped from FAIRsharing.org to the graph
#:param fps: FAIRPrelimStats object
if fps is not None:
    add_fair_prelim_stats(fps)
    
         # Add title
        if len(fps.title) > 0:
            self.add_title(fps.title)

        # Add license
        if len(fps.license) > 0:
            # Convert the license data structure into a string
            lic_string = fps.get_license_string()
            self.add_licensing_metric(lic_string)

        # Add information on scopes and data types
        self.add_scopes_and_data_types(fps.scope_and_data_types)

        # Add information on terminology artifacts
        self.add_terminology_artifacts(fps.terminology_artifacts)

In [45]:
fps

NameError: name 'fps' is not defined

In [40]:
for i in g:
    print(i)

(rdflib.term.URIRef('http://ncats.nih.gov/fairsharing.org/biodbcore-000015'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#Dataset'))
(rdflib.term.URIRef('http://ncats.nih.gov/fairsharing.org/biodbcore-000015Distribution'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#Distribution'))
(rdflib.term.URIRef('http://ncats.nih.gov/fairsharing.org/biodbcore-000015'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#distribution'), rdflib.term.URIRef('http://ncats.nih.gov/fairsharing.org/biodbcore-000015Distribution'))
(rdflib.term.URIRef('http://ncats.nih.gov/fairsharing.org/biodbcore-000015Distribution'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#mediaType'), rdflib.term.Literal('application/rdf'))


In [None]:
import sys
import os
from rdflib import Graph, Literal, URIRef, Namespace, RDF
from rdflib.namespace import DCTERMS, XSD
from ncats_translator_dqa import config


class  :
    def __init__(self, dataset_id, fps=None, down_url='', byte_size=-1):
        """Write out dataset data quality metrics in RDF using W3C data vocabulary.

            :param dataset_id: ID to be used in URI for this data set (String)
            :param fps: FAIRsharing preliminary stats (FAIRPrelimStats) [optional]
            :param down_url: Download URL of dataset (String) [optional]
            :param byte_size: Size of dataset in bytes [optional]
            :return: None
            """
        # Output message
        if config.verbose:
            print('Converting preliminary statistics to W3C DQV')

        self.dataset_id = dataset_id

        # Define namespaces
        self.__ns_local = Namespace("http://ncats.nih.gov/")
        self.__ns_dcat = Namespace("http://www.w3.org/ns/dcat#")
        self.__ns_dqv = Namespace("http://www.w3.org/ns/dqv#")

        # Create a new graph
        self.g = Graph()

        # Read in the pre-defined turtle file from resources
        file_predefined = os.path.join(config.resource_path, "dqv_definitions.ttl")
        self.g.parse(file_predefined, format="ttl")

        # Create new resources for the data set and distribution
        self.__dataset = self.__ns_local[self.dataset_id]
        self.__distribution = self.__ns_local[self.dataset_id + 'Distribution']

        # Add information about the data set
        self.g.add((self.__dataset, RDF.type, self.__ns_dcat.Dataset))
        self.g.add((self.__dataset, self.__ns_dcat.distribution, self.__distribution))

        # Add information about the distribution
        self.g.add((self.__distribution, RDF.type, self.__ns_dcat.Distribution))
        self.g.add((self.__distribution, self.__ns_dcat.mediaType, Literal("application/rdf")))

        # Set the download URL
        self.add_download_url(down_url)

        # Set the byte size
        self.add_byte_size(byte_size)

        # Measurement count
        self.__n_measurements = 0

        # Add licensing information
        if fps is not None:
            self.add_fair_prelim_stats(fps)

    def add_title(self, title):
        """Adds dcterms:title to dataset and distribution nodes

        :param title: Title (String)
        """
        self.g.add((self.__dataset, DCTERMS.title, Literal(title, lang="en")))
        self.g.add((self.__distribution, DCTERMS.title, Literal(title)))

    def add_download_url(self, url):
        """Adds dcat:downloadURL to distribution

        :param url: URL to the data set download (String)
        """
        if len(url) > 0:
            self.g.add((self.__distribution, self.__ns_dcat.downloadURL, URIRef(url)))

    def add_byte_size(self, byte_size):
        """Adds dcat:byteSize to distribution

        :param byte_size: Size of the data set in bytes (Float)
        """
        if byte_size >= 0:
            self.g.add((self.__distribution, self.__ns_dcat.byteSize, Literal(str(byte_size), datatype=XSD.decimal)))

    def __add_measurement(self, measurement_label=''):
        """Creates and adds a new measurement to the graph

        :param measurement_label: A unique label for the measurement. Leave empty for auto naming.
        :return: The new measurement node
        """
        if len(measurement_label) == 0:
            # Create a new measurement label
            self.__n_measurements += 1
            measurement_label = 'measurement' + '%04d' % self.__n_measurements

        # Create a new measurement node and add it to the graph
        measurement = self.__ns_local[measurement_label]
        self.g.add((measurement, RDF.type, self.__ns_dqv.QualityMeasurement))
        self.g.add((measurement, self.__ns_dqv.computedOn, self.__distribution))
        self.g.add((self.__distribution, self.__ns_dqv.hasQualityMeasurement, measurement))

        return measurement

    def add_licensing_metric(self, license_string):
        """Adds a licensingMetric measurement

        :param license_string: String representing the license
        """
        if len(license_string) > 0:
            measurement = self.__add_measurement()
            self.g.add((measurement, self.__ns_dqv.isMeasurementOf, self.__ns_local.licensingMetric))
            self.g.add((measurement, self.__ns_dqv.value, Literal(license_string, datatype=XSD.string)))

    def add_scopes_and_data_types(self, sads):
        """Adds a list of scopes and data types to the graph as scopeAndDatatypesMetric

        :param sads: List of strings representing the scopes and data types
        """
        if sads is None:
            return

        for sad in sads:
            measurement = self.__add_measurement()
            self.g.add((measurement, self.__ns_dqv.isMeasurementOf, self.__ns_local.scopeAndDatatypesMetric))
            self.g.add((measurement, self.__ns_dqv.value, Literal(sad, datatype=XSD.string)))

    def add_terminology_artifacts(self, tas):
        """Adds a list of terminology artifacts to the graph as terminologyArtifactsMetric

        :param tas: List of strings representing the terminology artifacts
        :return:
        """
        if tas is None:
            return

        for ta in tas:
            measurement = self.__add_measurement()
            self.g.add((measurement, self.__ns_dqv.isMeasurementOf, self.__ns_local.terminologyArtifactsMetric))
            self.g.add((measurement, self.__ns_dqv.value, Literal(ta, datatype=XSD.string)))

    def add_fair_prelim_stats(self, fps):
        """Adds preliminary statistics scraped from FAIRsharing.org to the graph

        :param fps: FAIRPrelimStats object
        :return:
        """
        if fps is None:
            return

        # Add title
        if len(fps.title) > 0:
            self.add_title(fps.title)

        # Add license
        if len(fps.license) > 0:
            # Convert the license data structure into a string
            lic_string = fps.get_license_string()
            self.add_licensing_metric(lic_string)

        # Add information on scopes and data types
        self.add_scopes_and_data_types(fps.scope_and_data_types)

        # Add information on terminology artifacts
        self.add_terminology_artifacts(fps.terminology_artifacts)

    def serialize(self, file, format='ttl'):
        """Writes the RDF graph to file in the specified format

        :param file: Path to the file to write to (String)
        :param format: RDF format (default: 'ttl')
        :return:
        """
        try:
            # Write out turtle file
            self.g.serialize(destination=file, format=format)

            # Output message
            if config.verbose:
                print('Preliminary statistics in W3C DQV written to: ' + file)
        except IOError:
            sys.stderr.write('Error while trying to serialize preliminary stats RDF graph to file: ' + file + '\n')

