## Fair Scraper

In [2]:
"""
Scrapes information from FAIRsharing.org
"""
import os
import requests
from lxml import html
import pandas as pd
import json
#from ncats_translator_dqa import config

In [3]:
import sys
sys.path.insert(0, '/Users/pedrohserrano/NCATS-Translator-DQA') 
import ncats_translator_dqa as dqa

#### fair_scraper(url)
FAIRsharing.org for some basic information.

    Scrapes FAIRsharing.org for some basic information, including title, scope and data types, terminology artifacts,
    and conditions of use.

    :param url: String url to page to scrape
    :return: FAIRPrelimStats object

- Which url

In [4]:
url = 'https://fairsharing.org/biodbcore-000015'

In [5]:
# output message
#if config.verbose:
#    print('Scraping: ' + url)

In [6]:
# load the page
page = requests.get(url)

In [7]:
# parse the HTML
html_content = html.fromstring(page.content)

- Get the title

In [8]:
title = html_content.xpath('//div[@class="title-text"]/h2/text()[last()]')

In [9]:
title = title[0].strip()

In [10]:
title = [title]
title

['ChEMBL: a large-scale bioactivity database for drug discovery']

- Get the tags

In [11]:
# Find the listed items under Scope and data types
# <li class="bio-tag domain">
#     <span class="bio-icon-tag"style="padding-right: 5px"></span>
#     Approved drug
# </li>
sad = html_content.xpath('//li[@class="bio-tag domain"]/text()[last()]')

In [12]:
sad = [x.strip() for x in sad]
sad

['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule']

- Get the terminology artifacts

In [13]:
# Find the list items under Terminology Artifacts
# <p><span class="heavier">Terminology Artifacts</span></p>
# <ul class="record-list-link">
# 	<li class="small"><a href="/bsg-s000039" target="_blank">Chemical Entities of Biological Interest</a></li>
# 	<li class="small"><a href="/bsg-s000136" target="_blank">PSI Molecular Interaction Controlled Vocabulary</a></li>
# </ul>
ta = html_content.xpath('//span[text()="Terminology Artifacts"]/../../ul/li/a/text()')
ta = [x.strip() for x in ta]

In [14]:
ta

['Chemical Entities of Biological Interest',
 'PSI Molecular Interaction Controlled Vocabulary']

- Get the license

In [15]:
# Get license
# <div class="standard-unit">
    # <p class="section-title"><span class="heavier">Conditions of Use</span></p>
lic_groups = html_content.xpath('//span[text()="Conditions of Use"]/../../span[@class="section-header"]')

lic_info = []
for lic_group in lic_groups:
    applies_to = lic_group.xpath('text()') # Get the "Applies to" text and fix weird whitespace
    applies_to = ' '.join(applies_to[0].split())
    licenses = lic_group.xpath('following-sibling::ul[1]/li/span//text()')     # Get the licenses
    licenses = [x.strip() for x in licenses]
    lic_info.append((applies_to, licenses))     # Add the license information as a tuple

In [16]:
lic_info

[('Applies to: Data use',
  ['Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)'])]

In [17]:
lic_strings = []
sep = '; '

In [18]:
for lic in lic_info:
    lic_strings.append(lic[0] + " = {" + sep.join(lic[1]) + "}")
    lic_string = sep.join(lic_strings)

In [19]:
licence = [lic_string]
licence

['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']

- FAIR Scrapper elements  
url, title, sad, ta, lic_info

#### fair_table(fpss, file_output)
Writes a list of preliminary statistics from multiple FAIRsharing.org urls to a CSV file

    :param fpss: List of FAIRPrelimStats
    :param file_output: Path to output file to write to (String)
    :return:

In [20]:
fpss = [[url], title, sad, ta, licence]
fpss

[['https://fairsharing.org/biodbcore-000015'],
 ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 ['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule'],
 ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']]

In [21]:
num_fpss = len(fpss)
num_fpss

5

In [22]:
titles = ['url', 'title', 'scope and data types', 'terminology artifacts', 'license']

In [23]:
summary = {key: value for (key, value) in zip(titles, fpss)}

In [24]:
summary

{'license': ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}'],
 'scope and data types': ['Approved drug',
  'Biomedical Science',
  'Peptide',
  'Small molecule'],
 'terminology artifacts': ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 'title': ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 'url': ['https://fairsharing.org/biodbcore-000015']}

In [96]:
def writeToJSONFile(path, fileName, data):
    filePathNameWExt = './' + path + '/' + fileName + '.json'
    with open(filePathNameWExt, 'w') as fp:
        json.dump(data, fp)

In [97]:
writeToJSONFile('./','metrics',summary)

In [98]:
#df[str(titles[3])] = fpss[3]

In [99]:
# Make sure the output directory exists
#directory = os.path.split(file_output)[0]
#if not os.path.exists(directory):
#    os.mkdir(directory)

# Write the results
#df.to_csv(file_output, sep='\t')

#if config.verbose:
#    print('Tabulated results: ' + file_output)

## FAIRSharing metrics

In [100]:
from rdflib import Graph, Literal, URIRef, Namespace, RDF
from rdflib.namespace import DCTERMS, XSD

In [101]:
#!conda install -c conda-forge rdflib -y
#do it once

Write out dataset data quality metrics in RDF using W3C data vocabulary.

            :param dataset_id: ID to be used in URI for this data set (String)
            :param fps: FAIRsharing preliminary stats (FAIRPrelimStats) [optional]
            :param down_url: Download URL of dataset (String) [optional]
            :param byte_size: Size of dataset in bytes [optional]
            :return: None

Converting preliminary statistics to W3C DQV

In [102]:
# Define namespaces
ns_local = Namespace("http://ncats.nih.gov/")
ns_dcat = Namespace("http://www.w3.org/ns/dcat#")
ns_dqv = Namespace("http://www.w3.org/ns/dqv#")

In [103]:
g = Graph()

In [104]:
!pwd

/Users/pedrohserrano/NCATS-Translator-DQA/notebooks


In [105]:
# Read in the pre-defined turtle file from resources
#file_predefined = os.path.join(config.resource_path, "dqv_definitions.ttl")
file_predefined = os.path.join('/Users/pedrohserrano/NCATS-Translator-DQA/notebooks', "dqv_definitions.ttl")

In [106]:
file_predefined

'/Users/pedrohserrano/NCATS-Translator-DQA/notebooks/dqv_definitions.ttl'

In [107]:
#!touch  dqv_definitions.ttl

In [108]:
!ls -la

total 72
drwxr-xr-x  6 pedrohserrano  staff    192 Feb  9 10:59 [34m.[m[m
drwxr-xr-x  8 pedrohserrano  staff    256 Feb  7 11:21 [34m..[m[m
drwxr-xr-x  4 pedrohserrano  staff    128 Feb  7 17:27 [34m.ipynb_checkpoints[m[m
-rw-r--r--  1 pedrohserrano  staff      0 Feb  8 12:23 dqv_definitions.ttl
-rw-r--r--  1 pedrohserrano  staff    454 Feb  9 10:59 metrics.json
-rw-r--r--  1 pedrohserrano  staff  32218 Feb  9 10:59 testing.ipynb


In [109]:
g.parse(file_predefined, format="ttl")

<Graph identifier=N3fba8606a56e41df9e02df8615d1fe67 (<class 'rdflib.graph.Graph'>)>

In [110]:
# Create new resources for the data set and distribution
#automatic 
dataset_id = 'http://bio2rdf.org#CHEMBL' #'fairsharing.org/biodbcore-000015'
dataset = ns_local[dataset_id]
distribution = ns_local[dataset_id + 'Distribution'] #THIS IS OK

In [111]:
dataset

rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBL')

In [112]:
distribution

rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBLDistribution')

In [120]:
# Set the download URL ()
# https://www.w3.org/TR/prov-o/#wasDerivedFrom
dataset = 'https://fairsharing.org/biodbcore-000015' #object
wasDerive = 'https://www.w3.org/TR/prov-o/#wasDerivedFrom' #predicate
source = 'https://fairsharing.org/biodbcore-000015' #subject
#self.add_download_url(down_url)
#Adds dcat:downloadURL to distribution
#        :param url: URL to the data set download (String)
#if len(url) > 0:
    #g.add((dataset, wasDerive, URIRef(source)))

In [113]:
# Add information about the data set
g.add((dataset, RDF.type, ns_dcat.Dataset))
g.add((dataset, ns_dcat.distribution, distribution))

In [114]:
# Add information about the distribution
g.add((distribution, RDF.type, ns_dcat.Distribution))
g.add((distribution, ns_dcat.mediaType, Literal("application/rdf")))

In [1]:
# Measurement count, counter
n_measurements = 0

## We have metrics

- Licensing
- Interoperability
- Relevancy

In [None]:
Prefix dqv: <http://www.w3.org/ns/dqv#>
Prefix hcls: <http://www.w3.org/hcls#>
Prefix bio2rdf: <http://bio2rdf.org#>
Prefix skos: <http://www.w3.org/2004/02/skos/core#>
Prefix xsd: <http://www.w3.org/2001/XMLSchema#>
Prefix prov: <https://www.w3.org/ns/prov#>
Prefix dcat: <http://www.w3.org/ns/dcat#>
        <http://purl.org/dc/terms

In [27]:
summary#['title']

{'license': ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}'],
 'scope and data types': ['Approved drug',
  'Biomedical Science',
  'Peptide',
  'Small molecule'],
 'terminology artifacts': ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 'title': ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 'url': ['https://fairsharing.org/biodbcore-000015']}

In [None]:
title  = 'ChEMBL: a large-scale bioactivity database for drug discovery'

In [None]:
dataset = 

In [None]:
namespace = '<http://bio2rdf.org#'

In [None]:
prefix_rdf = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#>'
#rdf type

### Dataset
<http://bio2rdf.org#CHEMBL>	<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>	<http://www.w3.org/ns/dcat#Dataset> .
chembl is type dataset
<http://bio2rdf.org#CHEMBL>	<http://purl.org/dc/terms/title>	"CHEMBL: a large-scale bioactivity..."@en .
chembl has title *title
<http://bio2rdf.org#CHEMBL>	<http://www.w3.org/ns/dcat#distribution>	<http://bio2rdf.org#CHEMBLdatasetDistribution> .
chembl has distribution *http://bio2rdf.org#CHEMBLDistribution

### Distribution
<http://bio2rdf.org#CHEMBLdatasetDistribution>	<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>	<http://www.w3.org/ns/dcat#Distribution> .
chembl distribution is type dcat:Distributio
<http://bio2rdf.org#CHEMBLdatasetDistribution>	<http://www.w3.org/ns/dcat#downloadURL>	"https://fairsharing.org/biodbcore-000015"@en .  #register the parent
source https://fairsharing.org/biodbcore-000015
<http://bio2rdf.org#CHEMBLdatasetDistribution>	<http://purl.org/dc/terms/title>	"CHEMBL dataset" .
chamble distribution has title "CHEMBL dataset"@en.
<http://bio2rdf.org#CHEMBLdatasetDistribution>	<http://www.w3.org/ns/dqv#hasQualityMeasurement>	<http://www.w3.org/hcls#coverage> .

### Mesurement N
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>	<http://www.w3.org/ns/dqv#QualityMeasurement> .
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/ns/dqv#computedOn>	<http://bio2rdf.org#CHEMBLdatasetDistribution> . # registring the parent
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/ns/dqv#value>	"Approved Drug"@en .
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/ns/dqv#value>	"Biomedical Science"@en .
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/ns/dqv#value>	"Peptide"@en .
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/ns/dqv#value>	"Small Molecule"@en .
<http://www.w3.org/hcls#coverage>	<http://www.w3.org/ns/dqv#isMeasurementOf>	<http://www.w3.org/ns/dqv#R2metric> .

### Coverage (Metric)
<http://www.w3.org/ns/dqv#R2metric>	<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>	<http://www.w3.org/ns/dqv#Metric> .
<http://www.w3.org/ns/dqv#R2metric>	<http://www.w3.org/2004/02/skos/core#definition>	"Coverage of scope and datatypes in the dataset."@en .
<http://www.w3.org/ns/dqv#R2metric>	<http://www.w3.org/ns/dqv#expectedDatatype>	<http://www.w3.org/2001/XMLSchema#string> .
<http://www.w3.org/ns/dqv#R2metric>	<http://www.w3.org/ns/dqv#inDimension>	<http://www.w3.org/ns/dqv#Relevancy> . #HERE 
This metric is in dimension relevancy


### Dimension
<http://www.w3.org/ns/dqv#Relevancy>	<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>	<http://www.w3.org/ns/dqv#Dimension> .
<http://www.w3.org/ns/dqv#Relevancy>	<http://www.w3.org/2004/02/skos/core#prefLabel>	"Relevancy" . #just like putting a nice name
<http://www.w3.org/ns/dqv#Relevancy>	<http://www.w3.org/2004/02/skos/core#definition>	"Relevancy refers to the provision of information which is in accordance with the task at hand and important to the users’ query"@en .
<http://www.w3.org/ns/dqv#Relevancy>	<http://www.w3.org/ns/dqv#inCategory>	<http://www.w3.org/ns/dqv#Contextual> . #this is linked to the next one and viceversa
<http://www.w3.org/ns/dqv#Contextual> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>	<http://www.w3.org/ns/dqv#Category> .



In [125]:
# Add licensing information
#Adds preliminary statistics scraped from FAIRsharing.org to the graph
#:param fps: FAIRPrelimStats object
#if fps is none do:

In [124]:
# Add title to nodes
g.add((dataset, DCTERMS.title, Literal(title, lang="en")))
g.add((distribution, DCTERMS.title, Literal(title)))

In [None]:
# Add byte_size: Size of the data set in bytes (Float)
g.add((distribution, ns_dcat.byteSize, Literal(str(byte_size), datatype=XSD.decimal)))

In [None]:
#Adds dcat:downloadURL to distribution
g.add((distribution, ns_dcat.downloadURL, URIRef(url)))

In [None]:
#reates and adds a new measurement to the graph
#:param measurement_label: A unique label for the measurement. Leave empty for auto naming.
#:return: The new measurement node
if len(measurement_label) == 0:
    # Create a new measurement label
    n_measurements += 1
    measurement_label = 'measurement' + '%04d' % n_measurements

In [None]:
# Add license
g.add((measurement, ns_dqv.isMeasurementOf, ns_local.licensingMetric))
g.add((measurement, ns_dqv.value, Literal(license_string, datatype=XSD.string)))

In [None]:
# Add information on scopes and data types
g.add((measurement, ns_dqv.isMeasurementOf, ns_local.scopeAndDatatypesMetric))
g.add((measurement, ns_dqv.value, Literal(sad, datatype=XSD.string)))

In [None]:
# Add information on terminology artifacts
#for ta in tas:
g.add((measurement, ns_dqv.isMeasurementOf, ns_local.terminologyArtifactsMetric))
g.add((measurement, ns_dqv.value, Literal(ta, datatype=XSD.string)))

In [None]:
def serialize(self, file, format='ttl'):
    """Writes the RDF graph to file in the specified format

    :param file: Path to the file to write to (String)
    :param format: RDF format (default: 'ttl')
    :return:
    """
    try:
        # Write out turtle file
        self.g.serialize(destination=file, format=format)

        # Output message
        if config.verbose:
            print('Preliminary statistics in W3C DQV written to: ' + file)
    except IOError:
        sys.stderr.write('Error while trying to serialize preliminary stats RDF graph to file: ' + file + '\n')

In [126]:
for i in g:
    print(i)

(rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBL'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#Dataset'))
(rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBL'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#distribution'), rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBLDistribution'))
(rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBLDistribution'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#mediaType'), rdflib.term.Literal('application/rdf'))
(rdflib.term.URIRef('http://ncats.nih.gov/http://bio2rdf.org#CHEMBLDistribution'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/ns/dcat#Distribution'))


## Testing

In [None]:
# FAIRsharing.org URLs to test
urls = ['https://biosharing.org/biodbcore-000015',
        'https://biosharing.org/biodbcore-000037',
        'https://biosharing.org/biodbcore-000081',
        'https://biosharing.org/biodbcore-000095',
        'https://biosharing.org/biodbcore-000104',
        'https://biosharing.org/biodbcore-000137',
        'https://biosharing.org/biodbcore-000155',
        'https://biosharing.org/biodbcore-000156',
        'https://biosharing.org/biodbcore-000173',
        'https://biosharing.org/biodbcore-000304',
        'https://biosharing.org/biodbcore-000329',
        'https://biosharing.org/biodbcore-000330',
        'https://biosharing.org/biodbcore-000341',
        'https://biosharing.org/biodbcore-000417',
        'https://biosharing.org/biodbcore-000438',
        'https://biosharing.org/biodbcore-000441',
        'https://biosharing.org/biodbcore-000455',
        'https://biosharing.org/biodbcore-000470',
        'https://biosharing.org/biodbcore-000495',
        'https://biosharing.org/biodbcore-000525',
        'https://biosharing.org/biodbcore-000544',
        'https://biosharing.org/biodbcore-000552',
        'https://biosharing.org/biodbcore-000663',
        'https://biosharing.org/biodbcore-000730',
        'https://biosharing.org/biodbcore-000805',
        'https://biosharing.org/biodbcore-000826',
        'https://biosharing.org/biodbcore-000842',
        'https://fairsharing.org/biodbcore-000618',
        'https://fairsharing.org/biodbcore-000340']

# Write the results to the configured output folder
dir_output = config.path_output
if not os.path.exists(dir_output):
    os.mkdir(dir_output)

# List of preliminary statistics results
stats_list = []

# Process each url
for url in urls:
    # Scrape the page
    stats = fair_scraper.fair_scraper(url)
    stats_list.append(stats)

    # Output filename based on url
    filename = url.split('/')[-1] + '_rdf.ttl'
    output_file = os.path.join(dir_output, filename)

    # Use the dataset title as the local identifier
    dataset_id = "".join([c for c in stats.title if c.isalnum()]) + 'Dataset'

    # Write out preliminary statistics using W3C DQV
    stats_rdf = prelim_stats_rdf.PrelimStatsRDF(dataset_id, stats)
    stats_rdf.serialize(output_file, format='ttl')

# Run the scraper and write the results to CSV
file_output = os.path.join(dir_output, 'FAIRsharing_table.csv')
#fair_scraper.fair_table(stats_list, file_output)