# Fingerprint an rdf-file

A first exploration of how we could gather information from an rdf-resource that gives insight into its structure and extent.

In [1]:
from rdflib import ConjunctiveGraph, URIRef, Namespace

def get_quads(url, rdf_format='nquads'):
    """
    Load the rdf found at a url into a fresh rdflib.ConjunctiveGraph.

    :param url: location of the rdf-resource to parse
    :param rdf_format: serialization format of that resource ('nquads' unless given)

    :return: the rdflib.ConjunctiveGraph holding the parsed statements
    """
    graph = ConjunctiveGraph()
    # parse() fills the graph in place; its return value is not used here.
    graph.parse(url, format=rdf_format)
    return graph

In [5]:
from rdflib import Literal
import requests
from IPython.core.display import display, HTML
import urllib
from urllib.parse import urlparse


def fr(x):
    """
    Format a number with '.' as thousands separator, padded to a minimum width of 12.

    :param x: the number to format

    :return: the formatted string
    """
    # Let the format spec do the grouping with ',', then swap in the '.' separator.
    grouped = format(x, '12,')
    return grouped.replace(',', '.')

def isresolvable(url, timeout=10):
    """
    Tell whether a GET request on the given url completes successfully.

    'Resolvable' only means that the request returned a response whose
    ``ok`` flag is set; it says nothing about the content of that response.

    :param url: the url to request
    :param timeout: seconds to wait for the server before giving up

    :return: True if the request succeeded with an ok response, False otherwise
    """
    try:
        # Without a timeout a single unresponsive host would block forever.
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Best effort: connection problems, timeouts, unsupported schemes and
        # malformed urls all count as 'not resolvable'. KeyboardInterrupt and
        # SystemExit are deliberately not caught, so a long run can be aborted.
        return False
    return response.ok

class GraphProps(object):
    """
    Collects and prints simple statistics (a 'fingerprint') of an rdf graph.

    Attributes set by the constructor:

    - statements: len() of the graph
    - contexts: list of the contexts of the graph
    - netlocs_s, netlocs_p, netlocs_o: occurrence count per network location
      (the ``netloc`` part of the URI) for subjects, predicates and
      non-literal objects respectively
    - literals: number of triples whose object is a Literal
    - predicates: occurrence count per predicate URI
    """

    def __init__(self, g):
        """
        Gather the statistics in a single pass over all triples of the graph.

        :param g: the graph to inspect; it is used through len(), contexts()
            and triples()
        """
        self.statements = len(g)
        self.contexts = list(g.contexts())
        self.netlocs_s = dict()
        self.netlocs_p = dict()
        self.netlocs_o = dict()
        self.literals = 0
        self.predicates = dict()
        for s, p, o in g.triples((None, None, None)):
            self._count(self.predicates, str(p))
            self._count(self.netlocs_s, urlparse(str(s)).netloc)
            self._count(self.netlocs_p, urlparse(str(p)).netloc)

            # Literals have no network location; they are only counted.
            if isinstance(o, Literal):
                self.literals += 1
            else:
                self._count(self.netlocs_o, urlparse(str(o)).netloc)

    @staticmethod
    def _count(tally, key):
        """Increase the occurrence count of key in the tally dict by one."""
        tally[key] = tally.get(key, 0) + 1

    def print_props(self):
        """
        Print the collected statistics.

        Each predicate outside the 'example.org' domain is tested with
        isresolvable(); resolvable predicates are displayed as HTML links,
        all other predicates are printed as plain text.
        """
        # Local import: keeps this class usable without touching the imports of the file.
        from html import escape

        print('contexts  :', fr(len(self.contexts)))
        print('statements:', fr(self.statements))
        print('literals  :', fr(self.literals))
        # An empty graph has no statements; report a ratio of 0 instead of
        # failing with a ZeroDivisionError.
        nar = self.literals / self.statements if self.statements else 0.0
        print('literal ratio:', '{:7.2f}'.format(nar).replace('.', ','))

        print()
        print('subject netlocs:', len(self.netlocs_s), self.netlocs_s)
        print('object netlocs:', len(self.netlocs_o), self.netlocs_o)

        print()
        edo = 'example.org'
        edo_s = self.netlocs_s.get(edo, 0)
        edo_p = self.netlocs_p.get(edo, 0)
        edo_o = self.netlocs_o.get(edo, 0)
        print('example.org', '| s', edo_s, '| p', edo_p, '| o', edo_o, '| total', edo_s + edo_p + edo_o)

        print()
        print('predicate netlocs:', len(self.netlocs_p), '|', self.netlocs_p)
        print('predicates:', len(self.predicates))
        for key in sorted(self.predicates):
            count = self.predicates[key]
            # The netloc test comes first so that no request is made for example.org.
            if urlparse(key).netloc != edo and isresolvable(key):
                # The URI originates from the parsed data: escape it before
                # embedding it in HTML (both as attribute value and as text).
                safe = escape(key)
                display(HTML('<a target="_blank" href="{}">{}</a>'.format(safe, safe) + ': ' + str(count)))
            else:
                print('%s: %s' % (key, count))

In [6]:
def fingerprint(resource_url):
    """
    Display a heading and a link for the resource, then print the properties of its graph.

    :param resource_url: the url of the rdf-resource to fingerprint
    """
    heading = HTML("""<h3>fingerprint</h3>""")
    display(heading)
    link = HTML("""<a target="_blank" href="{}">{}</a>""".format(resource_url, resource_url))
    display(link)
    # Read the resource, gather its properties and print them.
    GraphProps(get_quads(resource_url)).print_props()

In [9]:
# Fingerprint a remote N-Quads (.nq) file; it is parsed with the default format of get_quads and the report is printed below.
fingerprint('https://data.anansi.clariah.nl/v5/resourcesync/u74ccc032adf8422d7ea92df96cd4783f0543db3b/gemeentegeschiedenisnl/dataset.nq')

contexts  :            1
statements:       45.783
literals  :       19.891
literal ratio:    0,43

subject netlocs: 7 {'www.opengis.net': 2873, 'gemeentegeschiedenis.nl': 32898, 'triply.cc': 8619, 'cbs.nl': 1388, 'www.w3.org': 2, 'rdfs.org': 2, 'www.gemeentegeschiedenis.nl': 1}
object netlocs: 7 {'triply.cc': 5746, 'gemeentegeschiedenis.nl': 15882, 'cbs.nl': 1387, 'www.opengis.net': 2873, 'rdfs.org': 1, 'www.w3.org': 2, 'www.gemeentegeschiedenis.nl': 1}

example.org | s 0 | p 0 | o 0 | total 0

predicate netlocs: 8 | {'www.w3.org': 17399, 'gemeentegeschiedenis.nl': 18311, 'www.opengis.net': 8619, 'cbs.nl': 1440, 'timbuctoo.huygens.knaw.nl': 2, 'rdfs.org': 4, 'purl.org': 6, 'xmlns.com': 2}
predicates: 21


http://timbuctoo.huygens.knaw.nl/static/v5/vocabulary#hasIndexConfig: 2


https://cbs.nl/def/gemeenteCode: 1440
https://gemeentegeschiedenis.nl/def/geometry_type: 2873
https://gemeentegeschiedenis.nl/def/id: 2881
https://gemeentegeschiedenis.nl/def/temporalExtension: 5804
https://gemeentegeschiedenis.nl/def/validSince: 3144
https://gemeentegeschiedenis.nl/def/validUntil: 3609


## Explanation

- `example.org` - how many subjects (s), predicates (p) and objects (o) have a URI in the `example.org` domain, and their total.
- Predicates are tested for resolvability. If the URI is resolvable, it is shown as a link (printed in blue). Resolvability in this sense means 'a request for that URI returns without error', which is not the same as 'gives intelligible information on that predicate'.

## ToDo

- extend resolvability to object-URIs.
- also look for descriptions of, e.g., predicate-URIs inside the dataset.