# Functions to explore the ResourceSync interface of Timbuctoo

In [1]:
!pip install rdflib -U

Requirement already up-to-date: rdflib in /anaconda3/lib/python3.6/site-packages (4.2.2)


In [2]:
import rdflib
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import datetime
import dateutil.parser

In [3]:
# ResourceSync entry-point URLs; these are the values handed to list_datasets()
# further down in this notebook.
ANANSI_URL = 'https://data.anansi.clariah.nl/.well-known/resourcesync'
HUYDAT_URL = 'https://repository.huygens.knaw.nl/v5/resourcesync/sourceDescription.xml'

# Presumably the GraphiQL pages of the same two hosts (judging by the path);
# not referenced by any code visible in this notebook.
ANANSI_GIQL = 'https://data.anansi.clariah.nl/static/graphiql'
HUYDAT_GIQL = 'https://repository.huygens.knaw.nl/static/graphiql'

# Prefix -> namespace URI map passed to ElementTree find()/findall(), so that
# paths such as 'sm:loc' and 'rs:md' resolve to the right XML namespaces.
RSNS = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9',
          'rs': 'http://www.openarchives.org/rs/terms/'}

def convert_to_date(xml_string):
    """Turn a textual timestamp into a date/time object.

    :param xml_string: timestamp text, or a falsy value (``None``, ``''``)
    :return: whatever ``dateutil.parser.parse`` produces for a non-empty
             string; ``None`` for a falsy argument
    """
    if not xml_string:
        # Nothing to parse (attribute absent or empty).
        return None
    return dateutil.parser.parse(xml_string)

In [4]:
class RsUrl(object):
    """One ``<url>`` entry of a sitemap / ResourceSync document.

    Attributes set by the constructor (each is ``None`` when the element or
    XML attribute it is read from is missing):

    * ``loc`` -- text of the ``sm:loc`` child
    * ``capability``, ``type``, ``change`` -- same-named attributes of the
      ``rs:md`` child
    * ``datetime`` -- ``datetime`` attribute of ``rs:md``, run through
      ``convert_to_date``
    * ``describedby_href``, ``describedby_type`` -- ``href`` / ``type`` of the
      ``rs:ln`` child whose ``rel`` is ``describedby``
    * ``patch_href``, ``patch_type`` -- ``href`` / ``type`` of the ``rs:ln``
      child whose ``rel`` is the ResourceSync ``patch`` term
    """

    def __init__(self, element):
        """Read the fields of interest from ``element``.

        :param element: ElementTree element of the ``<url>`` entry
        """
        loc_el = element.find('sm:loc', RSNS)
        self.loc = loc_el.text if loc_el is not None else None

        # A <url> without an rs:md child (e.g. an entry of a plain sitemap)
        # makes find() return None; fall back to an empty attribute mapping
        # instead of failing with AttributeError on None.get().
        rsmd = element.find('rs:md', RSNS)
        md = rsmd.attrib if rsmd is not None else {}
        self.capability = md.get('capability')
        self.datetime = convert_to_date(md.get('datetime'))
        self.type = md.get('type')
        self.change = md.get('change')

        self.describedby_href = None
        self.describedby_type = None
        self.patch_href = None
        self.patch_type = None
        # When several links share a rel, the last one in document order wins.
        for ln in element.findall('rs:ln', RSNS):
            rel = ln.get('rel')
            if rel == 'describedby':
                self.describedby_href = ln.get('href')
                self.describedby_type = ln.get('type')
            elif rel == 'http://www.openarchives.org/rs/terms/patch':
                self.patch_href = ln.get('href')
                self.patch_type = ln.get('type')

    def __repr__(self):
        """Show all instance attributes as a dict literal."""
        return str(self.__dict__)

In [5]:
class Sitemap(object):
    """A parsed sitemap / ResourceSync document.

    Attributes set by the constructor (each is ``None`` when the element or
    XML attribute it is read from is missing):

    * ``loc`` -- the location the document was obtained from (as passed in)
    * ``capability`` -- ``capability`` attribute of the root ``rs:md``
    * ``at``, ``completed``, ``from_``, ``until`` -- the ``at``, ``completed``,
      ``from`` and ``until`` attributes of the root ``rs:md``, each run
      through ``convert_to_date``
    * ``up_href`` -- ``href`` of the root ``rs:ln`` with ``rel="up"``
    * ``describedby_href``, ``describedby_type`` -- ``href`` / ``type`` of the
      root ``rs:ln`` with ``rel="describedby"``
    * ``rs_urls`` -- list with one ``RsUrl`` per ``sm:url`` child of the root
    """

    def __init__(self, loc, text):
        """Parse ``text`` and collect the document-level metadata and entries.

        :param loc: location of the document; stored unchanged
        :param text: XML source of the document
        """
        self.loc = loc
        root = ET.fromstring(text)

        # A document without a root-level rs:md (e.g. a plain sitemap) makes
        # find() return None; use an empty attribute mapping instead of
        # failing with AttributeError on None.get().
        rsmd = root.find('rs:md', RSNS)
        md = rsmd.attrib if rsmd is not None else {}
        self.capability = md.get('capability')
        self.at = convert_to_date(md.get('at'))
        self.completed = convert_to_date(md.get('completed'))
        self.from_ = convert_to_date(md.get('from'))
        self.until = convert_to_date(md.get('until'))

        self.up_href = None
        self.describedby_href = None
        self.describedby_type = None
        # When several links share a rel, the last one in document order wins.
        for ln in root.findall('rs:ln', RSNS):
            rel = ln.get('rel')
            if rel == 'up':
                self.up_href = ln.get('href')
            elif rel == 'describedby':
                self.describedby_href = ln.get('href')
                self.describedby_type = ln.get('type')

        self.rs_urls = [RsUrl(url_el) for url_el in root.findall('sm:url', RSNS)]

    def __str__(self):
        """Show all instance attributes as a dict literal."""
        return str(self.__dict__)

In [6]:
def get_sitemap(url, timeout=60):
    """Download the document at ``url`` and parse it into a ``Sitemap``.

    :param url: location of the sitemap document
    :param timeout: seconds to wait for the server, forwarded to
                    ``requests.get``; without a timeout an unresponsive server
                    would block the notebook indefinitely
    :return: ``Sitemap`` built from the response body
    :raises Exception: when the response status is anything other than 200 OK
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != requests.codes.ok:
        raise Exception("Error response from %s: %d %s" % (url, response.status_code, response.reason))
    # Bytes that are not valid UTF-8 are replaced rather than raising.
    text = str(response.content, 'utf-8', errors='replace')
    return Sitemap(url, text)

In [7]:
def list_datasets(url):
    """Yield one name per entry of the sitemap found at ``url``.

    The name is the piece at index 6 after splitting the entry's ``loc``
    on ``'/'``.

    :param url: location of the sitemap document (fetched via ``get_sitemap``)
    :return: lazy ``map`` iterator over the extracted names
    """
    def name_from_entry(entry):
        # Position 6 of the '/'-separated loc is taken as the name.
        parts = entry.loc.split('/')
        return parts[6]

    entries = get_sitemap(url).rs_urls
    return map(name_from_entry, entries)

In [8]:
# Print every name that list_datasets() yields for HUYDAT_URL, one per line.
for ds in list_datasets(HUYDAT_URL):
    print(ds)

emplaces_25geonames
emplaces
dwc
charterportaal
bioport
migratiegids
vocopvarenden2
prizepapers
emlo_oppole
emlo_oppole20180627
demo
emdates_places
emlo_oppole20180625
cor19_7
cor19_8
ckcc_a
clusius
ppds
emlo_smalltest2
emlo_smalltest
soundtoll_11
raa
hg_2
ogt_2
constitutionele_commissies
rsg_2
plakaatboek
netwerk_verwey
donb
women_writers
declercq
test_ww
missiezending
gp_2
opsporingverificatie
amstelveen_1
sport
test_wwdocument
emigratie


In [9]:
# Print every name that list_datasets() yields for ANANSI_URL, one per line.
for ds in list_datasets(ANANSI_URL):
    print(ds)

wwriters_nl
gemeentegeschiedenisnl
dwc
nlgis
gemeentegeschiedenis
bioport
tbi
tic2new
personen
concepten
plaatsen


In [10]:
def get_graph(url, d_type):
    """Load the RDF document at ``url`` into a new ``rdflib.Graph``.

    :param url: source handed to ``Graph.parse``
    :param d_type: value passed as the ``format`` argument of ``Graph.parse``
    :return: the graph object that was populated
    """
    graph = rdflib.Graph()
    graph.parse(url, format=d_type)
    return graph

In [11]:
# Description documents of other datasets; uncomment one to inspect it instead.
#desc_url = 'https://data.anansi.clariah.nl/v5/resourcesync/u74ccc032adf8422d7ea92df96cd4783f0543db3b/dwc/description.xml'
#desc_url = 'https://data.anansi.clariah.nl/v5/resourcesync/u74ccc032adf8422d7ea92df96cd4783f0543db3b/nlgis/description.xml'
desc_url = 'https://data.anansi.clariah.nl/v5/resourcesync/u74ccc032adf8422d7ea92df96cd4783f0543db3b/bioport/description.xml'
desc_type = 'application/rdf+xml'
# Load the description document into an rdflib graph and show its repr.
g = get_graph(desc_url, desc_type)
print(g)
# First pass: the predicates that occur in the graph.
for pred in g.predicates():
    print(pred)
        
# Second pass: each predicate together with its object.
for pred, obj in g.predicate_objects():
    print(pred, obj)
        
# Size of the graph as reported by len().
print(len(g))

[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'IOMemory']].
http://purl.org/dc/terms/abstract
http://purl.org/dc/terms/rightsHolder
http://purl.org/dc/terms/title
http://schema.org/ContactPoint
http://purl.org/dc/terms/description
http://purl.org/dc/terms/license
http://purl.org/dc/terms/provenance
http://purl.org/dc/terms/abstract https://data.anansi.clariah.nl/datasets/u74ccc032adf8422d7ea92df96cd4783f0543db3b/bioport/summaryProperties
http://purl.org/dc/terms/rightsHolder https://data.anansi.clariah.nl/datasets/u74ccc032adf8422d7ea92df96cd4783f0543db3b/bioport/rightsHolder
http://purl.org/dc/terms/title Biography portal of the Netherlands
http://schema.org/ContactPoint https://data.anansi.clariah.nl/datasets/u74ccc032adf8422d7ea92df96cd4783f0543db3b/bioport/contactPerson
http://purl.org/dc/terms/description An online collection of reference works and data sets currently scattered over the internet, containing biographical information on notable persons in Dutch history, fr

The `profileType` definition below is taken from [https://easy.dans.knaw.nl/schemas/md/2018/03/ddm.xsd](https://easy.dans.knaw.nl/schemas/md/2018/03/ddm.xsd):

```xml
    <xs:complexType name="profileType">
        <xs:annotation>
            <xs:documentation xml:lang="en">
                Detailed specification of the information that is essential for profiling the dataset in the Easy application.
            </xs:documentation>
        </xs:annotation>
        <xs:sequence>
            <xs:element ref="dc:title" maxOccurs="unbounded"/>
            <xs:element ref="dc:description" maxOccurs="unbounded"/>
            <xs:element ref="dc:creator" maxOccurs="unbounded"/>
            <xs:element ref="ddm:created"/>                        <!--substitutionGroup="dcterms:created" type="dcterms:W3CDTF-->
            <xs:element ref="ddm:available"/>                      <!--substitutionGroup="dcterms:available" type="dcterms:W3CDTF-->
            <xs:element ref="ddm:audience" maxOccurs="unbounded"/> <!--substitutionGroup="dcterms:audience" type="narcis:DisciplineType"-->
            <xs:element ref="ddm:accessRights"/>                   <!--substitutionGroup="dcterms:accessRights" type="ddm:EasyAccessRightsType"-->
        </xs:sequence>
    </xs:complexType>
```

In [9]:
# ds_url = 'https://data.anansi.clariah.nl/v5/resourcesync/u74ccc032adf8422d7ea92df96cd4783f0543db3b/gemeentegeschiedenisnl/dataset.nq'
# ds_type = 'application/n-quads'

# g = get_graph(ds_url, ds_type)

# print(len(g))

# for pred in g.predicates():
#     print(pred)

0


In [12]:
from rdflib import ConjunctiveGraph, URIRef, Namespace

def get_quads(url):
    """Load the N-Quads document at ``url`` into a new ``ConjunctiveGraph``.

    :param url: source handed to ``ConjunctiveGraph.parse``
    :return: the ``ConjunctiveGraph`` that was populated (not the value
             returned by ``parse``)
    """
    quad_store = ConjunctiveGraph()
    quad_store.parse(url, format="nquads")
    return quad_store

In [11]:
# ds_url = 'https://data.anansi.clariah.nl/v5/resourcesync/u74ccc032adf8422d7ea92df96cd4783f0543db3b/gemeentegeschiedenisnl/dataset.nq'

# g = get_quads(ds_url)

# print(g, len(g))

[a rdflib:ConjunctiveGraph;rdflib:storage [a rdflib:Store;rdfs:label 'IOMemory']] 45783


In [5]:
#!pip install bsddb3

Collecting bsddb3
  Using cached https://files.pythonhosted.org/packages/e9/fc/ebfbd4de236b493f9ece156f816c21df0ae87ccc22604c5f9b664efef1b9/bsddb3-6.2.6.tar.gz
    Complete output from command python setup.py egg_info:
    Can't find a local Berkeley DB installation.
    (suggestion: try the --berkeley-db=/path/to/bsddb option)
    
    ----------------------------------------
[31mCommand "python setup.py egg_info" failed with error code 1 in /private/var/folders/75/tvwk70950hbbx4nj15q99_t40000gn/T/pip-install-9cyqy794/bsddb3/[0m
