In [1]:
# Import the various libraries
import pandas as pd
import os
from SPARQLWrapper import SPARQLWrapper, JSON

import time
from datetime import datetime
# Moment this notebook run started, kept as a datetime object
current_date = datetime.now()
# Day, month and four-digit year glued together (DDMMYYYY); used in the export file names
formatted_date = f'{current_date:%d%m%Y}'

In [2]:
# File locations; the pieces are concatenated as plain strings when the export paths are built
folderlink = "..//data//"      # root of the data folder
input_folder = "input//"       # not referenced by the cells shown in this notebook
folder_output = "output//"     # appended to folderlink for the CSV / Excel exports

In [3]:
# SPARQL endpoint that every query in this notebook is sent to
sparql_endpoint = 'http://data.bibliotheken.nl/sparql'

In [4]:
# To link the individuals from CLERUS with book titles, this script generates a data dump of book-title metadata from the Koninklijke Bibliotheek. The dataset of interest for linking with CLERUS is the "Short Title Catalogue Netherlands" (STCN).
# At the request of the project's Lead Applicant (LA), the following fields have been identified as interesting. The LA requested an Excel sheet with one unique row per book title. Since the dataset contains many one-to-many relationships, every field is retrieved with its own SPARQL query, and when a record has multiple child values these are combined into the same field.

# Below is an overview of the various queries
# q1 the primary key i.e. the recordnummers
# q2 taal - schema:inLanguage 
# q3 jaar - schema:publication/schema:startDate 
# q4 land - schema:publication/schema:location
# q1 titelprimair - schema:name
# q5 titelsecundair - schema:alternateName
# q5 titelextra - schema:alternateName
# q6 auteurpersoon - schema:author/schema:name
# q7 auteursecundair - schema:contributor/schema:name
# q8 drukkervermelding - schema:publication/schema:description
# q9 drukkerpersoon - schema:publication/schema:publishedBy/schema:name
# q10 formaat - <http://data.bibliotheken.nl/def#bibliographicFormat> 
# q11 trefwoord - schema:about/skos:label

In [5]:
# Dataset to export. With "stcn" the script produces the STCN dump containing the fields
# requested by the project's Lead Applicant. To generate the same dump for the
# Nederlandse Bibliografie Totaal (NBT), change "stcn" into "nbt".

dataset_kb = 'stcn'

In [6]:
# Primary-key query: every schema:Book in the chosen dataset together with its title.
# Written as an f-string, so the literal SPARQL braces are doubled ({{ and }}).
primary_key = f"""
SELECT ?title_id ?title WHERE {{
?title_id a schema:Book .
?title_id schema:name ?title .
?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
}}

"""

In [7]:
# Run the primary-key query and build the base table (one row per record) that all
# other fields are merged onto later.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(primary_key)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
bindings = results["results"]["bindings"]

# Each binding maps a variable name to a dict describing the value; keep only the plain value.
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]

# Name the columns explicitly so that an empty result still yields a frame with a
# 'title_id' column; without it the merges further down fail with a KeyError.
df = pd.DataFrame(data, columns=['title_id', 'title'])

# The export must contain one unique row per record. A record that carries more than one
# schema:name comes back as several rows, so collapse those into a single '; '-separated
# title, the same way multi-valued fields are handled elsewhere in this notebook.
if df['title_id'].duplicated().any():
    df = (
        df.drop_duplicates()
        .groupby('title_id', sort=False)['title']
        .apply('; '.join)
        .reset_index()
    )

In [8]:
# Language(s) of every record in the dataset; a record can have more than one language,
# in which case it is returned once per language. The dict key is the name of the field.
query2 = {"language": f"""
select ?title_id ?language where {{
  ?title_id schema:inLanguage ?language .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  
"""}

In [9]:
# Year of publication (schema:publication/schema:startDate) for every record that has one.
# The dict key is the name of the field.
query3 = {"pub_year": f"""
select ?title_id ?pub_year where {{
  ?title_id schema:publication/schema:startDate ?pub_year .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  
"""}

In [10]:
# Publication location (schema:publication/schema:location) for every record that has one.
# Unlike the other queries this one leaves out the WHERE keyword, which SPARQL allows.
query4 = {"pub_location": f"""
select ?title_id ?pub_location {{
  ?title_id schema:publication/schema:location ?pub_location .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  
"""}

In [11]:
# Alternative title(s) (schema:alternateName) for every record that has one.
# The dict key is the name of the field.
query5 = {"alt_title": f"""
select ?title_id ?alt_title where {{
  ?title_id schema:alternateName ?alt_title .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  
"""}

In [12]:
# Name of the main author (schema:author/schema:name) for every record that has one.
# NOTE(review): ?authorid is selected but never bound in the pattern below, so it carries
# no value in the results - confirm whether the author identifier was meant to be fetched.
query6 = {"author": f"""
select ?title_id ?author ?authorid where {{
  ?title_id schema:author/schema:name ?author .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .  
  }}
  

"""}

In [13]:
# Name(s) of the co-author(s) (schema:contributor/schema:name) for every record that has any.
# The dict key is the name of the field.
query7 = {"co_author": f"""
select ?title_id ?co_author where {{
  ?title_id schema:contributor/schema:name ?co_author.

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  

"""}

In [14]:
# Additional information about the publishing entity (schema:publication/schema:description)
# for every record that has it. The dict key is the name of the field.
query8 = {"pub_description": f"""
select ?title_id ?pub_description where {{
  ?title_id schema:publication/schema:description ?pub_description .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  

"""}

In [15]:
# Name of the publisher (schema:publication/schema:publishedBy/schema:name) for every
# record that has one. The dict key is the name of the field.
query9 = {"publisher_name": f"""
select ?title_id ?publisher_name where {{
  ?title_id schema:publication/schema:publishedBy/schema:name ?publisher_name .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  

"""}

In [16]:
# Bibliographic format of the publication (the KB-specific bibliographicFormat property)
# for every record that has one. The dict key is the name of the field.
query10 = {"format": f"""
select ?title_id ?format where {{
  ?title_id <http://data.bibliotheken.nl/def#bibliographicFormat> ?format .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  

"""}

In [17]:
# Keywords assigned to every record (schema:about/skos:label).
# NOTE(review): the SKOS vocabulary itself defines prefLabel / altLabel rather than label -
# confirm that skos:label is the property actually used in the endpoint's data.
query11 = {"keywords": f"""
select ?title_id ?keywords where {{
  ?title_id schema:about/skos:label ?keywords .

  ?title_id schema:name ?title .
  ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/{dataset_kb}> .
  }}
  
"""}

In [18]:
# Collect all single-field queries in one mapping of field name -> SPARQL text,
# keeping the order query2 ... query11.
query_dict = {}
for field_query in (query2, query3, query4, query5, query6,
                    query7, query8, query9, query10, query11):
    query_dict.update(field_query)

In [19]:
# For every field: run its query, collapse multiple values per record into a single
# '; '-separated string, and attach the outcome to the title table as a column named
# after the field. After the loop `df` and `df_join` both hold the complete table.
for name, query in query_dict.items():
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    bindings = results["results"]["bindings"]

    # Each binding maps a variable name to a dict describing the value; keep only the plain value.
    data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]

    # Name the columns explicitly: a query without any hits would otherwise give a frame
    # with no columns at all, and the groupby below would raise a KeyError.
    df_data = pd.DataFrame(data, columns=['title_id', name])

    # Every field query also matches schema:name, so a record with several names returns
    # each value once per name. Drop those repeats (and incomplete rows, which cannot be
    # joined as text) so a value is not listed twice in the combined field.
    df_data = df_data.dropna().drop_duplicates()

    if df_data.empty:
        # Nothing found for this field: keep the column in the export, left empty.
        result = pd.DataFrame(columns=['title_id', name])
    else:
        result = df_data.groupby('title_id')[name].apply('; '.join).reset_index()

    # When this cell is run a second time the column already exists; remove it first so
    # the merge does not produce '<name>_x' / '<name>_y' duplicates.
    df = df.drop(columns=[name], errors='ignore')

    # Left join keeps every record of the title table, also those without a value.
    df_join = pd.merge(df, result, on='title_id', how='left')
    df = df_join
    

In [20]:
# pandas display settings: lift the limits on shown columns, shown rows and column width
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

In [21]:
# Write the combined table as a ';'-separated UTF-8 CSV named <dataset>_Dump_<DDMMYYYY>.csv.
# The output folder is created first: to_csv does not create missing directories and
# would fail with an OSError when the folder is absent.
os.makedirs(folderlink+folder_output, exist_ok=True)
df_join.to_csv(folderlink+folder_output+dataset_kb+"_Dump_"+formatted_date+".csv", sep=';', encoding='utf-8', index=False)

In [22]:
# Write the combined table as an Excel workbook named <dataset>_excel_<DDMMYYYY>.xlsx.
# The output folder is created first: to_excel does not create missing directories.
# NOTE(review): an .xlsx sheet holds at most 1,048,576 rows - confirm the dump stays below
# that limit before switching dataset_kb to a larger dataset.
os.makedirs(folderlink+folder_output, exist_ok=True)
excel_export = folderlink+folder_output+dataset_kb+"_excel_"+formatted_date+".xlsx"
df_join.to_excel(excel_export, index=False, engine='openpyxl')
