In [143]:
# Import the various libraries
import pandas as pd
import os
from SPARQLWrapper import SPARQLWrapper, JSON

In [144]:
# Folder layout used for reading and writing files.
# Every value ends with a separator because the paths are built by plain
# string concatenation further down.
folderlink = "..//data//"      # base data folder
input_folder = "input//"       # sub-folder holding the input files
folder_output = "output//"     # sub-folder receiving the generated files

In [145]:
# URL of the SPARQL endpoint that all queries in this notebook are sent to
sparql_endpoint = "http://data.bibliotheken.nl/sparql"

In [146]:
# We have an old data dump of the STCN dataset with the fields listed below. Since more titles have been added to the STCN dataset in the meantime, we want to generate a new dataset with the same fields from the SPARQL endpoint.
# Because the data is fetched from the endpoint, this script can be rerun in the future once more data has been added.
# A mapping of the fields in the original file is provided below. Since there are multiple one-to-many relationships, we generate a table for every field. We make sure that every table has a record number so the tables can be linked.

# recordnummer - record number
# q2 taal - schema:inLanguage 
# q3 jaar - schema:publication/schema:startDate 
# q4 land - schema:publication/schema:location
# q1 titelprimair - schema:name
# q5 titelsecundair - schema:alternateName
# q5 titelextra - schema:alternateName
# q6 auteurpersoon - schema:author/schema:name
# q7 auteursecundair - schema:contributor/schema:name
# q8 drukkervermelding - schema:publication/schema:description
# q9 drukkerpersoon - schema:publication/schema:publishedBy/schema:name
# q10 formaat - <http://data.bibliotheken.nl/def#bibliographicFormat> 
# aantekening - ?
# q11 trefwoord - schema:about/skos:label
# opmerking - ?
# signatuur - ?
# weblink - is "http://data.bibliotheken.nl/doc/nbt/p"+recordnumber

In [147]:
# Base query: one row per (?title_id, ?title) pair for resources whose
# main-entity page is part of the STCN dataset. ?title_id is the key that
# every other query in this notebook also returns.
# LIMIT 100 caps the result at 100 rows.
# NOTE(review): presumably a development limit - confirm before a full export.
# NOTE(review): no PREFIX declarations; presumably the endpoint predefines
# schema: and owl: - confirm.
primary_key = """
select ?title_id ?title where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""

In [148]:
# Send the base query to the endpoint and load the JSON bindings into `df`.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(primary_key)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
bindings = results["results"]["bindings"]
# Each binding maps a variable name to a dict; keep only its "value" entry.
data = [
    {key: cell["value"] for key, cell in item.items()}
    for item in bindings
]
df = pd.DataFrame(data)

In [149]:
# Language(s) per record. A record can have more than one language, so the
# result may contain several rows for the same ?title_id.
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query2 = {"language":
"""
select ?title_id ?language where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:inLanguage ?language .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [150]:
# Start date of the publication event per record (only records that have one
# are returned; the title itself is not part of this result).
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query3 = {"pub_year":
"""
select ?title_id ?pub_year where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:publication/schema:startDate ?pub_year .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [151]:
# Location of the publication event per record (only records that have one
# are returned; the title itself is not part of this result).
# The dict key matches the name of the result variable that holds the value.
# The query spells out the `where` keyword like every other query in this
# notebook.
# LIMIT 100 caps the result at 100 rows.
query4 = {"pub_location":
"""
select ?title_id ?pub_location where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:publication/schema:location ?pub_location .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [152]:
# Alternative title(s) per record (only records that have one are returned).
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query5 = {"alt_title":
"""
select ?title_id ?alt_title where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:alternateName ?alt_title .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [153]:
# Name of the author(s) per record.
# The dict key matches the name of the result variable that holds the value.
# Only variables that the graph pattern binds are selected; the former
# ?authorid projection was never bound and never used downstream.
# LIMIT 100 caps the result at 100 rows.
query6 = {"author":
"""
select ?title_id ?author where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:author/schema:name ?author .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [154]:
# Name of the contributor(s) (co-authors) per record, if present.
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query7 = {"co_author":
"""
select ?title_id ?co_author where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:contributor/schema:name ?co_author.
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [155]:
# Description attached to the publication event per record (mapped to the
# old "drukkervermelding" field in the field list earlier in this notebook).
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query8 = {"pub_description":
"""
select ?title_id ?pub_description where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:publication/schema:description ?pub_description .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [156]:
# Name of the publisher(s) per record, reached through the publication event.
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query9 = {"publisher_name":
"""
select ?title_id ?publisher_name where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:publication/schema:publishedBy/schema:name ?publisher_name .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [157]:
# Bibliographic format of the publication per record.
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
query10 = {"format":
"""
select ?title_id ?format where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p <http://data.bibliotheken.nl/def#bibliographicFormat> ?format .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [158]:
# Keyword label(s) per record, taken from the skos:label of each schema:about
# value.
# The dict key matches the name of the result variable that holds the value.
# LIMIT 100 caps the result at 100 rows.
# NOTE(review): no PREFIX declaration for skos:; presumably predefined by the
# endpoint - confirm.
query11 = {"keywords":
"""
select ?title_id ?keywords where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:about/skos:label ?keywords .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} LIMIT 100
"""}

In [159]:
# Combine the single-entry query dicts into one mapping of
# column name -> SPARQL query, keeping the order query2 ... query11.
query_dict = {}
for single_query in (query2, query3, query4, query5, query6,
                     query7, query8, query9, query10, query11):
    query_dict.update(single_query)

In [160]:
# Run every per-field query and left-join its result onto the base table `df`.
# After the loop, `df` and `df_join` both refer to the fully joined table.
for name, query in query_dict.items():
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    bindings = results["results"]["bindings"]
    # Each binding maps a variable name to a dict; keep only its "value" entry.
    data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]

    df_data = pd.DataFrame(data)
    if df_data.empty:
        # No bindings means a frame without columns, on which the groupby below
        # would raise a KeyError. Merge an empty table instead so the field
        # still shows up as an all-NaN column.
        result = pd.DataFrame(columns=['title_id', name])
    else:
        # A record can have several values for one field: collapse them into a
        # single "; "-separated string per title_id.
        result = df_data.groupby('title_id')[name].apply('; '.join).reset_index()

    # Drop a column left over from an earlier run of this cell; otherwise the
    # merge would add suffixed duplicates (<name>_x / <name>_y).
    df = df.drop(columns=name, errors='ignore')
    df_join = pd.merge(df, result, left_on='title_id', right_on='title_id', how='left')
    df = df_join
    

In [161]:
# pandas display settings: lift the limits on shown columns, rows and
# column width so frames are displayed without truncation.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

In [167]:
# Next step: join the SPARQL-based table with the Excel sheet we received.

# Location of the Excel workbook inside the input folder
file_path = f"{folderlink}{input_folder}STCNtotaalVERRIJKT.xlsx"

# Read the workbook into a DataFrame
STCN_old = pd.read_excel(file_path, engine='openpyxl')

In [181]:
# Derive the join key 'title_id' from the old 'recordnummer' column: first
# remove every ")" and then replace "(0: " with the URI prefix.
# regex=False makes both replacements literal. Before pandas 2.0 the default
# was regex=True, under which "(0: " is an invalid pattern (unbalanced
# parenthesis), so relying on the default breaks on older installations.
STCN_old['short_title_id'] = STCN_old['recordnummer'].str.replace(")", "", regex=False)
STCN_old['title_id'] = STCN_old['short_title_id'].str.replace(
    "(0: ", "http://data.bibliotheken.nl/doc/nbt/p", regex=False
)

In [184]:
# Keep the join key plus the columns of the old export that are added to the
# new table in the merge that follows.
extra_columns = ['title_id', 'aantFred', 'confessie', 'genre', 'thema']
STCN_extra_data = STCN_old[extra_columns]

In [189]:
# Left-join the extra columns onto the SPARQL-based table via 'title_id',
# keeping every row of df_join.
STCN_joined = df_join.merge(
    STCN_extra_data,
    how='left',
    left_on='title_id',
    right_on='title_id',
)

In [163]:
# Write the SPARQL-based table to a semicolon-separated UTF-8 CSV file.
dump_path = f"{folderlink}{folder_output}STCN_Dump.csv"
df_join.to_csv(dump_path, sep=';', encoding='utf-8', index=False)

In [190]:
# Write the table enriched with the old export's columns to a
# semicolon-separated UTF-8 CSV file.
dump_extra_path = f"{folderlink}{folder_output}STCN_Dump_extra.csv"
STCN_joined.to_csv(dump_extra_path, sep=';', encoding='utf-8', index=False)

In [191]:
# Also export both tables as Excel workbooks in the output folder.
excel_STCN = f"{folderlink}{folder_output}excel_STCN.xlsx"
excel_STCN_extra = f"{folderlink}{folder_output}excel_STCN_extra.xlsx"

df_join.to_excel(excel_STCN, index=False, engine='openpyxl')
STCN_joined.to_excel(excel_STCN_extra, index=False, engine='openpyxl')