In [1]:
# Import the various libraries
import os
import time

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
from datetime import datetime

# Moment at which this notebook run started.
current_date = datetime.now()
# Day, month and year as eight digits (DDMMYYYY); used as suffix in the names
# of the exported CSV files.
formatted_date = format(current_date, '%d%m%Y')

In [3]:
# Folder settings. Every value ends with a separator so the parts can be glued
# together with plain string concatenation.
folderlink = "..//data//"      # data root, relative to the notebook
input_folder = "input//"       # sub-folder for input files
folder_output = "output//"     # sub-folder that receives the exported CSV files

In [4]:
# Endpoint that every SPARQL query in this notebook is sent to.
sparql_endpoint = 'http://data.bibliotheken.nl/sparql'

In [5]:
# Identifier of the dataset to extract. It is appended to
# http://data.bibliotheken.nl/id/dataset/ inside the queries and is also the
# prefix of the exported file names.
dataset_kb = 'nbt'

In [6]:
def query_sparql_and_convert_to_df(sparql_endpoint, query):
    """Run a SPARQL SELECT query and return its solutions as a DataFrame.

    Parameters
    ----------
    sparql_endpoint : str
        URL of the SPARQL endpoint to query.
    query : str
        Query text; it is sent to the endpoint unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per solution and one column per query variable. Each cell
        holds the ``"value"`` string of the binding; the binding's other
        fields (type, language, datatype) are discarded. Columns follow the
        variable order announced in the response header, so a result without
        any solutions still has the expected columns. Variables that are
        unbound in a solution are left as NaN.

    Raises
    ------
    KeyError
        If the response has no ``results``/``bindings`` section.
    Exception
        Whatever SPARQLWrapper raises for transport or endpoint errors.
    """
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # Flatten every solution from {var: {"type": ..., "value": ...}} to {var: value}.
    rows = [
        {name: binding["value"] for name, binding in solution.items()}
        for solution in results["results"]["bindings"]
    ]

    # Take the column list from the response header rather than from the rows:
    # without it an empty page yields a frame with no columns at all, which
    # breaks later merges on 'title_id'.
    columns = list(results.get("head", {}).get("vars", []))
    # Defensive: keep any variable that shows up in a row but not in the header.
    for row in rows:
        for name in row:
            if name not in columns:
                columns.append(name)

    return pd.DataFrame(rows, columns=columns)

In [7]:
# Fetch every (title_id, title) pair of the dataset. This frame is the left-hand
# side of the joins further down, so it determines which titles are exported.
#
# NOTE(review): the query has no ORDER BY. SPARQL only guarantees that
# LIMIT/OFFSET windows are disjoint and complete when the order is fixed, so
# paging beyond the first window relies on the endpoint returning a stable
# order -- confirm, or add ORDER BY if the server can afford it.
page_size = 10000000    # rows requested per query; also the OFFSET step
max_offset = 30000000   # exclusive upper bound for OFFSET (at most 3 pages)

pages = []
for offset in range(0, max_offset, page_size):
    query = """
    SELECT ?title_id ?title WHERE {
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # OFFSET is already past the last solution; larger offsets cannot
        # return rows either, so spare the endpoint the remaining requests.
        break
    time.sleep(1)  # short pause between requests to go easy on the endpoint

# Concatenate once instead of re-copying the growing frame on every iteration.
primary_key = pd.concat(pages, ignore_index=True)


In [8]:
# The KB dataset stores multiple spellings of an author's name, reachable via:
#   ?title_id schema:author/schema:name ?author .
#   ?title_id schema:author/schema:birthDate ?birthdate .
# Ideally all spellings would be retrieved. Because of the limited server
# response from the KB, only the rdfs:label is fetched (it also contains the
# year of birth) and the parts are extracted from it later on.
# For future analyses it is recommended to optimise this.
#
# NOTE(review): the query has no ORDER BY. SPARQL only guarantees that
# LIMIT/OFFSET windows are disjoint and complete when the order is fixed, so
# paging beyond the first window relies on the endpoint returning a stable
# order -- confirm, or add ORDER BY if the server can afford it.
page_size = 10000000    # rows requested per query; also the OFFSET step
max_offset = 50000000   # exclusive upper bound for OFFSET (at most 5 pages)

pages = []
for offset in range(0, max_offset, page_size):
    query = """
    SELECT ?title_id ?author WHERE {
        ?title_id schema:author/rdfs:label ?author .
        ?title_id a schema:Book .
        ?title_id schema:name ?title .
        ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # OFFSET is already past the last solution; larger offsets cannot
        # return rows either, so spare the endpoint the remaining requests.
        break
    time.sleep(2)  # pause between requests to go easy on the endpoint

# Concatenate once instead of re-copying the growing frame on every iteration.
author = pd.concat(pages, ignore_index=True)

In [9]:
# Fetch the label of every contributor (schema:contributor) per book, one row
# per (title_id, co_author) pair.
#
# NOTE(review): the query has no ORDER BY. SPARQL only guarantees that
# LIMIT/OFFSET windows are disjoint and complete when the order is fixed, so
# paging beyond the first window relies on the endpoint returning a stable
# order -- confirm, or add ORDER BY if the server can afford it.
page_size = 10000000    # rows requested per query; also the OFFSET step
max_offset = 50000000   # exclusive upper bound for OFFSET (at most 5 pages)

pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?co_author where {
        ?title_id schema:contributor/rdfs:label ?co_author.
        ?title_id a schema:Book .
        ?title_id schema:name ?title .
        ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # OFFSET is already past the last solution; larger offsets cannot
        # return rows either, so spare the endpoint the remaining requests.
        break
    time.sleep(1)  # short pause between requests to go easy on the endpoint

# Concatenate once instead of re-copying the growing frame on every iteration.
co_author = pd.concat(pages, ignore_index=True)

In [10]:
# Fetch the start date of each book's publication event
# (schema:publication/schema:startDate), one row per (title_id, pub_year) pair.
#
# NOTE(review): the query has no ORDER BY. SPARQL only guarantees that
# LIMIT/OFFSET windows are disjoint and complete when the order is fixed, so
# paging beyond the first window relies on the endpoint returning a stable
# order -- confirm, or add ORDER BY if the server can afford it.
page_size = 10000000    # rows requested per query; also the OFFSET step
max_offset = 30000000   # exclusive upper bound for OFFSET (at most 3 pages)

pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?pub_year where {
    ?title_id schema:publication/schema:startDate ?pub_year .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # OFFSET is already past the last solution; larger offsets cannot
        # return rows either, so spare the endpoint the remaining requests.
        break
    time.sleep(1)  # short pause between requests to go easy on the endpoint

# Concatenate once instead of re-copying the growing frame on every iteration.
pub_year = pd.concat(pages, ignore_index=True)

In [11]:
# Fetch the name of the location of each book's publication event
# (schema:publication/schema:location/schema:name), one row per
# (title_id, pub_location) pair.
#
# NOTE(review): the query has no ORDER BY. SPARQL only guarantees that
# LIMIT/OFFSET windows are disjoint and complete when the order is fixed, so
# paging beyond the first window relies on the endpoint returning a stable
# order -- confirm, or add ORDER BY if the server can afford it.
page_size = 10000000    # rows requested per query; also the OFFSET step
max_offset = 30000000   # exclusive upper bound for OFFSET (at most 3 pages)

pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?pub_location {
    ?title_id schema:publication/schema:location/schema:name ?pub_location .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # OFFSET is already past the last solution; larger offsets cannot
        # return rows either, so spare the endpoint the remaining requests.
        break
    time.sleep(1)  # short pause between requests to go easy on the endpoint

# Concatenate once instead of re-copying the growing frame on every iteration.
pub_location = pd.concat(pages, ignore_index=True)

In [12]:
# Fetch the name of the organizer of each book's publication event
# (schema:publication/schema:organizer/schema:name), returned in the
# publisher_name column, one row per (title_id, publisher_name) pair.
#
# NOTE(review): the query has no ORDER BY. SPARQL only guarantees that
# LIMIT/OFFSET windows are disjoint and complete when the order is fixed, so
# paging beyond the first window relies on the endpoint returning a stable
# order -- confirm, or add ORDER BY if the server can afford it.
page_size = 10000000    # rows requested per query; also the OFFSET step
max_offset = 30000000   # exclusive upper bound for OFFSET (at most 3 pages)

pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?publisher_name where {
    ?title_id schema:publication/schema:organizer/schema:name ?publisher_name .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # OFFSET is already past the last solution; larger offsets cannot
        # return rows either, so spare the endpoint the remaining requests.
        break
    time.sleep(1)  # short pause between requests to go easy on the endpoint

# Concatenate once instead of re-copying the growing frame on every iteration.
pub_name = pd.concat(pages, ignore_index=True)

In [13]:
# Attach publication year, location and publisher name to the title list.
# Left joins keep every row of primary_key; a title that matches several rows
# in a right-hand frame appears once per match.
df_join = primary_key.merge(pub_year, on='title_id', how='left')
df_join2 = df_join.merge(pub_location, on='title_id', how='left')
df_join_total = df_join2.merge(pub_name, on='title_id', how='left')

In [14]:
# Export to CSV; the dataset name and the export date are filled in
# automatically in the file names.
output_dir = folderlink + folder_output
# Create the target folder up front so the export cannot fail on a missing
# directory after all queries have already been run.
os.makedirs(output_dir, exist_ok=True)

export_prefix = output_dir + dataset_kb
csv_options = dict(sep=';', encoding='utf-8', index=False)

df_join_total.to_csv(export_prefix + "_books_" + formatted_date + ".csv", **csv_options)
author.to_csv(export_prefix + "_books_authors_" + formatted_date + ".csv", **csv_options)
co_author.to_csv(export_prefix + "_books_co_authors_" + formatted_date + ".csv", **csv_options)