In [1]:
# Import the various libraries
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import time

In [2]:
# Settings for files
# Base data directory, given relative to the working directory of the notebook.
folderlink = '..//data//'
# Input sub-folder; not referenced by any other cell visible in this notebook.
input_folder = 'input//'
# Output sub-folder; the CSV and Excel exports are written to folderlink + folder_output.
folder_output = 'output//'

In [3]:
# SPARQL endpoint that every query in this notebook is sent to.
sparql_endpoint = "http://data.bibliotheken.nl/sparql"

In [4]:
# Identifier of the dataset to dump. It is appended to
# http://data.bibliotheken.nl/id/dataset/ to form the dataset URI used in every query below.
# It is currently set to "nbt" (Nederlandse Bibliografie Totaal). To generate a dump with the
# same fields for the STCN dataset, change the value to "stcn".
# The field names are the ones requested by the lead applicant of the project.

dataset_kb = "nbt"

In [5]:
def query_sparql_and_convert_to_df(sparql_endpoint, query):
    """Run a SPARQL query and return its result bindings as a DataFrame.

    Parameters
    ----------
    sparql_endpoint : str
        URL of the SPARQL endpoint to query.
    query : str
        SPARQL query text; it is passed to the endpoint unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per result binding and one column per variable. Each cell
        holds the ``"value"`` string of the binding; the other fields of the
        binding (type, datatype, language tag) are discarded. A variable that
        is unbound in a row is left as NaN.

        When the response lists its variables under ``head.vars``, those
        names are used as the columns, so an empty result still has the
        expected columns and can be merged on them. Without ``head.vars`` the
        columns are taken from the keys found in the bindings.

    Raises
    ------
    KeyError
        If the converted response has no ``results.bindings`` entry.
    """
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    # "or None" lets pandas infer the columns when head.vars is absent or empty.
    columns = results.get("head", {}).get("vars") or None

    rows = [
        {variable: cell["value"] for variable, cell in binding.items()}
        for binding in bindings
    ]
    return pd.DataFrame(rows, columns=columns)

In [6]:
# Fetch the ID and title of every schema:Book that belongs to the selected dataset.
# This table is the left-hand side that the attribute tables are joined onto.
# The result is requested in pages via LIMIT/OFFSET.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 30000000  # offsets at or beyond this value are not requested

primary_key_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    SELECT ?title_id ?title WHERE {
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    primary_key_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
primary_key = pd.concat(primary_key_pages, ignore_index=True)
del primary_key_pages  # the pages are now duplicated inside primary_key

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last primary_key page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")


In [11]:
# Fetch (title_id, language) pairs via schema:inLanguage for every schema:Book in the
# selected dataset. Only books that also have a schema:name are matched, like in the
# title query. The result is requested in pages via LIMIT/OFFSET.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 30000000  # offsets at or beyond this value are not requested

language_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?language where {
    ?title_id schema:inLanguage ?language .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    language_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
language = pd.concat(language_pages, ignore_index=True)
del language_pages  # the pages are now duplicated inside language

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last language page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")

In [None]:
# Fetch (title_id, pub_year) pairs via schema:publication/schema:startDate for every
# schema:Book in the selected dataset. Only books that also have a schema:name are
# matched, like in the title query. The result is requested in pages via LIMIT/OFFSET.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 30000000  # offsets at or beyond this value are not requested

pub_year_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?pub_year where {
    ?title_id schema:publication/schema:startDate ?pub_year .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pub_year_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
pub_year = pd.concat(pub_year_pages, ignore_index=True)
del pub_year_pages  # the pages are now duplicated inside pub_year

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last pub_year page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")

In [None]:
# Fetch (title_id, pub_location) pairs via schema:publication/schema:location/schema:name
# for every schema:Book in the selected dataset. Only books that also have a schema:name
# are matched, like in the title query. The result is requested in pages via LIMIT/OFFSET.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 30000000  # offsets at or beyond this value are not requested

pub_location_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?pub_location {
    ?title_id schema:publication/schema:location/schema:name ?pub_location .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pub_location_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
pub_location = pd.concat(pub_location_pages, ignore_index=True)
del pub_location_pages  # the pages are now duplicated inside pub_location

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last pub_location page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")

In [None]:
# Fetch (title_id, author) pairs via schema:author/schema:name for every schema:Book in
# the selected dataset. Only books that also have a schema:name are matched, like in the
# title query. The result is requested in pages via LIMIT/OFFSET.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 30000000  # offsets at or beyond this value are not requested

author_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?author where {
    ?title_id schema:author/schema:name ?author .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    author_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
author = pd.concat(author_pages, ignore_index=True)
del author_pages  # the pages are now duplicated inside author

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last author page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")

In [None]:
# Fetch (title_id, co_author) pairs via schema:contributor/schema:name for every
# schema:Book in the selected dataset. Only books that also have a schema:name are
# matched, like in the title query. The result is requested in pages via LIMIT/OFFSET;
# this table is paged up to a higher offset than the other attribute tables.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 40000000  # offsets at or beyond this value are not requested

co_author_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?co_author where {
    ?title_id schema:contributor/schema:name ?co_author.
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    co_author_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
co_author = pd.concat(co_author_pages, ignore_index=True)
del co_author_pages  # the pages are now duplicated inside co_author

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last co_author page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")

In [None]:
# Fetch (title_id, publisher_name) pairs via schema:publication/schema:organizer/schema:name
# for every schema:Book in the selected dataset. Only books that also have a schema:name
# are matched, like in the title query. The result is requested in pages via LIMIT/OFFSET.
# NOTE(review): the query has no ORDER BY, so SPARQL does not guarantee that consecutive
# OFFSET pages are disjoint and complete -- confirm that the endpoint returns a stable
# order, or add "ORDER BY ?title_id".
page_size = 10000000   # rows per request; used for both LIMIT and the OFFSET step
max_offset = 30000000  # offsets at or beyond this value are not requested

pub_name_pages = []
for offset in range(0, max_offset, page_size):
    query = """
    select ?title_id ?publisher_name where {
    ?title_id schema:publication/schema:organizer/schema:name ?publisher_name .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/"""+dataset_kb+"""> .
    }
    LIMIT """+str(page_size)+""" OFFSET """+str(offset)+"""
    """
    df = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pub_name_pages.append(df)
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of re-copying the accumulated rows for every page.
pub_name = pd.concat(pub_name_pages, ignore_index=True)
del pub_name_pages  # the pages are now duplicated inside pub_name

# A full last page means more rows may exist beyond max_offset.
if len(df) >= page_size:
    print("WARNING: the last pub_name page was full; rows beyond offset "
          + str(max_offset) + " may exist and were not fetched")

In [None]:
# Left-join every attribute table onto the title list by title_id, so a title is kept
# even when an attribute has no value for it (the attribute is then NaN).
# A title with several values for an attribute gets one output row per combination of
# values, because each merge repeats the rows that share a title_id.
# The merges are chained so that no intermediate merged frame stays referenced in the
# kernel once the next merge has consumed it.
df_join_total = (
    primary_key
    .merge(language, on='title_id', how='left')
    .merge(pub_year, on='title_id', how='left')
    .merge(pub_location, on='title_id', how='left')
    .merge(author, on='title_id', how='left')
    .merge(co_author, on='title_id', how='left')
    .merge(pub_name, on='title_id', how='left')
)

In [None]:
# Write the joined table as a semicolon-separated UTF-8 CSV file, without the index column.
df_join_total.to_csv(
    f"{folderlink}{folder_output}{dataset_kb}_Dump.csv",
    index=False,
    encoding='utf-8',
    sep=';',
)

In [None]:
# Write the joined table to an Excel workbook in the output folder.
excel_export = folderlink+folder_output+dataset_kb+"excel.xlsx"

# A worksheet holds at most 1,048,576 rows, one of which is taken by the header, and
# to_excel raises ValueError for a frame that exceeds the sheet limit. Larger tables are
# therefore split over consecutive sheets (Sheet1, Sheet2, ...). A table that fits is
# written to the single sheet "Sheet1", which is also the pandas default sheet name.
max_rows_per_sheet = 1048576 - 1

with pd.ExcelWriter(excel_export, engine='openpyxl') as writer:
    # max(..., 1) keeps one iteration for an empty table, so a header-only sheet is written.
    sheet_starts = range(0, max(len(df_join_total), 1), max_rows_per_sheet)
    for sheet_number, first_row in enumerate(sheet_starts, start=1):
        sheet_rows = df_join_total.iloc[first_row:first_row+max_rows_per_sheet]
        sheet_rows.to_excel(writer, sheet_name="Sheet"+str(sheet_number), index=False)
