In [1]:
# Import the various libraries
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import time

In [2]:
from datetime import datetime

# Moment this cell was executed; its DDMMYYYY rendering is appended to the export file names.
current_date = datetime.now()
formatted_date = f"{current_date:%d%m%Y}"

In [3]:
# File locations: `folderlink` is the data root, the other two are sub-folders
# that get appended to it when a path is built.
folderlink = "..//data//"
input_folder = "input//"
folder_output = "output//"

In [4]:
# URL of the SPARQL endpoint that every query in this notebook is sent to
sparql_endpoint = 'http://data.bibliotheken.nl/sparql'

In [5]:
# Dataset identifier: appended to http://data.bibliotheken.nl/id/dataset/ in the queries
# and used as a prefix of the export file names.
dataset_kb = 'nbt'

In [6]:
def query_sparql_and_convert_to_df(sparql_endpoint, query):
    """Run a SPARQL query against an endpoint and return the result rows as a DataFrame.

    Parameters
    ----------
    sparql_endpoint : str
        URL of the SPARQL endpoint.
    query : str
        SPARQL query text. The response is requested in JSON format.

    Returns
    -------
    pandas.DataFrame
        One row per result binding and one column per variable bound in the
        rows; every cell holds the ``"value"`` entry of the binding. When the
        query returns no rows the frame is empty but still has the variables
        listed in the response head as columns, so it can be merged on them.
    """
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    bindings = results["results"]["bindings"]

    if not bindings:
        # pandas cannot infer columns from zero rows; without this, a later
        # merge on 'title_id' would fail with a KeyError for empty results.
        return pd.DataFrame(columns=results.get("head", {}).get("vars", []))

    # Keep only the plain value of each binding (drops type/datatype/language info).
    rows = [{name: cell["value"] for name, cell in item.items()} for item in bindings]
    return pd.DataFrame(rows)

In [7]:
# Fetch the identifier and title of every resource in the dataset that has a schema:name.
# The result is requested in pages because of its size.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    SELECT ?title_id ?title WHERE {
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
primary_key = pd.concat(pages, ignore_index=True)


In [8]:
# Preview the first five rows of the title table
primary_key.head(n=5)

Unnamed: 0,title_id,title
0,http://data.bibliotheken.nl/id/nbt/p750005645,Nonarchimedean fields and asymptotic expansions
1,http://data.bibliotheken.nl/id/nbt/p750009543,Urethral obstruction in boys : diagnosis and t...
2,http://data.bibliotheken.nl/id/nbt/p750009969,Health and industrial growth
3,http://data.bibliotheken.nl/id/nbt/p750012978,Good news in Acts : the ''Acts of the Apostles...
4,http://data.bibliotheken.nl/id/nbt/p750023635,Basisvorming voor alle leerlingen? : de werksi...


In [9]:
# Flag every resource of type schema:Book in the dataset with the constant value "book".
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    SELECT ?title_id ( "book" AS ?book ) WHERE {
    ?title_id a schema:Book .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
book = pd.concat(pages, ignore_index=True)

In [12]:
# Flag every resource of type schema:BookSeries in the dataset with the constant value "bookseries".
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    SELECT ?title_id ( "bookseries" AS ?bookseries ) WHERE {
    ?title_id a schema:BookSeries .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
book_series = pd.concat(pages, ignore_index=True)

In [14]:
# Flag every resource of type schema:Periodical in the dataset with the constant value "periodical".
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    SELECT ?title_id ( "periodical" AS ?periodical ) WHERE {
    ?title_id a schema:Periodical .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
periodical = pd.concat(pages, ignore_index=True)

Unnamed: 0,title_id,bookseries
count,72655,72655
unique,72655,1
top,http://data.bibliotheken.nl/id/nbt/p791367398,bookseries
freq,1,72655


In [None]:
# Flag every resource of type schema:Article in the dataset with the constant value "article".
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    SELECT ?title_id ( "article" AS ?article ) WHERE {
    ?title_id a schema:Article .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
article = pd.concat(pages, ignore_index=True)

In [None]:
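# Resource types considered for this export (presumably the types present in the
# dataset - TODO confirm). Types marked [queried] have their own extraction cell
# above; the other types are not extracted.
# Kept entirely as comments so that this cell does not fail when the notebook is run.
#http://schema.org/CreativeWork
#http://schema.org/Book  [queried]
#http://schema.org/ProductModel
#http://schema.org/BookSeries  [queried]
#http://schema.org/Periodical  [queried]
#http://schema.org/Map
#http://schema.org/SoftwareApplication
#http://schema.org/MusicComposition
#http://schema.org/VisualArtwork
#http://purl.org/ontology/bibo/Manuscript
#http://schema.org/Article  [queried]
#http://purl.org/ontology/bibo:Manuscript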



In [10]:
# Fetch the schema:inLanguage values of every schema:Book in the dataset that has a schema:name.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    select ?title_id ?language where {
    ?title_id schema:inLanguage ?language .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
language = pd.concat(pages, ignore_index=True)

In [11]:
# Fetch the publication start date (schema:publication/schema:startDate) of every
# schema:Book in the dataset that has a schema:name.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    select ?title_id ?pub_year where {
    ?title_id schema:publication/schema:startDate ?pub_year .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
pub_year = pd.concat(pages, ignore_index=True)

In [12]:
# Fetch the publication location name (schema:publication/schema:location/schema:name)
# of every schema:Book in the dataset that has a schema:name.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    select ?title_id ?pub_location {
    ?title_id schema:publication/schema:location/schema:name ?pub_location .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
pub_location = pd.concat(pages, ignore_index=True)

In [13]:
# Fetch the author names (schema:author/schema:name) of every schema:Book in the
# dataset that has a schema:name.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 40000000  # paging stops before this OFFSET is reached (one page more than most cells)

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    select ?title_id ?author where {
    ?title_id schema:author/schema:name ?author .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
author = pd.concat(pages, ignore_index=True)

In [14]:
# Fetch the contributor names (schema:contributor/schema:name) of every schema:Book
# in the dataset that has a schema:name.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 40000000  # paging stops before this OFFSET is reached (one page more than most cells)

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    select ?title_id ?co_author where {
    ?title_id schema:contributor/schema:name ?co_author.
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
co_author = pd.concat(pages, ignore_index=True)

In [15]:
# Fetch the publisher name (schema:publication/schema:organizer/schema:name) of every
# schema:Book in the dataset that has a schema:name.
page_size = 10000000   # rows requested per query (LIMIT)
max_offset = 30000000  # paging stops before this OFFSET is reached

# Loop-invariant part of the query; only the OFFSET value differs between pages.
query_prefix = """
    select ?title_id ?publisher_name where {
    ?title_id schema:publication/schema:organizer/schema:name ?publisher_name .
    ?title_id a schema:Book .
    ?title_id schema:name ?title .
    ?title_id schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/""" + dataset_kb + """> .
    }
    LIMIT """ + str(page_size) + """ OFFSET """

pages = []
for offset in range(0, max_offset, page_size):
    query = query_prefix + str(offset) + "\n    "
    page = query_sparql_and_convert_to_df(sparql_endpoint, query)
    pages.append(page)
    if page.empty:
        # The OFFSET is past the last result, so every later page would be empty as well.
        break
    time.sleep(1)  # short pause between requests to the endpoint

# Concatenate once instead of growing the frame inside the loop.
pub_name = pd.concat(pages, ignore_index=True)

In [16]:
# Left-join every attribute table onto the title table, keyed on title_id.
# Titles without a match keep NaN in the added column; a title_id that occurs
# several times in an attribute table yields one output row per match.
#
# The tables are merged in one chain so that no column is lost: previously the
# book_series merge restarted from the frame *before* `book` was joined, which
# silently dropped the `book` column from the final result.
attribute_tables = (
    language,
    pub_year,
    pub_location,
    author,
    co_author,
    pub_name,
    book,
    book_series,
    article,
    periodical,
)

df_join_total = primary_key
for attribute_table in attribute_tables:
    # Rebinding the same name lets each intermediate frame be freed once it is replaced.
    df_join_total = pd.merge(df_join_total, attribute_table, on='title_id', how='left')


In [17]:
# Write the complete joined table to <data root>/<output folder>/<dataset>_Dump<DDMMYYYY>.csv,
# semicolon-separated, UTF-8 encoded and without the index column.
csv_export = folderlink + folder_output + dataset_kb + "_Dump" + formatted_date + ".csv"
df_join_total.to_csv(csv_export, sep=';', encoding='utf-8', index=False)

In [19]:
# An Excel worksheet holds at most 1,048,576 rows (header row included), so the
# joined table is written as several workbooks of at most `excel_chunk_rows` data
# rows each. Previously everything after the first 1,000,000 rows went into a
# single second workbook, which fails as soon as that remainder exceeds the limit.
excel_chunk_rows = 1000000

# Parts 1 and 2 are always written (part 2 may contain only the header), matching
# the files this cell produced before; further parts are added only when needed.
total_rows = len(df_join_total)
part_count = max(2, -(-total_rows // excel_chunk_rows))  # ceiling division

# NOTE(review): with dataset_kb = "nbt" the prefix below becomes "nbtnbt_export_part..."
# because dataset_kb is followed by a literal "nbt". Left unchanged so the file
# names stay stable - confirm whether the duplication is intended.
for part_number in range(1, part_count + 1):
    first_row = (part_number - 1) * excel_chunk_rows
    df_part = df_join_total.iloc[first_row:first_row + excel_chunk_rows]
    excel_export = folderlink+folder_output+dataset_kb+"nbt_export_part"+str(part_number)+"_"+formatted_date+".xlsx"
    df_part.to_excel(excel_export, index=False, engine='openpyxl')
