In [None]:
import pandas as pd
import os
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
# Relative locations of the project's data directories. The trailing
# separators belong to the values because later cells build file paths by
# plain string concatenation.
folderlink = '..//data//'
input_folder = 'input//'
folder_output = 'output//'

# NOTE(review): the input file is resolved inside the *output* folder and
# `input_folder` is not used in this cell -- confirm that this is intended.
ddrc_file = 'parent_data.csv'
input_file = os.path.join(f"{folderlink}{folder_output}", ddrc_file)


In [None]:
# Address of the SPARQL service that the query cells below are sent to.
sparql_endpoint = "http://data.bibliotheken.nl/sparql"


In [None]:
# Query 1: list every title that is available in the STCN.
# ?title_id is the owl:sameAs identifier reached through the record's
# schema:mainEntityOfPage, ?title is the record's schema:name. Only records
# whose page is part of <http://data.bibliotheken.nl/id/dataset/stcn> match.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q1_nbt.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

There are 216051 book items in STCN, of which only 176553 have a distinct title. This means that some book items share the same title or are duplicates.

In [None]:
# Query 2: every STCN title plus its fingerprint.
# The fingerprint pattern is wrapped in OPTIONAL, so titles without a
# fingerprint are still returned (with ?fingerprint left unbound), and a title
# with several fingerprints yields one row per fingerprint.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?fingerprint where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p <http://data.bibliotheken.nl/def#stcnFingerprint> ?fingerprint .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q2.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

There are 216051 unique titles in STCN and 201960 fingerprint records. Not every title has a fingerprint, and some titles have multiple fingerprints.

In [None]:
# Query 3: every STCN title plus its fingerprint and its main author.
# The fingerprint and the author each get their own OPTIONAL group. Inside a
# single shared group both patterns would have to match together, so a title
# with an author but no fingerprint (or the reverse) would report neither.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?fingerprint ?author_id where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p <http://data.bibliotheken.nl/def#stcnFingerprint> ?fingerprint .
    }
  OPTIONAL {
    ?p schema:author ?author_id .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q3.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

text

In [None]:
# Query 4: every STCN title plus the id and name of its main author.
# The name is read from ?author_id itself, so each row pairs an author id with
# that same author's name. The OPTIONAL group previously also carried a second
# copy of the ?author_id pattern and a name path restarting from ?p; both were
# redundant next to the correlated pair and have been removed.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?author_id ?author where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# The CSV export is switched off for this query; the result only lives in `df`.
#output_file = folderlink+folder_output+"stcn_q4.csv"
#df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

In [None]:
# Preview the first rows of the most recently built result frame.
df.head()

In [None]:
# Query 6: every STCN title plus its main author's id, name and given name.
# The name and given-name patterns start from ?author_id instead of restarting
# from ?p. When each path restarts from ?p it can pick a different author, so
# a title with several authors produces rows that mix one author's id with
# another author's name or given name.
# All three author patterns share one OPTIONAL group: an author lacking a name
# or a given name leaves every author variable unbound for that title.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?author_id ?author ?author_givenName where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    ?author_id schema:author/schema:givenName ?author_givenName .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q6.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

In [None]:
# Query 7: every STCN title plus its main author's id, name, death date,
# birth date, family name and given name.
# Every author pattern starts from ?author_id instead of restarting from ?p.
# When each path restarts from ?p it can pick a different author, so a title
# with several authors produces rows that combine attributes of different
# authors.
# All author patterns share one OPTIONAL group: an author lacking any one of
# the attributes leaves every author variable unbound for that title.
# NOTE(review): the earlier description of this query mentioned alternative
# names, but no such variable is selected -- confirm whether that is planned.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?author_id ?author ?author_deathDate ?author_birthDate ?author_familyName ?author_givenName where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    ?author_id schema:author/schema:deathDate ?author_deathDate .
    ?author_id schema:author/schema:birthDate ?author_birthDate .
    ?author_id schema:author/schema:familyName ?author_familyName .
    ?author_id schema:author/schema:givenName ?author_givenName .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q7.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

In [None]:
# Query 8: every STCN title plus its main author's id, name, death date,
# birth date, family name and given name.
# Every author pattern starts from ?author_id instead of restarting from ?p.
# When each path restarts from ?p it can pick a different author, so a title
# with several authors produces rows that combine attributes of different
# authors.
# All author patterns share one OPTIONAL group: an author lacking any one of
# the attributes leaves every author variable unbound for that title.
# NOTE(review): the earlier description of this query mentioned alternative
# names, but no such variable is selected -- confirm whether that is planned.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?author_id ?author ?author_deathDate ?author_birthDate ?author_familyName ?author_givenName where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    ?author_id schema:author/schema:deathDate ?author_deathDate .
    ?author_id schema:author/schema:birthDate ?author_birthDate .
    ?author_id schema:author/schema:familyName ?author_familyName .
    ?author_id schema:author/schema:givenName ?author_givenName .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q8.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

In [None]:
# Query 5: every STCN title plus the id and name of each contributor.
# The name is read from ?contributor_id itself. When the name path restarts
# from ?p it can pick a different contributor, so a title with several
# contributors produces every id/name combination, including mismatched ones.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?contributor_id ?contributor where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:contributor ?contributor_id .
    ?contributor_id schema:name ?contributor .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q5.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

In [None]:
# Query 9: every STCN title plus each contributor's id, name, death date,
# birth date, family name and given name.
# Every contributor pattern starts from ?contributor_id instead of restarting
# from ?p. When each path restarts from ?p it can pick a different
# contributor, so a title with several contributors produces the cross product
# of all attribute values across contributors.
# All contributor patterns share one OPTIONAL group: a contributor lacking any
# one of the attributes is left out of the result.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?contributor_id ?contributor ?contributor_deathDate ?contributor_birthDate ?contributor_familyName ?contributor_givenName where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:contributor ?contributor_id .
    ?contributor_id schema:name ?contributor .
    ?contributor_id schema:contributor/schema:deathDate ?contributor_deathDate .
    ?contributor_id schema:contributor/schema:birthDate ?contributor_birthDate .
    ?contributor_id schema:contributor/schema:familyName ?contributor_familyName .
    ?contributor_id schema:contributor/schema:givenName ?contributor_givenName .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q9.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()

In [None]:
# Query 10: every STCN title plus the date it was published and its main
# author's id, name, death date, birth date, family name and given name.
# The publication date has its own OPTIONAL group; sharing one group with the
# author patterns would drop the date for every title without a complete
# author record.
# Every author pattern starts from ?author_id instead of restarting from ?p.
# When each path restarts from ?p it can pick a different author, so a title
# with several authors produces rows that combine attributes of different
# authors.
# The author patterns still share one OPTIONAL group: an author lacking any
# one of the attributes leaves every author variable unbound for that title.
# NOTE(review): the schema: and owl: prefixes are not declared in the query,
# so it presumably relies on the endpoint predefining them -- confirm.
sparql_query = """
select ?title_id ?title ?pub_start_date ?author_id ?author ?author_deathDate ?author_birthDate ?author_familyName ?author_givenName where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:publication/schema:startDate ?pub_start_date .
    }
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    ?author_id schema:author/schema:deathDate ?author_deathDate .
    ?author_id schema:author/schema:birthDate ?author_birthDate .
    ?author_id schema:author/schema:familyName ?author_familyName .
    ?author_id schema:author/schema:givenName ?author_givenName .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Send the query held in `sparql_query` to the endpoint and ask for JSON back.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Reduce every binding to a flat {variable: value} record and load all records
# into one frame; a variable missing from a binding ends up as NaN.
bindings = results["results"]["bindings"]
data = [{key: cell["value"] for key, cell in item.items()} for item in bindings]
df = pd.DataFrame(data)

# Lift pandas' display limits so that nothing is truncated when shown.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Store the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = f"{folderlink}{folder_output}stcn_q10.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

# Summary statistics are the cell's displayed output.
df.describe()