In [None]:
import pandas as pd
import os
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
# Folder layout, expressed relative to the notebook's working directory.
folderlink, input_folder, folder_output = '..//data//', 'input//', 'output//'

# NOTE(review): despite its name, `input_file` is resolved inside the output
# folder (`folder_output`), not `input_folder` - confirm this is intended.
ddrc_file = 'parent_data.csv'
input_file = os.path.join(f"{folderlink}{folder_output}", ddrc_file)


In [None]:
# URL of the SPARQL endpoint that the SPARQLWrapper calls in this notebook connect to.
sparql_endpoint = "http://data.bibliotheken.nl/sparql"


In [None]:
# Query 1: all titles that are available in STCN.
# Returns ?title_id (the owl:sameAs of the resource's mainEntityOfPage) and
# ?title (schema:name) for every resource whose mainEntityOfPage is part of
# the STCN dataset.
# NOTE(review): `schema:` and `owl:` are used without PREFIX declarations -
# presumably the endpoint predefines these namespaces; verify.
sparql_query = """
select ?title_id ?title where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Export the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = folderlink + folder_output + "stcn_q1.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()

title_id	title
count	216051	216051
unique	216051	176553
top	http://data.bibliotheken.nl/doc/nbt/p82210427X	Theses juridicae inaugurales.
freq	1	497

There are 216051 book items in STCN (each with a unique title ID), but only 176553 distinct titles. This means that some book items share the same title or are duplicates.

In [None]:
# Query 2: all titles that are available in STCN, plus the fingerprint.
# The fingerprint pattern is wrapped in OPTIONAL, so titles without a
# fingerprint are still returned (with ?fingerprint unbound); a title with
# several fingerprints yields one row per fingerprint.
sparql_query = """
select ?title_id ?title ?fingerprint where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p <http://data.bibliotheken.nl/def#stcnFingerprint> ?fingerprint .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Export the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = folderlink + folder_output + "stcn_q2.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()


title_id	title	fingerprint
count	222619	222619	214773
unique	216051	176553	201960
top	http://data.bibliotheken.nl/doc/nbt/p375930256	Theses juridicae inaugurales.	000004 - b1=b2 A e
freq	13	499	566

There are 216051 unique title IDs in STCN and 201960 unique fingerprints. Not every title has a fingerprint, and some titles have multiple fingerprints.

In [None]:
# Query 3: all titles that are available in STCN, plus the fingerprint and the
# main author.
# Fingerprint and author each get their own OPTIONAL group so that they are
# returned independently of one another. When both patterns share a single
# OPTIONAL group, the group only matches if BOTH exist, so a fingerprint is
# dropped for every title that has no author (and vice versa).
sparql_query = """
select ?title_id ?title ?fingerprint ?author_id where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p <http://data.bibliotheken.nl/def#stcnFingerprint> ?fingerprint .
    }
  OPTIONAL {
    ?p schema:author ?author_id .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Export the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = folderlink + folder_output + "stcn_q3.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()

title_id	title	fingerprint	author_id
count	220080	220080	115660	115660
unique	216051	176553	112631	111631
top	http://data.bibliotheken.nl/doc/nbt/p235143030	Theses juridicae inaugurales.	000004	nodeID://b266146401
freq	9	499	445	9

In [None]:
# Query 4: all titles that are available in STCN, plus the main author.
# The author's name is read from the very node bound to ?author_id. Matching
# it through a second, independent `schema:author/schema:name` path would pair
# every author id with every author name when a title has several authors.
sparql_query = """
select ?title_id ?title ?author_id ?author where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Export the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = folderlink + folder_output + "stcn_q4.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()

title_id	title	author_id	author
count	216051	216051	117951	117951
unique	216051	176553	117951	51294
top	http://data.bibliotheken.nl/doc/nbt/p82220682X	Theses juridicae inaugurales.	nodeID://b265835929	De Voltaire
freq	1	497	1	194

In [None]:
# Query 5: all titles that are available in STCN, plus the contributing authors.
# The contributor's name is read from the very node bound to ?contributor_id.
# Matching it through a second, independent `schema:contributor/schema:name`
# path pairs every contributor id with every contributor name of the same
# title (n contributors -> n*n rows), which inflates the result.
sparql_query = """
select ?title_id ?title ?contributor_id ?contributor where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:contributor ?contributor_id .
    ?contributor_id schema:name ?contributor .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Export the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = folderlink + folder_output + "stcn_q5.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()

title_id	title	contributor_id	contributor
count	251541	251541	70165	70165
unique	216051	176553	42161	13316
top	http://data.bibliotheken.nl/doc/nbt/p284863009	Verhandelingen uitgegeven door het Zeeuwsch Genootschap der Wetenschappen te Vlissingen.	nodeID://b266206822	Luyken
freq	400	1760	20	792

In [None]:
# Query 6: all titles that are available in STCN, plus the main author with
# its name, family name and given name.
# All author details are read from the very node bound to ?author_id. Matching
# each of them through its own independent `schema:author/...` path would mix
# details of different authors when a title has more than one.
# The details share one OPTIONAL group, so an author is only returned when
# name, family name and given name are all present.
sparql_query = """
select ?title_id ?title ?author_id ?author ?author_familyName ?author_givenName where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    ?author_id schema:familyName ?author_familyName .
    ?author_id schema:givenName ?author_givenName .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# Export the result as a semicolon-separated, UTF-8 encoded CSV without the index.
output_file = folderlink + folder_output + "stcn_q6.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()

In [None]:
# Query 7: all titles that are available in STCN, plus the main author with
# its name, birth/death dates, family name, given name and alternative names.
# ?alt_authorname is part of the projection so the alternative names actually
# reach the result; binding it in the pattern alone only multiplies rows.
# All author details are read from the very node bound to ?author_id, so
# details of different authors of the same title cannot be mixed.
# The details share one OPTIONAL group, so an author is only returned when
# every listed property is present.
sparql_query = """
select ?title_id ?title ?author_id ?author ?author_deathDate ?author_birthDate ?author_familyName ?author_givenName ?alt_authorname where {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:author ?author_id .
    ?author_id schema:name ?author .
    ?author_id schema:deathDate ?author_deathDate .
    ?author_id schema:birthDate ?author_birthDate .
    ?author_id schema:familyName ?author_familyName .
    ?author_id schema:givenName ?author_givenName .
    ?author_id schema:alternateName/schema:name ?alt_authorname .
    }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Titles in STCN with publication start date, author and contributor.
# No pattern is OPTIONAL here, so only publications that have a start date,
# an author AND a contributor are returned.
# Author/contributor ids are taken one level deeper than in the numbered
# queries (schema:author/schema:author, schema:contributor/schema:contributor).
# NOTE(review): id and name are matched through separate property paths, so
# they are not guaranteed to come from the same author/contributor node - verify.
# NOTE(review): when the cells run top to bottom, this assignment is replaced
# by the following query cells before any cell executes it.
sparql_query = """
select ?title_id ?title ?author_id ?author ?contributor ?contributor_id ?startdate where {
  ?publicatie schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?publicatie schema:name ?title .
  ?publicatie schema:publication/schema:startDate ?startdate .
  ?publicatie schema:author/schema:author ?author_id .
  ?publicatie schema:author/schema:name ?author .
  ?publicatie schema:contributor/schema:contributor ?contributor_id .
  ?publicatie schema:contributor/schema:name ?contributor .
  ?publicatie schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [None]:
# Titles in STCN with (optional) contributor, publication start date and
# author, with duplicate result rows removed.
# DISTINCT is a modifier of the whole SELECT clause; it cannot be applied to a
# single projected variable as "(DISTINCT ?contributor)", which is not valid
# SPARQL and makes the endpoint reject the query.
# Contributor, start date and author share one OPTIONAL group, so they are
# only bound when all of them are present.
sparql_query = """
SELECT DISTINCT ?title_id ?title ?author_id ?author ?contributor ?contributor_id ?startdate WHERE {
  ?publicatie schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?publicatie schema:name ?title .
  OPTIONAL {
    ?publicatie schema:contributor/schema:contributor ?contributor_id .
    ?publicatie schema:contributor/schema:name ?contributor .
    ?publicatie schema:publication/schema:startDate ?startdate .
    ?publicatie schema:author/schema:author ?author_id .
    ?publicatie schema:author/schema:name ?author .
  }
  ?publicatie schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""



In [None]:
# Titles in STCN with (optional) contributor, publication start date and author.
# Contributor, start date and author share one OPTIONAL group, so they are
# only bound when all of them are present; titles lacking any of them are
# still returned, with those variables unbound.
# NOTE(review): id and name are matched through separate property paths, so
# they are not guaranteed to come from the same author/contributor node - verify.
# NOTE(review): when the cells run top to bottom, this assignment is replaced
# by the next query cell before any cell executes it.
sparql_query = """
SELECT ?title_id ?title ?author_id ?author ?contributor ?contributor_id ?startdate WHERE {
  ?p schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?p schema:name ?title .
  OPTIONAL {
    ?p schema:contributor/schema:contributor ?contributor_id .
    ?p schema:contributor/schema:name ?contributor .
    ?p schema:publication/schema:startDate ?startdate .
    ?p schema:author/schema:author ?author_id .
    ?p schema:author/schema:name ?author .
  }
  ?p schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""

In [None]:
# Titles in STCN that have a publication start date and an author, with an
# optional contributor. Rows are grouped by every projected variable except
# the contributor name, of which a single SAMPLE per group is returned as
# ?distinct_contributor.
# LIMIT 50 keeps this to a small preview of the result.
# NOTE(review): id and name are matched through separate property paths, so
# they are not guaranteed to come from the same author/contributor node - verify.
sparql_query = """
SELECT ?title_id ?title ?author_id ?author (SAMPLE(?contributor) as ?distinct_contributor) ?contributor_id ?startdate WHERE {
  ?publicatie schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?publicatie schema:name ?title .
  ?publicatie schema:publication/schema:startDate ?startdate .
  ?publicatie schema:author/schema:author ?author_id .
  ?publicatie schema:author/schema:name ?author .
  OPTIONAL {
    ?publicatie schema:contributor/schema:contributor ?contributor_id .
    ?publicatie schema:contributor/schema:name ?contributor .
  }
  ?publicatie schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
GROUP BY ?title_id ?title ?author_id ?author ?contributor_id ?startdate
LIMIT 50
"""

In [None]:
# Send the query currently held in `sparql_query` to the endpoint, asking for JSON.
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten each result binding into a plain {variable: value} record; only the
# variables present in a binding are copied.
bindings = results["results"]["bindings"]
data = [{key: item[key]["value"] for key in item} for item in bindings]
df = pd.DataFrame(data)

# Lift the pandas display limits (columns, rows, column width).
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

# This result gets a file of its own. Writing it to "stcn_q6.csv" would
# silently replace the export of query 6 with the rows of a different query.
# The export happens once, here; a second cell repeating the same write was
# redundant and has been folded into this one.
# NOTE(review): the file name assumes the SAMPLE/contributor preview query is
# the one held in `sparql_query` at this point - rename if another query is run.
output_file = folderlink + folder_output + "stcn_q_contributor_sample.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

df.describe()

In [None]:
# Summary statistics of the result frame currently held in `df`.
df.describe()

In [None]:
# First rows of the result frame currently held in `df`, as a quick visual check.
df.head()