In [129]:
import pandas as pd
import os
from SPARQLWrapper import SPARQLWrapper, JSON

In [130]:
folderlink = '..//data//'
input_folder = 'input//'
folder_output = 'output//'
ddrc_file = 'parent_data.csv'
input_file = os.path.join(folderlink+folder_output, ddrc_file)


In [131]:
sparql_endpoint = "http://data.bibliotheken.nl/sparql"


In [146]:
sparql_query = """
select ?title_id ?title ?author_id ?author ?contributor ?contributor_id ?startdate where {
  ?publicatie schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?publicatie schema:name ?title .
  ?publicatie schema:publication/schema:startDate ?startdate .
  ?publicatie schema:author/schema:author ?author_id .
  ?publicatie schema:author/schema:name ?author .
  ?publicatie schema:contributor/schema:contributor ?contributor_id .
  ?publicatie schema:contributor/schema:name ?contributor .
  ?publicatie schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
} 
"""

In [167]:
sparql_query = """
SELECT ?title_id ?title ?author_id ?author (DISTINCT ?contributor) ?contributor_id ?startdate WHERE {
  ?publicatie schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?publicatie schema:name ?title .
  OPTIONAL {
    ?publicatie schema:contributor/schema:contributor ?contributor_id .
    ?publicatie schema:contributor/schema:name ?contributor .
    ?publicatie schema:publication/schema:startDate ?startdate .
    ?publicatie schema:author/schema:author ?author_id .
    ?publicatie schema:author/schema:name ?author .
  }
  ?publicatie schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
"""



In [169]:
sparql_query = """
SELECT ?title_id ?title ?author_id ?author (SAMPLE(?contributor) as ?distinct_contributor) ?contributor_id ?startdate WHERE {
  ?publicatie schema:mainEntityOfPage/owl:sameAs ?title_id .
  ?publicatie schema:name ?title .
  ?publicatie schema:publication/schema:startDate ?startdate .
  ?publicatie schema:author/schema:author ?author_id .
  ?publicatie schema:author/schema:name ?author .
  OPTIONAL {
    ?publicatie schema:contributor/schema:contributor ?contributor_id .
    ?publicatie schema:contributor/schema:name ?contributor .
  }
  ?publicatie schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn> .
}
GROUP BY ?title_id ?title ?author_id ?author ?contributor_id ?startdate
"""

In [170]:
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

bindings = results["results"]["bindings"]
data = []

for item in bindings:
    row = {}
    for key in item:
        row[key] = item[key]["value"]
    data.append(row)

df = pd.DataFrame(data)

# Panda settings for showing data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#df.head(500)

In [171]:
df.describe()

Unnamed: 0,title_id,title,author_id,author,distinct_contributor,contributor_id,startdate
count,121013,121013,121013,121013,28729,28729,121013
unique,117939,92223,37898,51287,8514,6878,380
top,http://data.bibliotheken.nl/doc/nbt/p303809205,Theses juridicae inaugurales.,http://data.bibliotheken.nl/id/thes/p068423667,De Voltaire,Johannes à Marck,http://data.bibliotheken.nl/id/thes/p06914852X,17XX
freq,9,496,444,195,366,374,2429


In [172]:
output_file = folderlink+folder_output+"stcn.csv"
df.to_csv(output_file, sep=';', encoding='utf-8', index=False)

In [None]:
df_duplicates = df[df.duplicated('title_id', keep=False)]

In [None]:
df_duplicates.head(500)

In [None]:
df.sort_values('title_id', ascending=False)