# dbnlist

Trying to pull authors from the DBNL linked data resources.

Anthe - 2024-11-29

## SPARQL data retrieval

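We pull all author data from the DBNL linked data resources so that we can process it further. The SPARQL query below selects every resource in the `dbnla` dataset together with its optional schema.org properties (names, birth and death details, gender, relations, and so on); alternate names are concatenated with a `#SEP#` separator. I wrote this query several months ago and no longer remember the details of how it was put together, but I know it works!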

In [26]:
!pip install SPARQLWrapper
!pip install pandas



In [27]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [28]:
def run_query(query, endpoint="http://data.bibliotheken.nl/sparql"):
    """Run a SPARQL query and return its result bindings.

    Args:
        query: The SPARQL query text to execute.
        endpoint: URL of the SPARQL endpoint to query. Defaults to the
            data.bibliotheken.nl endpoint this notebook was written against.

    Returns:
        The ``["results"]["bindings"]`` part of the endpoint's JSON response,
        i.e. one entry per result row.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query)
    # Ask for JSON so that convert() yields plain Python dicts and lists.
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()

    return response["results"]["bindings"]

In [29]:
# SPARQL query that selects every resource ?s whose main page is part of the
# <http://data.bibliotheken.nl/id/dataset/dbnla> dataset, together with a set
# of schema.org properties. Every property is wrapped in OPTIONAL, so a
# resource is returned even when a property is missing. Alternate names are
# concatenated into a single ?alternateNames value separated by "#SEP#".
#
# NOTE(review): the `dbnlt` prefix is declared but never used in the query.
# NOTE(review): ?follows, ?funder, ?funding, ?height, ?knows and
#   ?knowsLanguage are matched in OPTIONAL clauses but are not selected.
# NOTE(review): group_concat is combined with non-aggregated variables and no
#   GROUP BY clause; this presumably relies on the endpoint being lenient --
#   confirm before running the query against another SPARQL engine.
find_authors = """
    PREFIX dbnlt:<http://data.bibliotheken.nl/id/dataset/dbnla>
    PREFIX schema:<http://schema.org/>
    SELECT DISTINCT
        ?s
        ?name
        ?identifier
        ?author_given_name
        ?author_family_name
        ?occupation
        ?birthPlace
        ?birthDate
        ?deathPlace
        ?deathDate
        ?gender
        ?affiliation
        ?alumniOf
        ?award
        ?children
        ?colleague
        ?jobTitle
        ?nationality
        ?parent
        ?spouse
        (group_concat(distinct ?alternateName ; separator = "#SEP#") AS ?alternateNames) WHERE {
        ?s schema:mainEntityOfPage/schema:isPartOf <http://data.bibliotheken.nl/id/dataset/dbnla> .
        OPTIONAL { ?s schema:name ?name . }
        OPTIONAL { ?s schema:identifier ?identifier . }
        OPTIONAL { ?s schema:givenName ?author_given_name . }
        OPTIONAL { ?s schema:familyName ?author_family_name . }
        OPTIONAL { ?s schema:hasOccupation ?occupation . }
        OPTIONAL { ?s schema:birthPlace ?birthPlace . }
        OPTIONAL { ?s schema:deathPlace ?deathPlace . }
        OPTIONAL { ?s schema:birthDate ?birthDate . }
        OPTIONAL { ?s schema:deathDate ?deathDate . }
        OPTIONAL { ?s schema:gender ?gender . }
        OPTIONAL { ?s schema:alternateName ?alternateName . }
        OPTIONAL { ?s schema:affiliation ?affiliation . }
        OPTIONAL { ?s schema:alumniOf ?alumniOf . }
        OPTIONAL { ?s schema:award ?award . }
        OPTIONAL { ?s schema:children ?children . }
        OPTIONAL { ?s schema:colleague ?colleague . }
        OPTIONAL { ?s schema:follows ?follows . }
        OPTIONAL { ?s schema:funder ?funder . }
        OPTIONAL { ?s schema:funding ?funding . }
        OPTIONAL { ?s schema:height ?height . }
        OPTIONAL { ?s schema:jobTitle ?jobTitle . }
        OPTIONAL { ?s schema:knows ?knows . }
        OPTIONAL { ?s schema:knowsLanguage ?knowsLanguage . }
        OPTIONAL { ?s schema:nationality ?nationality . }
        OPTIONAL { ?s schema:parent ?parent . }
        OPTIONAL { ?s schema:spouse ?spouse . }
    }
"""

In [30]:
# Run the author query; the trailing expression makes the notebook display
# how many result rows came back.
authors = run_query(find_authors)
len(authors)

120576

## Data processing

In [31]:
def get_key_value(sparql_object, key_name):
    """Return the ``"value"`` entry of one variable in a SPARQL result row.

    Args:
        sparql_object: One result row, expected to map variable names to
            binding dicts that carry a ``"value"`` key.
        key_name: Name of the variable to look up.

    Returns:
        The bound value, or ``None`` when the variable (or its ``"value"``
        entry) is missing or the row is not subscriptable by that key.
    """
    try:
        return sparql_object[key_name]["value"]
    except (KeyError, TypeError):
        # KeyError: the variable or its "value" entry is absent (for example
        # an OPTIONAL pattern that did not match). TypeError: the row or the
        # binding is None or otherwise not indexable by these keys.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

In [34]:
def sparql_to_jsonl(sparql_objects, is_title=True):
    """Convert SPARQL result rows into flat, JSON-serialisable dicts.

    Args:
        sparql_objects: Iterable of result rows as accepted by
            ``get_key_value``.
        is_title: When true, each record additionally gets an
            ``"author_identifier"`` key derived from the row's ``author``
            variable.

    Returns:
        A list with one dict per input row. Variables that are not bound in a
        row are stored as ``None``. ``"identifier"`` is the ``s`` URI with the
        ``.../id/nbt/`` and ``.../id/dbnla/`` prefixes removed,
        ``"alternate_names"`` is the ``alternateNames`` value split on
        ``"#SEP#"``, and ``"gender"`` is lower-cased with the
        ``http://schema.org/`` prefix removed.
    """
    works = []

    for sparql_object in sparql_objects:
        identifier = get_key_value(sparql_object, "s")
        # Guard against an unbound ?s instead of failing on None.replace().
        if identifier is not None:
            identifier = identifier.replace("http://data.bibliotheken.nl/id/nbt/", "") \
                                   .replace("http://data.bibliotheken.nl/id/dbnla/", "")

        alternate_names = get_key_value(sparql_object, "alternateNames")
        if alternate_names is not None:
            alternate_names = alternate_names.split("#SEP#")

        gender = get_key_value(sparql_object, "gender")
        if gender is not None:
            gender = gender.replace("http://schema.org/", "").lower()

        # Key order is kept stable: it determines the CSV column order and
        # the key order in the JSONL output.
        work = {
            "name": get_key_value(sparql_object, "name"),
            "identifier": identifier,
            "author_given_name": get_key_value(sparql_object, "author_given_name"),
            "author_family_name": get_key_value(sparql_object, "author_family_name"),
            "alternate_names": alternate_names,
            "occupation": get_key_value(sparql_object, "occupation"),
            "birthPlace": get_key_value(sparql_object, "birthPlace"),
            "birthDate": get_key_value(sparql_object, "birthDate"),
            "deathPlace": get_key_value(sparql_object, "deathPlace"),
            "deathDate": get_key_value(sparql_object, "deathDate"),
            "gender": gender,
            "affiliation": get_key_value(sparql_object, "affiliation"),
            "alumniOf": get_key_value(sparql_object, "alumniOf"),
            "award": get_key_value(sparql_object, "award"),
            "children": get_key_value(sparql_object, "children"),
            "colleague": get_key_value(sparql_object, "colleague"),
            "jobTitle": get_key_value(sparql_object, "jobTitle"),
            "nationality": get_key_value(sparql_object, "nationality"),
            "parent": get_key_value(sparql_object, "parent"),
            "spouse": get_key_value(sparql_object, "spouse"),
        }

        if is_title:
            author_identifier = get_key_value(sparql_object, "author")
            # A row without an ?author binding yields None rather than
            # raising AttributeError.
            if author_identifier is not None:
                author_identifier = author_identifier.replace("http://data.bibliotheken.nl/id/thes/", "")
            work["author_identifier"] = author_identifier

        works.append(work)

    return works

In [35]:
# Convert the raw bindings to plain dicts. is_title=False skips the
# "author_identifier" field, which needs an ?author variable that the author
# query does not select.
authors_jsonl = sparql_to_jsonl(authors, is_title=False)

In [36]:
# Display the number of converted records (one per result row).
len(authors_jsonl)

120576

## Data output

### As data frame (CSV)

In [37]:
import pandas as pd

In [38]:
def _join_alternate_names(value):
    """Collapse a list of alternate names into one '#SEP#'-separated string.

    Non-list values (such as None) are returned unchanged.
    """
    if not isinstance(value, list):
        return value
    return '#SEP#'.join(str(item) for item in value)


# Build a data frame from the author records, flatten the list-valued
# alternate_names column so it fits in a single CSV cell, and write the file.
output_df = pd.DataFrame.from_records(authors_jsonl)
output_df['alternate_names'] = output_df['alternate_names'].apply(_join_alternate_names)
output_df.to_csv("auteurs.csv")

### As JSON Lines (JSONL)

In [39]:
import json
import os

def write_works_jsonl(works, filename):
    """Write records to a JSON Lines file, replacing any existing content.

    Args:
        works: Iterable of JSON-serialisable records.
        filename: Path of the file to write.

    Each record is serialised with ``json.dumps`` and written on its own
    line, terminated by a newline.
    """
    # Mode "w" truncates an existing file, so no separate exists/remove step
    # is needed; the encoding is explicit so the output does not depend on
    # the platform default.
    with open(filename, "w", encoding="utf-8") as writer:
        writer.writelines(json.dumps(work) + "\n" for work in works)

In [40]:
# Write the author records to auteurs.jsonl, one JSON object per line.
write_works_jsonl(authors_jsonl, "auteurs.jsonl")