This notebook retrieves the metadata used to construct the network graph. The metadata is retrieved using the Trove API. The notebook is available as a Jupyter notebook.

Code in this notebook has been adapted from the following sources:
- Sherratt, Tim. (2023). trove-api-intro (version v1.0.0). Zenodo. https://doi.org/10.5281/zenodo.7545885

In [669]:
import datetime
import os
import re
import time
from tqdm import tqdm
from selenium import webdriver

import os
import requests
import fuzzysearch
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json
import spacy

import pandas as pd
import requests_cache
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Shared HTTP session: responses are cached by requests_cache, and requests that
# fail with a 5xx status are retried (up to 10 times, with back-off).
retries = Retry(total=10, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504])
s = requests_cache.CachedSession()
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))

In [670]:
# This creates a variable called 'api_key', paste your key between the quotes
# (or leave it blank and set the TROVE_API_KEY environment variable instead).
api_key = ""  # ADD YOUR API KEY HERE

# Use an api key value from environment variables if it is available and
# non-empty (useful for testing); otherwise keep the value pasted above.
api_key = os.getenv("TROVE_API_KEY") or api_key

# Confirm which key is in use without writing the whole secret into the saved
# notebook output: only the last four characters of longer keys are shown.
if len(api_key) > 8:
    masked_key = "*" * (len(api_key) - 4) + api_key[-4:]
else:
    masked_key = "*" * len(api_key)
print("Your API key is: {}".format(masked_key))

Your API key is: 


In [671]:

# Use HTTPS: the API key is sent as a query parameter and must not travel in clear text.
# NOTE(review): Trove has since published a v3 API -- confirm the v2 endpoint is still
# being served before re-running this notebook.
api_url = "https://api.trove.nla.gov.au/v2/result"

# This creates a dictionary called 'params' and sets values for the API's mandatory parameters
params = {
    "q": " ",  # Search for this keyword -- feel free to change!
    "zone": "collection",
    "encoding": "json",
    # Facet limits applied to the search
    "l-format": "Unpublished",
    "l-australian": "y",
    "l-occupation": "Academics",
    "key": api_key,
    # Records per request; the harvesting cell resets this to "100" before paging
    "n": "10000",
    # Ask for holdings so each record's library (nuc) and call number are returned
    "include": "holdings",
    # Page through results with the 's' / 'nextStart' cursor
    "bulkHarvest": "true",
}

In [672]:
# Initial request: only the total number of matching records is read from it.
response = s.get(api_url, params=params, timeout=60)
# Fail with a clear HTTP error (e.g. missing or invalid API key) instead of an
# obscure JSON decoding error or KeyError on the lines below.
response.raise_for_status()
data = response.json()
total = int(data["response"]["zone"][0]["records"]["total"])
print("There are", total, "records in the collection zone")

There are 473 records in the collection zone


In [673]:
# Harvest every matching record page by page, keeping only those whose first
# holding belongs to the National Library of Australia (nuc "ANL") and carries
# a structured call number with a 'localIdentifier'.
items = []
start = "*"  # cursor value for the first page
params["n"] = "100"  # Set the initial number of records per request to 100

while start:
    params["s"] = start
    response = s.get(api_url, params=params, timeout=60)
    response.raise_for_status()  # stop on HTTP errors rather than parsing an error page
    data = response.json()
    records = data["response"]["zone"][0]["records"]

    # A page can come back without a "work" list; treat that as an empty page
    for record in records.get("work", []):
        try:
            # NOTE(review): only the first holding is inspected, so records where
            # ANL is listed after another library are skipped -- confirm this is intended.
            holding = record["holding"][0]
            if holding["nuc"] != "ANL":
                continue

            call_number = holding["callNumber"][0]
            if isinstance(call_number, str):
                # skip the item if the call number is a plain string (no localIdentifier)
                continue

            # Add the item to the list of items
            items.append(
                {
                    "trove_id": record["id"],
                    "title": record["title"],
                    "trove_url": record["troveUrl"],
                    "holding": call_number["localIdentifier"],
                    "location": holding["nuc"],
                }
            )
        except (KeyError, IndexError):
            # Skip records with missing keys or empty holding / call number lists
            continue

    # No 'nextStart' means this was the last page, which ends the loop
    start = records.get("nextStart")

    if len(items) >= total:
        break

    time.sleep(0.2)  # Add a delay between API requests to avoid overloading


In [674]:
# Report how many records survived the ANL / call-number filter
print(f"{len(items)} items retrieved")

282 items retrieved


In [675]:
# One row per harvested item; preview the first five rows
df = pd.DataFrame(data=items)
df.head(n=5)

Unnamed: 0,trove_id,title,trove_url,holding,location
0,10201186,Papers of Otto Van der Sprenkel,https://trove.nla.gov.au/work/10201186,2788171,ANL
1,10347079,Papers of Dorothy Green,https://trove.nla.gov.au/work/10347079,1272711,ANL
2,10642789,Papers of John W. Burton,https://trove.nla.gov.au/work/10642789,2877417,ANL
3,10738203,Papers of Cameron Hazlehurst,https://trove.nla.gov.au/work/10738203,2912363,ANL
4,10738291,Papers of James Brigden,https://trove.nla.gov.au/work/10738291,2877442,ANL


In [676]:
import requests
from bs4 import BeautifulSoup
import time

def collect_archive_metadata(url):
    """Scrape one catalogue record page into ``{title: {field: value}}``.

    The title is the stripped text of the page's ``<h5 class="header_title">``
    element. Each ``<tr>`` of the ``<table class="recDisplay">`` that has both a
    ``<th>`` and a ``<td>`` contributes one ``field: value`` pair (stripped
    text). If the same ``<th>`` label appears more than once, the last row wins.

    Parameters:
        url: address of the record page to fetch.

    Returns:
        A one-entry dictionary mapping the record title to its metadata dictionary.

    Raises:
        requests.RequestException: the page could not be fetched, timed out, or
            returned an HTTP error status.
        ValueError: the page has no title element or no metadata table.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the title of the record and strip the text
    title_tag = soup.find('h5', attrs={'class': 'header_title'})
    if title_tag is None:
        raise ValueError(f"No <h5 class='header_title'> element found at {url}")
    title_element = title_tag.text.strip()

    # Find the table with metadata information
    table = soup.find('table', class_='recDisplay')
    if table is None:
        raise ValueError(f"No <table class='recDisplay'> element found at {url}")

    manuscript_metadata = {}  # Dictionary to store the extracted data

    for row in table.find_all('tr'):
        th = row.find('th')  # label cell
        td = row.find('td')  # value cell
        # Rows lacking either cell carry no label/value pair and are ignored
        if th is not None and td is not None:
            manuscript_metadata[th.text.strip()] = td.text.strip()

    # Key the metadata by the record title
    return {title_element: manuscript_metadata}

def construct_NLA_url(df):
    """Map every value in ``df['holding']`` to its catalogue record URL.

    Each holding identifier is converted to a string; that string is both the
    dictionary key and the final path segment of the URL. Repeated identifiers
    collapse into a single entry.
    """
    base_url = "https://catalogue.nla.gov.au/Record/"
    return {str(holding): base_url + str(holding) for holding in df['holding']}


NLA_archive_metadata = construct_NLA_url(df)

# Scraped metadata for every record, keyed by record title
total_results_metadata = {}
# Pages that could not be scraped, mapped to the error message, so that one bad
# page does not abort the whole run and can be retried afterwards
failed_urls = {}

start_time = time.time()  # Start the timer

for index, (holding, url) in enumerate(NLA_archive_metadata.items()):
    metadata_start_time = time.time()  # Start the timer for each item
    try:
        metadata = collect_archive_metadata(url)
    except (requests.RequestException, AttributeError, ValueError) as error:
        # Network/HTTP failure, or a page without the expected title/table elements
        failed_urls[url] = str(error)
        print(f"Skipped item {index+1}/{len(NLA_archive_metadata)} ({url}): {error}")
        continue

    # Results are keyed by title, so two records sharing a title would silently
    # overwrite each other -- make that visible.
    repeated_titles = metadata.keys() & total_results_metadata.keys()
    if repeated_titles:
        print(f"Warning: duplicate title {sorted(repeated_titles)} at {url} overwrites an earlier record.")

    total_results_metadata.update(metadata)
    metadata_end_time = time.time()  # Stop the timer for each item
    elapsed_time = metadata_end_time - metadata_start_time
    print(f"Processed item {index+1}/{len(NLA_archive_metadata)}. Elapsed time: {elapsed_time:.2f} seconds.")

end_time = time.time()  # Stop the timer
total_elapsed_time = end_time - start_time

print(f"Total elapsed time: {total_elapsed_time:.2f} seconds.")
if failed_urls:
    print(f"{len(failed_urls)} page(s) could not be scraped; see failed_urls.")

Processed item 1/282. Elapsed time: 0.88 seconds.
Processed item 2/282. Elapsed time: 1.20 seconds.
Processed item 3/282. Elapsed time: 1.65 seconds.
Processed item 4/282. Elapsed time: 0.70 seconds.
Processed item 5/282. Elapsed time: 0.54 seconds.
Processed item 6/282. Elapsed time: 0.77 seconds.
Processed item 7/282. Elapsed time: 0.61 seconds.
Processed item 8/282. Elapsed time: 0.98 seconds.
Processed item 9/282. Elapsed time: 0.77 seconds.
Processed item 10/282. Elapsed time: 0.73 seconds.
Processed item 11/282. Elapsed time: 0.58 seconds.
Processed item 12/282. Elapsed time: 0.68 seconds.
Processed item 13/282. Elapsed time: 0.96 seconds.
Processed item 14/282. Elapsed time: 0.67 seconds.
Processed item 15/282. Elapsed time: 0.68 seconds.
Processed item 16/282. Elapsed time: 0.60 seconds.
Processed item 17/282. Elapsed time: 0.68 seconds.
Processed item 18/282. Elapsed time: 1.45 seconds.
Processed item 19/282. Elapsed time: 1.24 seconds.
Processed item 20/282. Elapsed time: 0.7

In [677]:
#save total_results_metadata as a json file
# Write the scraped metadata to disk as indented JSON
with open('total_results_metadata.json', 'w') as json_file:
    json_file.write(json.dumps(total_results_metadata, indent=4))

In [678]:
# Build a dictionary of named entities per record, keyed by the same record
# title that total_results_metadata uses.

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# OPTIONAL: to work from a saved JSON file instead of the in-memory results,
# uncomment the two lines below and iterate over `data` in the loop instead.
#with open('ADD FILE NAME HERE') as file:
#    data = json.load(file)

# spaCy's English pipelines label entities "ORG", "GPE", "LOC" and "PERSON";
# they never emit "ORGANIZATION" or "LOCATION". Map the labels the model does
# produce onto the category names used in the output, otherwise only PERSON
# entities would ever be collected.
SPACY_LABEL_TO_CATEGORY = {
    "ORG": "ORGANIZATION",
    "GPE": "LOCATION",
    "LOC": "LOCATION",
    "PERSON": "PERSON",
}

# Metadata fields whose free text is scanned for entities
ENTITY_TEXT_FIELDS = ['Description', 'Summary', 'Notes', 'Biography/History']

# Dictionary to store named entities with the same key as total_results_metadata
same_key_entities = {}

for entry_key, entry_value in total_results_metadata.items():
    # Use lists to store named entities by type (duplicates are kept)
    named_entities = {"ORGANIZATION": [], "LOCATION": [], "PERSON": []}
    for field in ENTITY_TEXT_FIELDS:
        text = entry_value.get(field, '')  # empty string if the field is missing
        if not text:
            continue  # nothing to analyse; skip the NLP pipeline
        doc = nlp(text)  # Process the text with spaCy's NLP pipeline
        for entity in doc.ents:
            category = SPACY_LABEL_TO_CATEGORY.get(entity.label_)
            if category is not None:
                named_entities[category].append(entity.text)

    same_key_entities[entry_key] = named_entities  # Store under the same key

# Save to JSON file
with open('named_entities.json', 'w') as file:
    json.dump(same_key_entities, file, indent=4)


In [679]:
#merge same_key_entities into total_results_metadata dictionary and save as new dictionary called total_results_metadata_with_entities
# Combine each record's scraped metadata with its named entities.
# A new dictionary is built per record so that total_results_metadata itself is
# left untouched; updating its inner dictionaries in place would make the two
# structures share the same objects and make this cell unsafe to re-run.
total_results_metadata_with_entities = {
    entry_key: {**entry_value, **same_key_entities.get(entry_key, {})}
    for entry_key, entry_value in total_results_metadata.items()
}

#save total_results_metadata_with_entities as a json file for safekeeping
with open('total_results_metadata_with_entities.json', 'w') as f:
    json.dump(total_results_metadata_with_entities, f, indent=4)
        



In [680]:
# If you are working from saved JSON files, the script below merges the two
# files: every key of the first file that also exists in the second has its
# values merged into the second file's entry, and the result is written to
# 'merged.json'.
# The script is deliberately disabled by being wrapped in a string literal;
# remove the surrounding triple quotes to run it. Because the string is the
# cell's only expression, Jupyter echoes it as the cell output.
"""
# Load the data from json_one
with open('authors_total_results_same_key_entitiesTRYFIX.json') as file:
    json_one_data = json.load(file)

# Load the data from json_two
with open('authors_total_results_metadata.json') as file:
    json_two_data = json.load(file)

# Iterate over each entry in json_one_data
for key, entities in json_one_data.items():
    # Check if the key exists in json_two_data
    if key in json_two_data:
        # Merge the information from json_one_data onto json_two_data
        json_two_data[key].update(entities)

# Save the updated json_two_data to a file
with open('merged.json', 'w') as file:
    json.dump(json_two_data, file, indent=4)

"""

"\n# Load the data from json_one\nwith open('authors_total_results_same_key_entitiesTRYFIX.json') as file:\n    json_one_data = json.load(file)\n\n# Load the data from json_two\nwith open('authors_total_results_metadata.json') as file:\n    json_two_data = json.load(file)\n\n# Iterate over each entry in json_one_data\nfor key, entities in json_one_data.items():\n    # Check if the key exists in json_two_data\n    if key in json_two_data:\n        # Merge the information from json_one_data onto json_two_data\n        json_two_data[key].update(entities)\n\n# Save the updated json_two_data to a file\nwith open('merged.json', 'w') as file:\n    json.dump(json_two_data, file, indent=4)\n\n"

In [681]:
#TEMPORARY CODE BLOCK TO RESOLVE ISSUES

import re

def _clean_author_name(author_name):
    """Turn a heading such as "Green, Dorothy, 1915-1991" into "Dorothy Green".

    Digits and punctuation are removed and the text before the first comma is
    moved to the end. Letters from any alphabet are kept, so accented names
    (e.g. "Müller") are no longer mangled by an ASCII-only filter.
    """
    # Remove numeric values (e.g. life dates)
    name = re.sub(r'\d+', '', author_name)
    # Split on the first comma, swap the two parts, and re-join them with a space
    name = ' '.join(name.split(',', 1)[::-1])
    # Drop everything that is not a letter, digit or space (underscore included)
    name = re.sub(r'(?:[^\w ]|_)+', '', name)
    # Collapse runs of whitespace and trim both ends
    return re.sub(r'\s+', ' ', name).strip()


for entry_key, entry_value in total_results_metadata_with_entities.items():
    # A missing 'Author' field yields an empty cleaned name
    entry_value['cleaned_author_name'] = _clean_author_name(entry_value.get('Author', ''))

# Now you can proceed with further processing or create a DataFrame


In [682]:

# extract information on archival extent for each archive from "Description"
# split on last '\n' and take the last element of the list

for entry_key, entry_value in total_results_metadata_with_entities.items():
    # Records without a 'Description' field get an empty extent instead of
    # raising KeyError (the other cells also read fields with .get)
    description = entry_value.get('Description', '')
    # Keep only the text after the last newline of the description
    last_line = description.rsplit('\n', 1)[-1]
    # Cut at the first opening bracket and remove trailing whitespace
    entry_value['archival_extent'] = last_line.split('(', 1)[0].rstrip()



In [683]:
#convert dictionary to dataframe but only include Bib ID, cleaned_author_name, archival_extent, and named entities.
#this will be used to create the network visualisation
#explode the named entities so that each named entity is in a separate row with the same Bib ID, cleaned_author_name, and archival_extent
#this will allow us to create a network visualisation with the named entities as nodes and the Bib ID, cleaned_author_name, and archival_extent as attributes

#convert dictionary to dataframe
# Columns carried into the network table; fields a record lacks come out empty
network_columns = [
    'Bib ID',
    'Author',
    'Description',
    'Summary',
    'Notes',
    'Biography/History',
    'ORGANIZATION',
    'LOCATION',
    'PERSON',
    'cleaned_author_name',
    'archival_extent',
]

# One row per record (indexed by the dictionary key), restricted to the columns above
entity_new_df = pd.DataFrame.from_dict(
    total_results_metadata_with_entities,
    orient='index',
    columns=network_columns,
)

# Give every PERSON entity its own row, repeating the record's other columns
explode_entity_new_df = entity_new_df.explode(column='PERSON')

# Save for safekeeping; the index is not written to the file
explode_entity_new_df.to_csv('explode_entity_new_df.csv', index=False)





