In [None]:
%%capture
# Install the packages listed in requirements.txt with the same interpreter that
# runs this kernel (sys.executable), so they land in the kernel's environment.
# The %%capture magic on the first line keeps pip's output out of the notebook.
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import json                       # json library to read json file formats
import requests                   # Uses the requests library for REST apis
import os                         # Loads operating system libraries
from ldaca.ldaca import LDaCA     # Loads the LDaCA ReST api wrapper
from rocrate_lang.utils import as_list # A handy utility for converting to list

In [None]:
from dotenv import load_dotenv    # loads environment variables from a file

# Where the collection lives: the API root and the identifier of the collection
LDACA_API = 'https://data.ldaca.edu.au/api'
COLLECTION_ID = 'arcp://name,doi10.26180%2F23961609'

# Load the variables kept in vars.env, then read the API key into API_TOKEN
# so the rest of the notebook can use it
load_dotenv('vars.env')
API_TOKEN = os.environ.get('API_KEY')

# An unset or empty key means no token has been configured yet
if API_TOKEN is None or API_TOKEN == '':
    print("Get a token from the portal, set a variable in the vars.env file named API_KEY, then restart the kernel.")


In [None]:
# Create the LDaCA API wrapper and retrieve the collection's ro-crate metadata

ldaca = LDaCA(
    url=LDACA_API,
    token=API_TOKEN,
    data_dir='data',
)
ldaca.retrieve_collection(
    collection=COLLECTION_ID,
    collection_type='Collection',
    data_dir='data',
)


In [None]:
# Keep a shorter handle on the crate exposed by the LDaCA wrapper as `ldaca.crate`
metadata = ldaca.crate

# Inspect the metadata (a bare last expression is shown as the cell's output)
metadata

In [None]:
# An entity's type is handled as a list throughout this notebook: values are
# passed through as_list before they are checked.
# We define a PRIMARY_OBJECT as a 'RepositoryObject' because that is where the main data is stored
PRIMARY_OBJECT = 'RepositoryObject'

In [None]:
# Containers filled in by this cell and the ones that follow
files = set()                 # file entities linked from the primary objects
types = []                    # every type value found, duplicates included
primary_object_types = []     # JSON-LD of each primary object

# Go through every entity in the crate and record all of its type values
for crate_entity in ldaca.crate.contextual_entities + ldaca.crate.data_entities:
    # as_list makes sure the type is a list before its values are collected
    types.extend(as_list(crate_entity.type))


In [None]:
# Show every distinct type that was collected.
# dict.fromkeys keeps only the first occurrence of each value, so duplicates are
# removed while the order in which the types were found is preserved.
list(dict.fromkeys(types))

In [None]:
# Types of PRIMARY_OBJECTs ie [PRIMARY_OBJECT, X]. What kinds of Xs do we have?
# Start from an empty list so that re-running this cell does not append the
# same objects a second time.
primary_object_types = []
for entity in ldaca.crate.contextual_entities + ldaca.crate.data_entities:
    # Compare against the PRIMARY_OBJECT constant rather than repeating its literal value
    if PRIMARY_OBJECT in as_list(entity.type):
        item = ldaca.crate.dereference(entity.id)
        # Keep the plain JSON-LD form of the object for the later cells
        primary_object_types.append(item.as_jsonld())

In [None]:
import pandas as pd  # this means we will refer to pandas as 'pd' throughout the code

# Flatten the list of primary-object JSON-LD records into a table; json_normalize
# expands nested dictionaries into dot-separated column names.
primary_objects_dataframe = pd.json_normalize(primary_object_types)
# Display the resulting DataFrame
primary_objects_dataframe

In [None]:
# Which files belong to each primary object? Gather every entity referenced through 'hasPart'.
for primary_object in primary_object_types:
    if 'hasPart' not in primary_object:
        continue  # this object has no parts to look up
    # as_list wraps the value so it can be looped over (presumably it may be a
    # single reference or a list of them); each reference is resolved by its '@id'
    files.update(
        ldaca.crate.dereference(part_ref.get('@id'))
        for part_ref in as_list(primary_object.get('hasPart'))
    )
print(f"{len(files)} files")
list(files)[:20]

In [None]:
# pick out the annotation files: those whose type includes 'DerivedMaterial'
annotations = {file for file in files if 'DerivedMaterial' in as_list(file.type)}

# from these annotations select only the plain-text files and keep their JSON-LD.
# `files` and `annotations` are sets, which have no defined iteration order, so the
# result is sorted by '@id' to make positions in `texts` the same on every run.
texts = sorted(
    (
        annotation.as_jsonld()
        for annotation in annotations
        if annotation.get('encodingFormat') == 'text/plain'
    ),
    key=lambda text_json: str(text_json.get('@id', '')),
)


print(f"We have {len(texts)} text objects")

In [None]:
# Take the second text entry. Slicing (rather than texts[1]) keeps the result a
# list and gives an empty list, not an error, when there are fewer than two entries.
text = texts[1:2]
# Pretty-print the selection as indented JSON, leaving the keys in their existing order
print(json.dumps(text, indent=2, sort_keys=False))

# Sample of one entry

In [None]:
# The '@id' of the selected entry is used as the URL to download from.
# NOTE(review): text[0] raises IndexError if the selection above is empty.
url = text[0]["@id"]
# Display the URL
url

# Downloading a file from the REST API

In [None]:
# Send the API token as a bearer token in the Authorization header
headers = {"Authorization": f"Bearer {API_TOKEN}"}

# Without a timeout, requests waits indefinitely if the server never answers;
# 30 seconds is a generous upper bound for a single file.
response = requests.get(url=url, headers=headers, timeout=30)

# Stop with an HTTPError on a 4xx/5xx answer (for example a missing or invalid
# token) instead of printing the error body as if it were the file's contents.
response.raise_for_status()

print(response.text)
