In [35]:
import sys
!{sys.executable} -m pip install -r requirements.txt

Collecting ldaca@ git+https://github.com/Language-Research-Technology/ldaca-py.git@v0.0.5
  Cloning https://github.com/Language-Research-Technology/ldaca-py.git (to revision v0.0.5) to /private/var/folders/8q/x8j9p8c137q04zdt2zlqjx9w0000gn/T/pip-install-rjf9yw43/ldaca_f51aca48ce0e4d5dbc6a821b8a1ff074
  Running command git clone -q https://github.com/Language-Research-Technology/ldaca-py.git /private/var/folders/8q/x8j9p8c137q04zdt2zlqjx9w0000gn/T/pip-install-rjf9yw43/ldaca_f51aca48ce0e4d5dbc6a821b8a1ff074
  Running command git checkout -q 53f3d31c7e84a183f6ed8b4f131aa039198da341
  Resolved https://github.com/Language-Research-Technology/ldaca-py.git to commit 53f3d31c7e84a183f6ed8b4f131aa039198da341
Collecting rocrate_lang@ git+https://github.com/Language-Research-Technology/rocrate-lang-py
  Cloning https://github.com/Language-Research-Technology/rocrate-lang-py to /private/var/folders/8q/x8j9p8c137q04zdt2zlqjx9w0000gn/T/pip-install-rjf9yw43/rocrate-lang_8960bd23533f4b1f8f0016d1

In [36]:
import json                       # json library to read json file formats
import requests                   # Uses the requests library for REST apis
import os                         # Loads operating system libraries
from ldaca.ldaca import LDaCA     # Loads the LDaCA ReST api wrapper
from rocrate_lang.utils import as_list # A handy utility for converting to list

In [37]:
# Specify location where collection is
LDACA_API = 'https://data.atap.edu.au/api'
COLLECTION_ID = 'arcp://name,cooee-corpus/corpus/root'

from dotenv import load_dotenv    # loads environment variables
load_dotenv('vars.env') # load the environment variables located in the vars.env files
API_TOKEN = os.getenv('API_KEY') # store your environment variable in this jupyter notebook
if not API_TOKEN:
    print("Set a variable in the vars.env file and name API_KEY")


In [38]:
# Get the ro-crate metadata

ldaca = LDaCA(url=LDACA_API, token=API_TOKEN, data_dir='data')
ldaca.retrieve_collection(collection=COLLECTION_ID, collection_type='Collection', data_dir='data')


In [39]:
metadata = ldaca.crate

# Inspect the metadata
metadata

<rocrate_lang.rocrate_plus.ROCratePlus at 0x7fd8b0780670>

In [40]:
# TYPE values should be lists.
# We define a PRIMARY_OBJECT as a 'RepositoryObject' because that is where the main data is stored
PRIMARY_OBJECT = 'RepositoryObject'

In [41]:
# Find all types and find types that have linked objects
files = set()
types = list()
primary_object_types = list()

# Lets see what we can find in our metadata
for entity in ldaca.crate.contextual_entities + ldaca.crate.data_entities:
    entity_type = as_list(entity.type)  # We make sure that each type is a list
    for e_t in entity_type:
        types.append(e_t)


In [42]:
# Print the variables
# All the types, removing duplicates
list(dict.fromkeys(types))

['OrganizationReuseLicense',
 'Book',
 'website',
 'Person',
 'DefinedTerm',
 'CreativeWork',
 'Language',
 'PrimaryText',
 'RepositoryObject',
 'PropertyValue',
 'SoftwareSourceCode',
 'CreateAction',
 'File',
 'DerivedText']

In [43]:
# Types of PRIMARY_OBJECTs ie [PRIMARY_OBJECT, X]. What kinds of Xs do we have?
for entity in ldaca.crate.contextual_entities + ldaca.crate.data_entities:
    if 'RepositoryObject' in as_list(entity.type):
        print(entity.get('name'))
        item = ldaca.crate.dereference(entity.id)
        primary_object_types.append(item.as_jsonld())

Text 1-001 1788 Phillip, Arthur
Text 1-002 1788 Phillip, Arthur
Text 1-003 1788 Phillip, Arthur
Text 1-004 1788 Phillip, Arthur
Text 1-005 1788 Phillip, Arthur
Text 1-006 1788 Phillip, Arthur
Text 1-007 1788 Phillip, Arthur
Text 1-008 1788 Phillip, Arthur
Text 1-009 1788 Bench of Magistrates
Text 1-010 1788 Fowell, Newton
Text 1-011 1788 Fowell, Newton
Text 1-012 1788 Convict Woman
Text 1-013 1788 Phillip, Arthur
Text 1-014 1788 Worgan, George Bouchier
Text 1-015 1789 Tench, Watkin
Text 1-016 1790 White, John*
Text 1-017 1790 Surgeon's Mate
Text 1-018 1790 Convict
Text 1-019 1790 Phillip, Arthur
Text 1-020 1790 Phillip, Arthur
Text 1-021 1790 NN
Text 1-022 1790 Convict Woman
Text 1-023 1790 Johnson, Richard
Text 1-024 1791 Barrington, George
Text 1-025 1791 Phillip, Arthur
Text 1-026 1791 Macarthur, Elizabeth
Text 1-027 1791 Martin, James
Text 1-028 1791 Convict
Text 1-029 1792 Phillip, Arthur
Text 1-030 1792 Grose, Francis
Text 1-031 1792 Phillip, Arthur
Text 1-032 1792 Reibey, Mary
T

In [44]:
import pandas as pd  # this means we will refer to pandas as 'pd' throughout the code

primary_objects_dataframe = pd.json_normalize(primary_object_types)
primary_objects_dataframe

Unnamed: 0,@id,@type,name,dateCreated,hasPart,conformsTo.@id,author.@id,register.@id,linguisticGenre.@id,citation.@id,modality.@id,language.@id,indexableText.@id,@reverse.hasMember
0,"arcp://name,cooee-corpus/item/1-001",RepositoryObject,"Text 1-001 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_PrW,#register_PrW,"arcp://name,cooee-corpus/work/Niall1998p10-11",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1,"arcp://name,cooee-corpus/item/1-002",RepositoryObject,"Text 1-002 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Niall1998p12-13",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
2,"arcp://name,cooee-corpus/item/1-003",RepositoryObject,"Text 1-003 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Clark1977p44-46",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
3,"arcp://name,cooee-corpus/item/1-004",RepositoryObject,"Text 1-004 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Clark1977p47-48",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
4,"arcp://name,cooee-corpus/item/1-005",RepositoryObject,"Text 1-005 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Clark1977p52",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352,"arcp://name,cooee-corpus/item/4-421",RepositoryObject,"Text 4-421 1897 Turner, George",1897,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Turner_George-...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1353,"arcp://name,cooee-corpus/item/4-422",RepositoryObject,"Text 4-422 1898 Lyne, William John",1898,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Lyne_William J...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1354,"arcp://name,cooee-corpus/item/4-423",RepositoryObject,"Text 4-423 1898 Downer, John William",1898,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Downer_John Wi...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1355,"arcp://name,cooee-corpus/item/4-424",RepositoryObject,"Text 4-424 1898 Kingston, Charles Cameron",1898,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/textcommons/profile#O...,"arcp://name,cooee-corpus/author/Kingston_Charl...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",txc:Orthography,https://www.ethnologue.com/language/eng,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."


In [45]:
# Types of File that are in each primary object. What kinds of files do we have?
for entity in primary_object_types:
    if 'hasPart' in entity:
        hasPart = entity.get('hasPart')
        for part in as_list(hasPart):
            file = ldaca.crate.dereference(part.get('@id'))
            files.add(file)
print(f"{len(files)} files")
files

2714 files


{<https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/1-156.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-241-plain.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-255-plain.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/4-360-plain.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/1-052-plain.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-152-plain.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/1-006-plain.txt ['File', 'DerivedText']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-212.tx

In [46]:
annotations = set()
texts = list()

# pick out the annotation files
for file in files:
    if 'DerivedText' in as_list(file.type):
        annotations.add(file)

# from this annotations select only the CSVs

for annotation in annotations:
    if annotation.get('encodingFormat') == 'text/plain':
        selected_file_json = annotation.as_jsonld()
        texts.append(selected_file_json)


print(f"We have {len(texts)} text objects")

We have 2714 text objects


In [47]:
text = texts[1:2]
print(json.dumps(text, indent=2, sort_keys=False))

[
  {
    "@id": "https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/1-156.txt",
    "@type": [
      "File",
      "DerivedText"
    ],
    "name": "Text 1-156 1816 NN - text with metadata codes",
    "modality": {
      "@id": "txc:Orthography"
    },
    "annotationOf": {
      "@id": "arcp://name,cooee-corpus/work/Ward1969p91-94"
    },
    "language": {
      "@id": "https://www.ethnologue.com/language/eng"
    },
    "encodingFormat": "text/plain",
    "size": 8841,
    "@reverse": {
      "hasPart": [
        {
          "@id": "arcp://name,cooee-corpus/corpus/root/"
        },
        {
          "@id": "arcp://name,cooee-corpus/item/1-156"
        }
      ]
    }
  }
]


# Sample of one entry

In [48]:
url = text[0]["@id"]
url

'https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/1-156.txt'

# Downloading a file from the ReST API

In [49]:
import requests

headers = {"Authorization": "Bearer %s" % API_TOKEN}
response = requests.get(url=url, headers=headers)

print(response.request.headers)
print(response.text)

{'User-Agent': 'python-requests/2.26.0', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive', 'Authorization': 'Bearer 28b7afba-02ad-4039-9185-67970def8aec'}



<source><g=m><o=b><age=un><status=2><abode=un><p=vdl><r=prw><tt=di><1-156>
On seeing them (a native tribe numbering about two hundred) approach we pulled the boat out a little from the shore, leaving Tolo (a leading native) on the rock. We got out our arms and examined them to see if they were in firing order, and afterwards held up three or four seals' carcasses, and acquainted the natives we wished to trade for kangaroo skins. Tolo ordered ten women to go into the water each loaded with kangaroo skins and flesh. We gave them in return the carcasses, and they carried them to their tribe, returning immediately to the boat with more skins as payment. We then requested Tolo to fill our kegs with fresh water, which he did, but we would not let them take more than one keg at a time, for fear they