In [1]:
%%capture
# %%capture silences this cell's output so the pip log does not clutter the notebook.
import sys
# Run pip through sys.executable (the interpreter running this kernel) to install
# the packages listed in requirements.txt.
!{sys.executable} -m pip install -r requirements.txt

In [2]:
import json                       # json library to read json file formats
import requests                   # Uses the requests library for REST apis
import os                         # Loads operating system libraries
from ldaca.ldaca import LDaCA     # Loads the LDaCA ReST api wrapper
from rocrate_lang.utils import as_list # A handy utility for converting to list

In [3]:
from dotenv import load_dotenv    # loads environment variables

# Where the collection lives: the API endpoint and the collection's identifier
LDACA_API = 'https://data.atap.edu.au/api'
COLLECTION_ID = 'arcp://name,corpus-of-oz-early-english'

# Load the variables declared in the vars.env file, then read the API key from
# the environment so the token itself is never written into this notebook.
load_dotenv('vars.env')
API_TOKEN = os.getenv('API_KEY')

# Without a token, only tell the reader how to obtain and configure one.
if not API_TOKEN:
    print("Get a token from the portal, set a variable in the vars.env file named API_KEY, then restart the kernel.")


In [4]:
# Get the ro-crate metadata: create the API client, then retrieve the collection.
# Both calls are pointed at the local 'data' directory.
ldaca = LDaCA(
    url=LDACA_API,
    token=API_TOKEN,
    data_dir='data',
)
ldaca.retrieve_collection(
    collection=COLLECTION_ID,
    collection_type='Collection',
    data_dir='data',
)


In [5]:
# Keep a handle on the crate object that the client exposes as `ldaca.crate`.
metadata = ldaca.crate

# Inspect the metadata: as the last expression of the cell, its repr becomes the cell output.
metadata

<rocrate_lang.rocrate_plus.ROCratePlus at 0x7fcfd0976700>

In [6]:
# An entity's type may hold more than one value, so the cells below always
# normalise it to a list (with as_list) before inspecting it.
# PRIMARY_OBJECT names the type we treat as primary: 'RepositoryObject',
# because that is where the main data is stored.
PRIMARY_OBJECT = 'RepositoryObject'

In [7]:
# Containers filled in by later cells: linked files and the primary objects' JSON-LD
files = set()
primary_object_types = list()

# Let's see what we can find in our metadata: gather the type of every contextual
# and data entity. as_list guarantees each entity's type is handled as a list, so
# an entity with several types contributes several entries (duplicates are kept).
types = [
    type_name
    for entity in ldaca.crate.contextual_entities + ldaca.crate.data_entities
    for type_name in as_list(entity.type)
]


In [8]:
# Show all the types collected above, with duplicates removed.
# dict.fromkeys keeps only the first occurrence of each key, so the order in
# which the types were first seen is preserved.
list(dict.fromkeys(types))

['Person',
 'Book',
 'OrganizationReuseLicense',
 'RepositoryObject',
 'PropertyValue',
 'PersonSnapshot',
 'DefinedTerm',
 'PrimaryMaterial',
 'Language',
 'website',
 'CreativeWork',
 'GeoCoordinates',
 'SoftwareSourceCode',
 'CreateAction',
 'File',
 'Dataset',
 'DerivedMaterial']

In [9]:
# Types of PRIMARY_OBJECTs ie [PRIMARY_OBJECT, X]. What kinds of Xs do we have?
# Collect the JSON-LD form of every entity whose type list contains PRIMARY_OBJECT.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell cannot duplicate records, and the PRIMARY_OBJECT constant is used
# instead of repeating its literal value.
primary_object_types = [
    ldaca.crate.dereference(entity.id).as_jsonld()
    for entity in ldaca.crate.contextual_entities + ldaca.crate.data_entities
    if PRIMARY_OBJECT in as_list(entity.type)
]

In [10]:
import pandas as pd  # this means we will refer to pandas as 'pd' throughout the code

# Flatten the list of JSON-LD records into a table with one row per primary object.
# Nested objects become dotted column names (for example 'author.@id').
primary_objects_dataframe = pd.json_normalize(primary_object_types)
# Display the table as the cell output.
primary_objects_dataframe

Unnamed: 0,@id,@type,name,dateCreated,hasPart,conformsTo.@id,author.@id,register.@id,linguisticGenre.@id,citation.@id,modality.@id,language.@id,indexableText.@id,@reverse.hasMember
0,"arcp://name,cooee-corpus/item/1-001",RepositoryObject,"Text 1-001 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_PrW,#register_PrW,"arcp://name,cooee-corpus/work/Niall1998p10-11",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1,"arcp://name,cooee-corpus/item/1-002",RepositoryObject,"Text 1-002 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Niall1998p12-13",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
2,"arcp://name,cooee-corpus/item/1-003",RepositoryObject,"Text 1-003 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Clark1977p44-46",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
3,"arcp://name,cooee-corpus/item/1-004",RepositoryObject,"Text 1-004 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Clark1977p47-48",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
4,"arcp://name,cooee-corpus/item/1-005",RepositoryObject,"Text 1-005 1788 Phillip, Arthur",1788,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Phillip_Arthur...",#register_GE,#register_GE,"arcp://name,cooee-corpus/work/Clark1977p52",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352,"arcp://name,cooee-corpus/item/4-421",RepositoryObject,"Text 4-421 1897 Turner, George",1897,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Turner_George-...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1353,"arcp://name,cooee-corpus/item/4-422",RepositoryObject,"Text 4-422 1898 Lyne, William John",1898,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Lyne_William J...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1354,"arcp://name,cooee-corpus/item/4-423",RepositoryObject,"Text 4-423 1898 Downer, John William",1898,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Downer_John Wi...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."
1355,"arcp://name,cooee-corpus/item/4-424",RepositoryObject,"Text 4-424 1898 Kingston, Charles Cameron",1898,[{'@id': 'https://data.atap.edu.au/api/stream?...,https://purl.archive.org/language-data-commons...,"arcp://name,cooee-corpus/author/Kingston_Charl...",#register_SB,#register_SB,"arcp://name,cooee-corpus/work/Federation_Debat...",https://purl.archive.org/language-data-commons...,https://glottolog.org/resource/languoid/id/sta...,https://data.atap.edu.au/api/stream?id=arcp://...,"[{'@id': 'arcp://name,cooee-corpus/corpus/root..."


In [11]:
# Types of File that are in each primary object. What kinds of files do we have?
# Resolve every 'hasPart' reference of every primary object to its crate entity.
for primary_object in primary_object_types:
    if 'hasPart' not in primary_object:
        continue  # this object lists no parts
    for part in as_list(primary_object.get('hasPart')):
        # `files` is a set, so a file referenced more than once is stored only once
        files.add(ldaca.crate.dereference(part.get('@id')))

print(f"{len(files)} files")
# Preview at most 20 of the collected files
list(files)[:20]

2714 files


[<https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/4-008-plain.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-301-plain.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/4-337-plain.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-108.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/2-205-plain.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-079-plain.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/4-369-plain.txt ['File', 'DerivedMaterial']>,
 <https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/co

In [12]:
# Pick out the annotation files: those whose type list includes 'DerivedMaterial'
annotations = {
    candidate for candidate in files
    if 'DerivedMaterial' in as_list(candidate.type)
}

# From these annotations keep only the plain-text files (encodingFormat 'text/plain'),
# stored in their JSON-LD form
texts = [
    annotation.as_jsonld()
    for annotation in annotations
    if annotation.get('encodingFormat') == 'text/plain'
]

print(f"We have {len(texts)} text objects")

We have 2714 text objects


In [13]:
# Take the second text object as a sample. The slice keeps it wrapped in a list
# (and yields an empty list instead of failing when fewer than two texts exist).
text = texts[1:2]
# Pretty-print the sample with its keys in their original order
print(json.dumps(text, indent=2))

[
  {
    "@id": "https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-301-plain.txt",
    "@type": [
      "File",
      "DerivedMaterial"
    ],
    "name": "Text 3-301 1874 Randell, G. - text",
    "annotationOf": {
      "@id": "arcp://name,cooee-corpus/work/Warburton1875p305-06"
    },
    "modality": {
      "@id": "https://purl.archive.org/language-data-commons/terms#WrittenLanguage"
    },
    "language": {
      "@id": "https://glottolog.org/resource/languoid/id/stan1293"
    },
    "encodingFormat": "text/plain",
    "size": 1365,
    "@reverse": {
      "hasPart": [
        {
          "@id": "arcp://name,cooee-corpus/corpus/root/"
        },
        {
          "@id": "arcp://name,cooee-corpus/item/3-301"
        }
      ],
      "indexableText": [
        {
          "@id": "arcp://name,cooee-corpus/item/3-301"
        }
      ]
    }
  }
]


# Sample of one entry

In [14]:
# The '@id' of the sampled text object is the address the file is streamed from.
url = text[0]["@id"]
# Show the URL as the cell output.
url

'https://data.atap.edu.au/api/stream?id=arcp://name,cooee-corpus/corpus/root&path=data/3-301-plain.txt'

# Downloading a file from the REST API

In [15]:
# Download the sampled file, authenticating with the API token as a bearer token.
headers = {"Authorization": f"Bearer {API_TOKEN}"}

# requests has no default timeout; set one so the cell cannot hang indefinitely
# if the server stops responding.
response = requests.get(url=url, headers=headers, timeout=30)

# Stop with a clear HTTP error (for example when the token is missing or invalid)
# instead of printing an error body as though it were the text of the document.
response.raise_for_status()

print(response.text)





To Colonel Peter Egerton Warburton., Leader of the South Australian Exploring Expedition.
SIR, - We the Council and Burgesses of the city of Perth congratulate you on the successful termination of your enterprise. The brave spirit which animated Captain Roe, the Gregorys, and Austin, on our side of the continent, equally impelled Oxley, Sturt, Mitchell, Leichhardt, Stuart, and yourself on its eastern side. The results to civilization have been vast, and, purchased with much suffering, have placed your names in the front rank of benefactors to mankind. Last on the list, it must be cheering to you to know that, under Divine Providence having accomplished the difficult and hazardous undertaking entrusted to you, you have thus prepared the way for future explorers still further to disclose to us the characteristics and resources of our island continent. We feel pride in being the first representatives of an Australian city to express to you their thanks and admiration of the energy, ski