# Harvest GLAM datasets from data.gov.au

This is a quick attempt to harvest datasets published by GLAM institutions using the new [data.gov.au API](https://data.gov.au/api/v0/apidocs/index.html).

To create the list of organisations, I searched the organisations on the [data.gov.au site](https://data.gov.au/) for 'library', 'archives', 'records', and 'museum'. I noticed that Queensland State Archives isn't included as an organisation, even though it's used as a tag, so I added it in as a free-text query (as I did for the Public Record Office Victoria, which also has no entries under organisations). There are inconsistencies in the way organisations are listed, so it's possible I've missed some.

In [1]:
import requests
import json
from json import JSONDecodeError
import pandas as pd
from urllib.parse import urlparse
from IPython.display import display, FileLink
import os
import re
import time
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Retry policy: up to 5 attempts with backoff whenever the server answers
# with a 502, 503 or 504 status.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
)

# Shared session used for every API request; the retry policy is applied
# to both plain and secure connections.
s = requests.Session()
for prefix in ('http://', 'https://'):
    s.mount(prefix, HTTPAdapter(max_retries=retries))

In [2]:
# Endpoint of the data.gov.au dataset search API
api_url = 'https://data.gov.au/api/v0/search/datasets'
# Organisation names, sent one at a time as the `publisher` search parameter.
# Variant spellings of the same institution (e.g. 'State Library of NSW' and
# 'State Library of New South Wales') are listed as separate entries.
organisations = [
    'NSW State Archives',
    'National Archives of Australia',
    'Libraries Tasmania',
    'State Records',
    'State Records Office of Western Australia',
    'State Library of Victoria',
    'State Library of NSW',
    'Mount Gambier Library',
    'National Library of Australia',
    'State Library of Queensland',
    'State Library of Western Australia',
    'State Library of South Australia',
    'State Library of New South Wales',
    'Western Australian Museum',
    'South Australian Museum',
    'Museum of Applied Arts and Sciences',
    'Tasmanian Museum and Art Gallery',
    'History Trust of South Australia'
]
# Institutions with no entries under organisations: these are harvested with
# the `query` search parameter instead, and the query text (minus any double
# quotes) is recorded as the publisher.
queries = [
    '"Queensland State Archives"',
    'PROV Public Record Office'
]

In [3]:
def safe_get(dct, *keys):
    """
    Walk down a nested structure one key at a time.

    Parameters:
        dct: the object to start from (typically a dict parsed from JSON).
        *keys: the keys to follow, outermost first.

    Returns:
        The value found at the end of the path, or None if a key is missing
        or an intermediate value can't be indexed. With no keys, `dct` itself
        is returned.
    """
    current = dct
    try:
        for step in keys:
            current = current[step]
    except (KeyError, TypeError):
        # Missing key, or tried to index into something like None or a string key on a list
        return None
    return current

def process_dataset(dataset, query=None):
    """
    Flatten a dataset record into one dict per downloadable file (distribution).

    Parameters:
        dataset: a dataset dict taken from the search API's 'dataSets' list.
        query: the search query that found this dataset, if any. When supplied,
            it is recorded (minus surrounding double quotes) as the publisher
            in place of the dataset's own publisher name.

    Returns:
        A list of dicts, each combining dataset-level and file-level metadata.
        Fields that are missing from the record are set to None. The list is
        empty if the dataset has no distributions.
    """
    # A missing or null 'distributions' value just means there are no files to
    # list -- it shouldn't bring the whole harvest down with a KeyError.
    distributions = dataset.get('distributions')
    if not distributions:
        return []

    # Dataset-level values are the same for every file, so look them up once.
    if query:
        publisher = query.strip('"')
    else:
        publisher = safe_get(dataset, 'publisher', 'name')
    dataset_fields = {
        'dataset_title': safe_get(dataset, 'title'),
        'publisher': publisher,
        'dataset_issued': safe_get(dataset, 'issued'),
        'dataset_modified': safe_get(dataset, 'modified'),
        'dataset_description': safe_get(dataset, 'description'),
        'source': safe_get(dataset, 'catalog'),
        'info_url': safe_get(dataset, 'landingPage'),
        'start_date': safe_get(dataset, 'temporal', 'start', 'date'),
        'end_date': safe_get(dataset, 'temporal', 'end', 'date')
    }

    datafiles = []
    for dist in distributions:
        # Copy the shared fields so each row is an independent dict
        datafile = dict(dataset_fields)
        datafile.update({
            'file_title': safe_get(dist, 'title'),
            'download_url': safe_get(dist, 'downloadURL'),
            'format': safe_get(dist, 'format'),
            'file_description': safe_get(dist, 'description'),
            'file_issued': safe_get(dist, 'issued'),
            'file_modified': safe_get(dist, 'modified'),
            'licence': safe_get(dist, 'license', 'name')
        })
        datafiles.append(datafile)
    return datafiles

def _fetch_datasets(params, page_size=100, timeout=60):
    """
    Fetch all the datasets matching a search, requesting one page at a time.

    Parameters:
        params: dict of search parameters, e.g. {'publisher': ...} or {'query': ...}.
        page_size: number of datasets to ask for in each request.
        timeout: seconds to wait for each response before giving up.

    Returns:
        A list of dataset dicts gathered from the 'dataSets' list of each page.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    datasets = []
    start = 0
    while True:
        page_params = dict(params, start=start, limit=page_size)
        response = s.get(api_url, params=page_params, timeout=timeout)
        print(response.url)
        # Fail with a clear HTTP error rather than trying to parse an error page
        response.raise_for_status()
        data = response.json()
        page = data.get('dataSets') or []
        datasets += page
        start += len(page)
        # 'hitCount' should be the total number of matches (per the API docs --
        # TODO confirm). If it's absent we stop after the first page, which is
        # what the harvest did before paging was added.
        if not page or start >= (data.get('hitCount') or 0):
            break
    return datasets


def harvest_datasets():
    """
    Harvest file details for every organisation and query in the lists above.

    Searches the API by publisher for each name in `organisations`, then by
    free-text query for each entry in `queries`, printing the URL of every
    request as it goes.

    Returns:
        A list of dicts, one per downloadable file, as built by process_dataset().
        For query results the query text is recorded as the publisher.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    datafiles = []
    for organisation in organisations:
        for dataset in _fetch_datasets({'publisher': organisation}):
            datafiles += process_dataset(dataset)
    for query in queries:
        for dataset in _fetch_datasets({'query': query}):
            datafiles += process_dataset(dataset, query=query)
    return datafiles

In [4]:
# Run the harvest; the URL of each API request is printed as it's made
datafiles = harvest_datasets()

https://data.gov.au/api/v0/search/datasets?publisher=NSW+State+Archives&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Archives+of+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Libraries+Tasmania&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Records&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Records+Office+of+Western+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Victoria&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+NSW&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Mount+Gambier+Library&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Library+of+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Queensland&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Western+Australia&limit=100
https://data.gov.au/

In [5]:
# Load the harvested records into a dataframe: one row per downloadable file
df = pd.DataFrame(datafiles)
# Preview the first five rows
df.head()

Unnamed: 0,dataset_description,dataset_issued,dataset_modified,dataset_title,download_url,end_date,file_description,file_issued,file_modified,file_title,format,info_url,licence,publisher,source,start_date
0,This index contains 763 entries from two serie...,2014-09-30T04:34:46Z,2016-07-20T12:10:21Z,Railway Employment Records,http://data.nsw.gov.au/data/storage/f/2014-09-...,,This dataset contains the following attributes...,2014-09-30T00:36:18Z,,Railway Employment Records,CSV,https://data.nsw.gov.au/data/dataset/1f776129-...,Creative Commons Attribution,NSW State Archives,New South Wales Government,
1,Researching deceased estates files before 1923...,2014-09-30T04:52:48Z,2016-07-20T12:09:20Z,"Deceased Estate Files, 1880-1923",https://data.nsw.gov.au/data/dataset/5d45437c-...,,This dataset contains the following attributes...,2014-09-30T00:55:53Z,,Deceased Estates,CSV,https://data.nsw.gov.au/data/dataset/5d45437c-...,Creative Commons Attribution,NSW State Archives,New South Wales Government,
2,The passage of the Returned Soldiers Settlemen...,2013-05-28T05:07:29Z,2016-06-16T01:18Z,Soldier Settlement Indexes,https://data.nsw.gov.au/data/dataset/78fe0108-...,,"For a description of this data, see: [Closer S...",2013-05-28T01:08:57Z,,"Closer Settlement Promotion files, 1913-",CSV,https://data.nsw.gov.au/data/dataset/78fe0108-...,Creative Commons Attribution,NSW State Archives,New South Wales Government,
3,The passage of the Returned Soldiers Settlemen...,2013-05-28T05:07:29Z,2016-06-16T01:18Z,Soldier Settlement Indexes,https://data.nsw.gov.au/data/dataset/78fe0108-...,,"For a description of this data, see this page:...",2013-05-28T01:09:42Z,,"Closer Settlement Transfer Registers, Jul 1919...",CSV,https://data.nsw.gov.au/data/dataset/78fe0108-...,Creative Commons Attribution,NSW State Archives,New South Wales Government,
4,The passage of the Returned Soldiers Settlemen...,2013-05-28T05:07:29Z,2016-06-16T01:18Z,Soldier Settlement Indexes,https://data.nsw.gov.au/data/dataset/78fe0108-...,,"For a description of this data, see this page:...",2013-05-28T01:10:39Z,,Closer Settlement and Returned Soldier’s Trans...,CSV,https://data.nsw.gov.au/data/dataset/78fe0108-...,Creative Commons Attribution,NSW State Archives,New South Wales Government,


In [6]:
# (number of files, number of metadata fields)
df.shape

(939, 16)

In [7]:
# Number of files in each format. The labels are used exactly as supplied, so a
# misspelling such as 'HMTL' is counted as a format of its own.
df['format'].value_counts()

CSV           584
XML            80
JSON           74
XLSX           60
DOCX           34
HTML           33
ZIP            14
PLAIN          14
API             9
GEOJSON         8
DATA            6
OTHER           4
KML             3
JPEG            2
RSS             2
JAVASCRIPT      1
APP             1
RDF             1
WMS             1
CSS             1
PDF             1
WFS             1
HMTL            1
Name: format, dtype: int64

In [8]:
# Number of files under each licence. Licence names aren't normalised, so the
# same licence family appears under several differently-worded labels.
df['licence'].value_counts()

Creative Commons Attribution                       246
Creative Commons Attribution 3.0 Australia         241
Creative Commons Attribution 4.0                   237
Creative Commons Attribution 4.0 International     144
Creative Commons Attribution 2.5 Australia          32
Creative Commons Attribution-NonCommercial          10
Other (Open)                                         5
notspecified                                         5
Creative Commons Attribution 3.0                     3
Creative Commons Attribution Share-Alike 4.0         3
Creative Commons Attribution Non-Commercial 4.0      2
Custom (Other)                                       1
Name: licence, dtype: int64

In [9]:
# Number of files per publisher. Name variants are counted separately, e.g.
# 'State Library of NSW' and 'State Library of New South Wales'.
df['publisher'].value_counts()

State Library of Queensland                  204
Queensland State Archives                    172
State Library of Western Australia           147
State Library of South Australia             140
Libraries Tasmania                            71
State Records                                 41
PROV Public Record Office                     33
South Australian Museum                       33
State Library of New South Wales              21
History Trust of South Australia              19
NSW State Archives                            19
State Records Office of Western Australia      7
Western Australian Museum                      6
State Library of NSW                           6
National Library of Australia                  5
State Library of Victoria                      5
National Archives of Australia                 3
Museum of Applied Arts and Sciences            3
Tasmanian Museum and Art Gallery               2
Mount Gambier Library                          2
Name: publisher, dtype: int64

In [10]:
# Save the complete harvest (all file formats) as a CSV and show a download link
all_formats_csv = 'glam_datasets_all_formats_from_datagovau.csv'
df.to_csv(all_formats_csv, index=False)
display(FileLink(all_formats_csv))

In [11]:
# Keep only the files whose format is recorded as CSV
csvs = df[df['format'].eq('CSV')]

In [12]:
# (number of CSV files, number of metadata fields)
csvs.shape

(584, 16)

In [13]:
# Number of CSV files per publisher
csvs['publisher'].value_counts()

State Library of Queensland                  185
Queensland State Archives                    122
State Library of Western Australia            76
State Library of South Australia              72
State Records                                 29
PROV Public Record Office                     26
Libraries Tasmania                            23
NSW State Archives                            17
State Library of New South Wales              10
South Australian Museum                        8
History Trust of South Australia               6
State Library of Victoria                      5
State Records Office of Western Australia      3
National Archives of Australia                 1
Mount Gambier Library                          1
Name: publisher, dtype: int64

In [14]:
# Save the CSV-only subset and show a download link
csv_only_csv = 'glam_datasets_csvs_from_datagovau.csv'
csvs.to_csv(csv_only_csv, index=False)
display(FileLink(csv_only_csv))

In [15]:
# Write results to a markdown file: a heading per publisher, a linked
# sub-heading per dataset, and a bullet per downloadable file.
orgs = df.sort_values(by=['publisher', 'dataset_title', 'dataset_modified']).groupby('publisher')
# The harvested text includes non-ASCII characters (curly apostrophes, for
# example), so set the encoding explicitly instead of relying on the platform default.
with open('glam_datasets_from_datagovau.md', 'w', encoding='utf-8') as md_file:
    for org, group in orgs:
        md_file.write('\n## {}\n'.format(org))
        # NOTE(review): groupby leaves out rows whose key is missing, so any
        # dataset without an info_url won't appear in this file -- confirm that's acceptable.
        for (dataset_title, info_url), files in group.groupby(['dataset_title', 'info_url']):
            md_file.write('\n### [{}]({})\n'.format(dataset_title, info_url))
            for row in files.itertuples():
                md_file.write('* [{}]({}) ({}, {})\n'.format(row.file_title, row.download_url, row.format, row.file_issued))
    