In [1]:
import requests
import json
from json import JSONDecodeError
import pandas as pd
from urllib.parse import urlparse
import os
import re

In [2]:
portals = [
    {
        'name': 'data.gov.au',
        'api_url': 'https://data.gov.au/api/3/action/',
        'orgs': [
            'aiatsis',
            'nationallibraryofaustralia',
            'linctasmania',
            'slq',
            'statelibraryofnewsouthwales',
            'statelibraryofsouthaustralia',
            'statelibraryofwesternaustralia',
            'nationalarchivesofaustralia'
        ],
        'groups': [],
        'base_url': 'https://data.gov.au/dataset/',
        'package_ids': [
            '9849aa7f-e316-426e-8ab5-74658a62c7e6',
            '2f8d50dd-da9f-4c71-b278-8d787c1114fd',
            'c737c490-e962-4740-aced-76705386bfa2',
            '35eda988-6bca-47b5-b4fc-54dd84d375ae',
            '18cd3523-a1fc-4b21-b4a8-0f3f3e267a6c',
            'dd448c4c-c54d-4362-9450-bd0af4b03419'
        ]
    },
    {
        'name': 'data.sa.gov.au',
        'api_url': 'https://data.sa.gov.au/data/api/3/action/',
        'orgs': [    
            'state-library-of-south-australia',
            'mount-gambier-library',
            'state-records',
            'history-sa'
        ],
        'groups': [],
        'base_url': 'https://data.sa.gov.au/data/dataset/',
        'package_ids': []
    },
    {
        'name': 'data.qld.gov.au',
        'api_url': 'https://data.qld.gov.au/api/action/',
        'orgs': [],
        'groups': ['historical'],
        'base_url': 'https://data.qld.gov.au/dataset/',
        'package_ids': [
            '91efa00c-b982-4df9-8735-f9a2d3bfdd34'
        ]
    },
    {
        'name': 'data.nsw.gov.au',
        'api_url': 'https://data.nsw.gov.au/data/api/3/action/',
        'orgs': [
            'state-library-of-nsw',
            'state-archives-nsw',
            'maas'
        ],
        'groups': [],
        'base_url': 'https://data.nsw.gov.au/dataset/',
        'package_ids': [
            'c424f394-c952-442f-b462-3ba6fe9ba8d3',
            '4e57d134-79e9-42ad-a0a9-83fc91e1091c'
        ]
    },
    {
        'name': 'data.wa.gov.au',
        'api_url': 'https://catalogue.data.wa.gov.au/api/3/action/',
        'orgs': [
            'state-library-of-western-australia',
            'state-records-office-of-western-australia'
        ],
        'groups': [],
        'base_url': 'https://data.wa.gov.au/dataset/',
        'package_ids': []
    },
    {
        'name': 'data.vic.gov.au',
        'api_url': 'https://www.data.vic.gov.au/api/action/',
        'orgs': [
            'state-library-of-victoria'
        ],
        'groups': [],
        'base_url': 'https://www.data.vic.gov.au/data/dataset/',
        'package_ids': []
    },
]

In [3]:
def get_value(field):
    '''
    Sometimes values are strings and sometimes objects in strings.
    Get string values.
    '''
    try:
        s = field.replace("u'", "'").replace("'", '"')
        j = json.loads(s)
        value = j['name']
    except JSONDecodeError:
        value = field
    except AttributeError:
        value = None
    return value

In [4]:
def check_for_duplicates(organisation, created, url):
    parsed = urlparse(url)
    file_name = os.path.basename(parsed.path)
    for resource in resources:
        if organisation == resource['organisation'] and created == resource['created'] and re.search('\/{}$'.format(file_name), resource['url']):
            print(file_name)
            return True # It's a duplicate
    return False

In [5]:
def fix_github_links(url):
    '''
    Make sure github links point to downloadable files.
    '''
    return url.replace('github.com', 'raw.githubusercontent.com').replace('/master', '')

In [6]:
def get_package_data(package_id, api_url):
    resources = []
    url = '{}package_show?id={}'.format(api_url, package_id)
    print(url)
    response = requests.get(url)
    package_data = response.json()
    title = package_data['result']['title']
    organisation = package_data['result']['organization']['title']
    author = get_value(package_data['result']['author'])
    try:
        date_from = package_data['result']['temporal_coverage_from']
    except KeyError:
        date_from = ''
    try:
        date_to = package_data['result']['temporal_coverage_to']
    except KeyError:
        date_to = ''
    for resource in package_data['result']['resources']:
        if resource['url'][-4:] == '.csv' and not check_for_duplicates(organisation, resource['created'], resource['url']):
            dataset = {}
            dataset['organisation'] = organisation
            dataset['author'] = author
            dataset['package_title'] = title
            dataset['file_title'] = resource['name']
            dataset['description'] = resource['description']
            dataset['created'] = resource['created']
            dataset['last_modified'] = resource['created']
            dataset['date_from'] = date_from
            dataset['date_to'] = date_to
            dataset['licence'] = package_data['result']['license_title']
            dataset['url'] = fix_github_links(resource['url'])
            dataset['info'] = portal['base_url'] + package_id
            resources.append(dataset)
    return resources

In [7]:
def get_packages_by_org(org_list, api_url):   
    resources = []
    for org in org_list:
        url = '{}organization_show?id={}&include_datasets=True'.format(api_url, org)
        print(url)
        response = requests.get(url)
        data = response.json()
        for package in data['result']['packages']:
            resources.extend(get_package_data(package['id'], api_url))
    return resources

In [8]:
def get_packages_by_group(group_list, api_url):   
    resources = []
    for org in group_list:
        url = '{}group_show?id={}&include_datasets=True'.format(api_url, org)
        print(url)
        response = requests.get(url)
        data = response.json()
        for package in data['result']['packages']:
            resources.extend(get_package_data(package['id'], api_url))
    return resources

In [9]:
resources = []
for portal in portals:
    resources.extend(get_packages_by_org(portal['orgs'], portal['api_url']))
    resources.extend(get_packages_by_group(portal['groups'], portal['api_url']))
    for package_id in portal['package_ids']:
        resources.extend(get_package_data(package_id, portal['api_url']))

https://data.gov.au/api/3/action/organization_show?id=aiatsis&include_datasets=True
https://data.gov.au/api/3/action/organization_show?id=nationallibraryofaustralia&include_datasets=True
https://data.gov.au/api/3/action/package_show?id=c53ab05a-03f8-4414-9f31-68359e8ebdac
https://data.gov.au/api/3/action/package_show?id=7f03ac78-7210-4702-8cd5-f3e373f4097b
https://data.gov.au/api/3/action/package_show?id=86bb09b6-acc9-4666-b464-9cf3fe59080b
https://data.gov.au/api/3/action/package_show?id=b247e1c0-608d-495d-8583-8bc9de63c4ee
https://data.gov.au/api/3/action/organization_show?id=linctasmania&include_datasets=True
https://data.gov.au/api/3/action/package_show?id=575f1bb4-87ed-4f08-990d-3cfcd06ccd9c
https://data.gov.au/api/3/action/package_show?id=58a9a8d7-01e0-43df-9f91-06ccdae5c634
https://data.gov.au/api/3/action/package_show?id=b0627a17-6783-4c18-a83e-11aa7e22e50e
https://data.gov.au/api/3/action/package_show?id=3e1eb60e-5ad1-4a20-8cb9-32be085e56f0
https://data.gov.au/api/3/action/pac

https://data.sa.gov.au/data/api/3/action/organization_show?id=state-library-of-south-australia&include_datasets=True
https://data.sa.gov.au/data/api/3/action/package_show?id=3070a7b7-f76d-432d-8c9a-f9daf6077efa
https://data.sa.gov.au/data/api/3/action/package_show?id=8d057a25-324d-47d8-a841-2d0a8b60adab
https://data.sa.gov.au/data/api/3/action/package_show?id=a1fce9bf-9afe-426f-a4b8-e2661e611436
https://data.sa.gov.au/data/api/3/action/package_show?id=49266c9b-ae27-47bc-8e23-866a2d027403
https://data.sa.gov.au/data/api/3/action/package_show?id=62c4e993-8d5b-48a4-9d11-9e9dd4708455
https://data.sa.gov.au/data/api/3/action/package_show?id=e52b0259-2b57-4121-8e04-be333025fb3b
https://data.sa.gov.au/data/api/3/action/package_show?id=387e2e32-f30c-4f9b-9d8f-848d06dfe18d
https://data.sa.gov.au/data/api/3/action/package_show?id=97f60b3e-9b50-4cc2-9960-73288d076c51
https://data.sa.gov.au/data/api/3/action/package_show?id=54c50844-6653-4704-8dc8-5f5c8f5f14eb
https://data.sa.gov.au/data/api/3/act

https://data.qld.gov.au/api/action/package_show?id=4f5b4c2f-4578-4737-b199-53347b60345d
https://data.qld.gov.au/api/action/package_show?id=ca2bf49a-6f27-47b9-8be8-7f09649b97a7
https://data.qld.gov.au/api/action/package_show?id=96c624e5-c6fd-4e26-8c07-90a236185437
https://data.qld.gov.au/api/action/package_show?id=261c3079-d154-4797-8211-289eb16e6d18
https://data.qld.gov.au/api/action/package_show?id=359f31cd-92ca-453c-a539-09dc261856bb
https://data.qld.gov.au/api/action/package_show?id=75d36b23-4510-463e-aa85-0fac1d7b3899
https://data.qld.gov.au/api/action/package_show?id=40b98318-11ab-4c1b-8e41-9b11a2db6071
https://data.qld.gov.au/api/action/package_show?id=959d611f-a9cf-4e80-affa-1ebd978cafea
https://data.qld.gov.au/api/action/package_show?id=c1d5bf09-e06d-49ab-9d84-290840574971
https://data.qld.gov.au/api/action/package_show?id=c3746e58-0ba7-4b18-888c-dab4840937f2
https://data.qld.gov.au/api/action/package_show?id=ba182873-e8a7-45e1-b0e7-e0b6671fa1a9
https://data.qld.gov.au/api/acti

https://data.nsw.gov.au/data/api/3/action/package_show?id=1f776129-6d97-4225-afc0-b9fc1833f815
https://data.nsw.gov.au/data/api/3/action/package_show?id=5d45437c-d07a-4977-95ab-e53fb86f02c1
https://data.nsw.gov.au/data/api/3/action/package_show?id=78fe0108-2ae2-4918-b359-48a3bb1c31dd
https://data.nsw.gov.au/data/api/3/action/package_show?id=acdd01d0-d700-465c-a183-735c4f139ca0
https://data.nsw.gov.au/data/api/3/action/package_show?id=32f00807-077b-45f9-978b-df4f688b1b17
https://data.nsw.gov.au/data/api/3/action/package_show?id=e2c36616-36db-4bb3-a907-87db836481f0
https://data.nsw.gov.au/data/api/3/action/organization_show?id=maas&include_datasets=True
https://data.nsw.gov.au/data/api/3/action/package_show?id=bf5a60c5-3340-4cd7-9282-d4db9819e071
https://data.nsw.gov.au/data/api/3/action/package_show?id=bf9df234-7890-4907-94f6-e7872c8f4258
https://data.nsw.gov.au/data/api/3/action/package_show?id=91829aff-594e-4cdf-bc76-bf639335fa26
https://data.nsw.gov.au/data/api/3/action/package_show?

In [10]:
df = pd.DataFrame(resources)
df = df[['organisation', 'author', 'created', 'last_modified', 'package_title', 'file_title', 'date_from', 'date_to', 'description', 'licence', 'info', 'url']]
df = df.sort_values(by=['organisation', 'package_title', 'file_title'])
df

Unnamed: 0,organisation,author,created,last_modified,package_title,file_title,date_from,date_to,description,licence,info,url
427,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:28:09.245396,2014-06-25T03:28:09.245396,Black Tracker files index,Black Tracker files index,,,This dataset contains names of the Black Track...,Other (Open),https://data.qld.gov.au/dataset/d4eae643-937c-...,https://data.qld.gov.au/dataset/d4eae643-937c-...
428,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:36:16.835616,2014-06-25T03:36:16.835616,Correspondence relating to Aboriginal and Torr...,Correspondence relating to Aboriginal and Torr...,,,This dataset relates to indexes created for co...,Other (Open),https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,https://data.qld.gov.au/dataset/fa8fe6cd-8560-...
426,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T06:25:06.236469,2014-06-25T06:25:06.236469,Index of census returns of Aboriginal and Torr...,Index of census returns of Aboriginal and Torr...,,,This dataset contains an index of census retur...,Other (Open),https://data.qld.gov.au/dataset/deee57ae-b8c4-...,https://data.qld.gov.au/dataset/deee57ae-b8c4-...
320,Environment and Science,opendata@qm.qld.gov.au,2014-06-25T05:09:42.879738,2014-06-25T05:09:42.879738,Queensland Museum collection of historical obj...,Queensland Museum collection of historical obj...,,,A CSV file containing records of a selection o...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/bef12bdd-27de-...,http://www.qm.qld.gov.au/microsites/data/histo...
390,Housing and Public Works,info@archives.qld.gov.au,2015-07-08T05:51:31.632805,2015-07-08T05:51:31.632805,Aboriginal War Census 1915 to 1916,Aboriginal War Census 1915 to 1916,,,This open data file records Aboriginal peoples...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/74824ca8-2ba5-...,https://data.qld.gov.au/dataset/74824ca8-2ba5-...
394,Housing and Public Works,info@archives.qld.gov.au,2016-08-16T23:29:26.628723,2016-08-16T23:29:26.628723,Army Reservist payments 1909 to 1912,Army Reservist payments 1909 to 1912,,,This open data file alphabetically lists the n...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/696f4c13-a846-...,https://data.qld.gov.au/dataset/696f4c13-a846-...
334,Housing and Public Works,info@archives.qld.gov.au,2013-06-25T20:34:24.588785,2013-06-25T20:34:24.588785,Assistant Immigration Agent Maryborough 1875 t...,"Rations issued to immigrants, Maryborough 1875...",,,This open data file lists the names of immigra...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/7ccf8996-505a-...,https://data.qld.gov.au/dataset/7ccf8996-505a-...
332,Housing and Public Works,info@archives.qld.gov.au,2013-06-25T20:22:54.184972,2013-06-25T20:22:54.184972,Assistant Immigration Agent Maryborough 1884 t...,"Immigrants nominated for passage, Maryborough ...",,,This open data file lists the names of immigra...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/8a0928d0-60c2-...,https://data.qld.gov.au/dataset/8a0928d0-60c2-...
333,Housing and Public Works,info@archives.qld.gov.au,2013-06-25T20:22:54.185005,2013-06-25T20:22:54.185005,Assistant Immigration Agent Maryborough 1884 t...,"Nominated immigrants, Maryborough 1904 to 1907",,,This open data file lists the names of immigra...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/8a0928d0-60c2-...,https://data.qld.gov.au/dataset/8a0928d0-60c2-...
297,Housing and Public Works,info@archives.qld.gov.au,2013-03-05T23:30:57.308546,2013-03-05T23:30:57.308546,Assisted immigration 1848 to 1912,Assisted immigration 1848 to 1912 - A,,,This open data file lists the names of assiste...,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/ba182873-e8a7-...,https://data.qld.gov.au/dataset/ba182873-e8a7-...


In [11]:
df.to_csv('gov-glam-datasets.csv', index=False)