In [None]:
import requests
import json
from json import JSONDecodeError
import pandas as pd
from urllib.parse import urlparse
import os
import re
import time

In [18]:
# The Queensland data portal has a habit of dropping connections, 
# so this cell sets up Requests to retry in case of a dropout.
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

logging.basicConfig(level=logging.DEBUG)

s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('http://', HTTPAdapter(max_retries=retries))

In [19]:
portals = [
    {
        'name': 'data.qld.gov.au',
        'api_url': 'https://data.qld.gov.au/api/action/',
        'orgs': [],
        'tags': ['State Library of Queensland', 'Queensland State Archives'],
        'queries': ['Queensland Museum']
        'groups': [],
        'base_url': 'https://data.qld.gov.au/dataset/',
        'package_ids': [
            '91efa00c-b982-4df9-8735-f9a2d3bfdd34'
        ]
    },
    {
        'name': 'data.gov.au',
        'api_url': 'https://data.gov.au/api/3/action/',
        'orgs': [
            'aiatsis',
            'nationallibraryofaustralia',
            'linctasmania',
            'slq',
            'statelibraryofnewsouthwales',
            'statelibraryofsouthaustralia',
            'statelibraryofwesternaustralia',
            'nationalarchivesofaustralia',
            'national-portrait-gallery'
        ],
        'groups': [],
        'base_url': 'https://data.gov.au/dataset/',
        'package_ids': [
            '9849aa7f-e316-426e-8ab5-74658a62c7e6',
            '2f8d50dd-da9f-4c71-b278-8d787c1114fd',
            'c737c490-e962-4740-aced-76705386bfa2',
            '35eda988-6bca-47b5-b4fc-54dd84d375ae',
            '18cd3523-a1fc-4b21-b4a8-0f3f3e267a6c',
            'dd448c4c-c54d-4362-9450-bd0af4b03419'
        ]
    },
    {
        'name': 'data.sa.gov.au',
        'api_url': 'https://data.sa.gov.au/data/api/3/action/',
        'orgs': [    
            'state-library-of-south-australia',
            'mount-gambier-library',
            'state-records',
            'history-sa'
        ],
        'groups': [],
        'base_url': 'https://data.sa.gov.au/data/dataset/',
        'package_ids': []
    },
    {
        'name': 'data.nsw.gov.au',
        'api_url': 'https://data.nsw.gov.au/data/api/3/action/',
        'orgs': [
            'state-library-of-nsw',
            'state-archives-nsw',
            'maas'
        ],
        'groups': [],
        'base_url': 'https://data.nsw.gov.au/dataset/',
        'package_ids': [
            'c424f394-c952-442f-b462-3ba6fe9ba8d3',
            '4e57d134-79e9-42ad-a0a9-83fc91e1091c'
        ]
    },
    {
        'name': 'data.wa.gov.au',
        'api_url': 'https://catalogue.data.wa.gov.au/api/3/action/',
        'orgs': [
            'state-library-of-western-australia',
            'state-records-office-of-western-australia'
        ],
        'groups': [],
        'base_url': 'https://data.wa.gov.au/dataset/',
        'package_ids': []
    },
    {
        'name': 'data.vic.gov.au',
        'api_url': 'https://www.data.vic.gov.au/api/action/',
        'orgs': [
            'state-library-of-victoria'
        ],
        'groups': [],
        'base_url': 'https://www.data.vic.gov.au/data/dataset/',
        'package_ids': []
    },
]

In [20]:
def get_value(field):
    '''
    Sometimes values are strings and sometimes objects in strings.
    Get string values.
    '''
    try:
        s = field.replace("u'", "'").replace("'", '"')
        j = json.loads(s)
        value = j['name']
    except JSONDecodeError:
        value = field
    except AttributeError:
        value = None
    return value

In [21]:
def check_for_duplicates(organisation, created, url):
    '''
    Not using this anymore...
    '''
    parsed = urlparse(url)
    file_name = os.path.basename(parsed.path)
    for resource in resources:
        if organisation == resource['organisation'] and created == resource['created'] and re.search('\/{}$'.format(file_name), resource['url']):
            print(file_name)
            return True # It's a duplicate
    return False

In [22]:
def fix_github_links(url):
    '''
    Make sure github links point to downloadable files.
    '''
    return url.replace('//github.com', '//raw.githubusercontent.com').replace('/master', '')

In [23]:
def check_http_status(url):
    '''
    Do a HEAD request of downloadable datasets to check if they're still there.
    '''
    response = s.head(url, allow_redirects=True)
    return response.status_code

In [24]:
def get_format(resource):
    # First try getting file extension
    try:
        url = fix_github_links(resource['url'])
        file_format = re.search('\.([a-zA-Z]+)$', url).group(1)
    # If that fails just use the supplied value (which may be dodgy)
    except AttributeError:
        file_format = resource['format'].lower()
    return file_format

In [25]:
def get_package_data(package_id, portal):
    '''
    Given a package id and a portal, download details of all associated datasets/
    '''
    resources = []
    api_url = portal['api_url']
    url = '{}package_show?id={}'.format(api_url, package_id)
    print(url)
    response = s.get(url)
    package_data = response.json()
    title = package_data['result']['title']
    organisation = package_data['result']['organization']['title']
    author = get_value(package_data['result']['author'])
    try:
        date_from = package_data['result']['temporal_coverage_from']
    except KeyError:
        date_from = ''
    try:
        date_to = package_data['result']['temporal_coverage_to']
    except KeyError:
        date_to = ''
    for resource in package_data['result']['resources']:
        dataset = {}
        resource_url = fix_github_links(resource['url'])
        dataset['source'] = portal['name']
        dataset['organisation'] = organisation
        dataset['author'] = author
        dataset['package_title'] = title
        dataset['file_title'] = resource['name']
        dataset['description'] = resource['description']
        dataset['created'] = resource['created']
        dataset['last_modified'] = resource['created']
        dataset['date_from'] = date_from
        dataset['date_to'] = date_to
        dataset['licence'] = package_data['result']['license_title']
        dataset['url'] = resource_url
        dataset['info'] = portal['base_url'] + package_id
        dataset['format'] = get_format(resource)
        dataset['status'] = check_http_status(resource_url)
        resources.append(dataset)
        time.sleep(0.5)
    return resources

In [26]:
def get_packages_by_org(portal):   
    resources = []
    api_url = portal['api_url']
    for org in portal['orgs']:
        url = '{}organization_show?id={}&include_datasets=True'.format(api_url, org)
        print(url)
        response = s.get(url)
        data = response.json()
        for package in data['result']['packages']:
            resources.extend(get_package_data(package['id'], portal))
    return resources

In [27]:
def get_packages_by_group(portal):   
    resources = []
    api_url = portal['api_url']
    for group in portal['groups']:
        url = '{}group_show?id={}&include_datasets=True'.format(api_url, group)
        print(url)
        response = s.get(url)
        data = response.json()
        for package in data['result']['packages']:
            resources.extend(get_package_data(package['id'], portal))
    return resources

In [None]:
def get_packages_by_tag(portal):   
    resources = []
    api_url = portal['api_url']
    for group in portal['groups']:
        url = '{}package_search?fq=tags:{}&include_datasets=True'.format(api_url, tag)
        print(url)
        response = s.get(url)
        data = response.json()
        for package in data['result']['packages']:
            resources.extend(get_package_data(package['id'], portal))
    return resources

In [None]:
resources = []
for portal in portals:
    resources.extend(get_packages_by_org(portal))
    resources.extend(get_packages_by_group(portal))
    for package_id in portal['package_ids']:
        resources.extend(get_package_data(package_id, portal))

In [29]:
df = pd.DataFrame(resources)
df = df[['source', 'organisation', 'author', 'created', 'last_modified', 'package_title', 'file_title', 'date_from', 'date_to', 'description', 'format', 'licence', 'info', 'url', 'status']]
df = df.sort_values(by=['organisation', 'package_title', 'file_title'])
df

Unnamed: 0,source,organisation,author,created,last_modified,package_title,file_title,date_from,date_to,description,format,licence,info,url,status
210,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:28:09.245396,2014-06-25T03:28:09.245396,Black Tracker files index,Black Tracker files index,,,This dataset contains names of the Black Track...,csv,Other (Open),https://data.qld.gov.au/dataset/d4eae643-937c-...,https://data.qld.gov.au/dataset/d4eae643-937c-...,200
211,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:36:16.835616,2014-06-25T03:36:16.835616,Correspondence relating to Aboriginal and Torr...,Correspondence relating to Aboriginal and Torr...,,,This dataset relates to indexes created for co...,csv,Other (Open),https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,200
212,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:40:34.003513,2014-06-25T03:40:34.003513,Correspondence relating to Aboriginal and Torr...,Correspondence relating to Aboriginal and Torr...,,,This resource contains the explanatory notes f...,pdf,Other (Open),https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,200
209,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T06:25:06.236469,2014-06-25T06:25:06.236469,Index of census returns of Aboriginal and Torr...,Index of census returns of Aboriginal and Torr...,,,This dataset contains an index of census retur...,csv,Other (Open),https://data.qld.gov.au/dataset/deee57ae-b8c4-...,https://data.qld.gov.au/dataset/deee57ae-b8c4-...,200
687,data.nsw.gov.au,Australian Museum,Vanessa Finney,2013-11-21T23:41:24.250732,2013-11-21T23:41:24.250732,Australian Museum: Scott Sisters collection an...,Scott Sisters collection and science data,,,Collection data from the 1800s and current sci...,"csv, json, web services",Creative Commons Attribution,https://data.nsw.gov.au/dataset/4e57d134-79e9-...,http://australianmuseum.net.au/Scott-Sisters-c...,200
686,data.nsw.gov.au,City of Sydney,City of Sydney,2016-07-27T15:06:56.938614,2016-07-27T15:06:56.938614,City of Sydney Collections,City of Sydney - Collections website link,,,http://www.cityofsydney.nsw.gov.au/learn/searc...,website link,Creative Commons Attribution,https://data.nsw.gov.au/dataset/c424f394-c952-...,http://www.cityofsydney.nsw.gov.au/learn/searc...,200
98,data.qld.gov.au,Environment and Science,opendata@qm.qld.gov.au,2014-06-25T05:09:42.879738,2014-06-25T05:09:42.879738,Queensland Museum collection of historical obj...,Queensland Museum collection of historical obj...,,,A CSV file containing records of a selection o...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/bef12bdd-27de-...,http://www.qm.qld.gov.au/microsites/data/histo...,200
5,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2013-05-29T00:14:05.440822,2013-05-29T00:14:05.440822,State Library of Queensland - British convict ...,State Library of Queensland - British convict ...,,,This dataset contains details of convicts tran...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/458eb59f-e5f1-...,https://data.gov.au/dataset/british-convict-tr...,200
181,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2013-05-29T00:21:04.693875,2013-05-29T00:21:04.693875,State Library of Queensland - Digitised maps,State Library of Queensland - Digitised maps,,,"A collection of digitised, out of copyright ma...",csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/befaf9ae-6100-...,https://data.gov.au/dataset/digitised-maps,200
56,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2018-03-29T05:01:04.829038,2018-03-29T05:01:04.829038,State Library of Queensland - Licensed Victual...,Explanatory Information,,,This document provides information about the c...,docx,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/96c624e5-c6fd-...,https://data.gov.au/dataset/slq-licensed-victu...,200


In [30]:
df['format'].value_counts()

csv                           499
xml                            66
wms                            35
xlsx                           27
json                           25
docx                           17
xls                            16
txt                            15
zip                            14
doc                            12
api                            12
geojson                         8
other                           7
data                            6
pdf                             4
jpg                             2
kml                             2
website link                    2
rtf                             2
html                            2
rss                             2
url                             1
jpeg                            1
plain                           1
.txt                            1
wfs                             1
kmz                             1
xsd                             1
app                             1
mp3           

In [31]:
df['status'].value_counts()

200    746
404     39
400      3
403      2
Name: status, dtype: int64

In [32]:
df['source'].value_counts()

data.gov.au        271
data.qld.gov.au    214
data.sa.gov.au     173
data.wa.gov.au      96
data.nsw.gov.au     30
data.vic.gov.au      6
Name: source, dtype: int64

In [33]:
df['organisation'].value_counts()

State Library of South Australia                      121
Housing and Public Works                              117
State Library of Western Australia                    114
Natural Resources, Mines and Energy                    79
State Library of Queensland                            78
LINC Tasmania                                          74
State Records Office of Western Australia              41
State Records                                          41
South Australian Governments                           26
State Library of New South Wales                       21
State Archives NSW                                     19
Environment and Science                                14
History Trust of South Australia                       12
State Library of Victoria                               6
State Library of NSW                                    6
National Library of Australia                           5
Aboriginal and Torres Strait Islander Partnerships      4
Museum of Appl

In [None]:
df.to_csv('gov-glam-datasets-all-formats.csv', index=False)

In [34]:
csvs = df.loc[df['format'] == 'csv']

In [35]:
csvs

Unnamed: 0,source,organisation,author,created,last_modified,package_title,file_title,date_from,date_to,description,format,licence,info,url,status
210,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:28:09.245396,2014-06-25T03:28:09.245396,Black Tracker files index,Black Tracker files index,,,This dataset contains names of the Black Track...,csv,Other (Open),https://data.qld.gov.au/dataset/d4eae643-937c-...,https://data.qld.gov.au/dataset/d4eae643-937c-...,200
211,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T03:36:16.835616,2014-06-25T03:36:16.835616,Correspondence relating to Aboriginal and Torr...,Correspondence relating to Aboriginal and Torr...,,,This dataset relates to indexes created for co...,csv,Other (Open),https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,https://data.qld.gov.au/dataset/fa8fe6cd-8560-...,200
209,data.qld.gov.au,Aboriginal and Torres Strait Islander Partners...,opendata@datsima.qld.gov.au,2014-06-25T06:25:06.236469,2014-06-25T06:25:06.236469,Index of census returns of Aboriginal and Torr...,Index of census returns of Aboriginal and Torr...,,,This dataset contains an index of census retur...,csv,Other (Open),https://data.qld.gov.au/dataset/deee57ae-b8c4-...,https://data.qld.gov.au/dataset/deee57ae-b8c4-...,200
98,data.qld.gov.au,Environment and Science,opendata@qm.qld.gov.au,2014-06-25T05:09:42.879738,2014-06-25T05:09:42.879738,Queensland Museum collection of historical obj...,Queensland Museum collection of historical obj...,,,A CSV file containing records of a selection o...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/bef12bdd-27de-...,http://www.qm.qld.gov.au/microsites/data/histo...,200
5,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2013-05-29T00:14:05.440822,2013-05-29T00:14:05.440822,State Library of Queensland - British convict ...,State Library of Queensland - British convict ...,,,This dataset contains details of convicts tran...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/458eb59f-e5f1-...,https://data.gov.au/dataset/british-convict-tr...,200
181,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2013-05-29T00:21:04.693875,2013-05-29T00:21:04.693875,State Library of Queensland - Digitised maps,State Library of Queensland - Digitised maps,,,"A collection of digitised, out of copyright ma...",csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/befaf9ae-6100-...,https://data.gov.au/dataset/digitised-maps,200
57,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2018-03-29T04:59:14.475345,2018-03-29T04:59:14.475345,State Library of Queensland - Licensed Victual...,Licensed Victuallers Index,,,This index was prepared by volunteers transcri...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/96c624e5-c6fd-...,https://data.gov.au/dataset/slq-licensed-victu...,200
182,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2015-07-02T22:45:10.687859,2015-07-02T22:45:10.687859,State Library of Queensland - Photographs 1914...,SLQ - Photographs 1914-1918,,,Out of copyright photographs and metadata from...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/6b3dc1c0-4ca3-...,https://data.gov.au/dataset/slq-photographs-19...,200
180,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2013-05-29T00:52:33.021144,2013-05-29T00:52:33.021144,State Library of Queensland - Picture Queensland,State Library of Queensland - Picture Queensland,,,40 000 out of copyright photographs from the p...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/4295bf38-708a-...,https://data.gov.au/dataset/picture-queensland,200
6,data.qld.gov.au,Environment and Science,opendata@slq.qld.gov.au,2014-09-26T00:20:47.182092,2014-09-26T00:20:47.182092,State Library of Queensland - Portraits of sol...,State Library of Queensland - Portraits of sol...,,,From State Library of Queensland - information...,csv,Creative Commons Attribution 4.0,https://data.qld.gov.au/dataset/91efa00c-b982-...,https://data.gov.au/dataset/portraits-of-soldi...,200


In [36]:
csvs['status'].value_counts()

200    493
404      4
400      2
Name: status, dtype: int64

In [None]:
csvs.to_csv('gov-glam-datasets.csv', index=False)