# Harvesting functions from the RecordSearch interface

This notebook attempts to extract information from the RecordSearch interface about the hierarchy of functions it uses to describe the work of government agencies.

[Previous explorations](https://timsherratt.org/research-notebook/aggregated-archives/notes/naa-functions/) have shown that the NAA's use of functions is rather inconsistent. All I'm doing here is finding out what functions RecordSearch itself says it is using. This may not be complete, but it seems like a useful starting point.

There are a few inconsistencies that I've tried to clean up. In particular, the hierarchy is broken in a number of places where a child term links up to a non-preferred term. In these cases I've replaced the non-preferred term with the preferred term.

I've also noticed that some 'narrower' terms don't have their own entries in the main list, so I've made sure that these are all added in.

I suspect that the majority of these terms are never used, but we'll save that question for another notebook...

In [115]:
from copy import deepcopy
import json
import os
import robobrowser
import re
from IPython.display import display, HTML, FileLink
from tqdm import tqdm_notebook
from recordsearch_tools.client import RSClient, RSAgencySearchClient

# Make sure there's somewhere to save data files -- creates a 'data' directory
# alongside the notebook, and does nothing if it already exists.
os.makedirs('data', exist_ok=True)

In [94]:
# Corrections for places where the RecordSearch hierarchy doesn't join up.
# Each key is a broader ('parent') term as displayed by RecordSearch, and each
# value is the term that should be used as the parent instead.
# A value of None means the parent is discarded altogether, so the child term
# ends up without a parent.
BROKEN_HIERARCHIES = {
    "australian defence forces (adf)": "defence forces",
    "immigration": "migration",
    "community protection": "customs",
    "security": "security and intelligence",
    "finance management": "financial matters",
    "education and training": "education",
    "governance": None,
    "customs regulations": "customs",
    "employment services": "employment",
    "health care": "health",
    "maritime services": "sea transport",
    "early childhood education": "education",
    "fiscal policy": "financial matters",
    "marine and rural regulation": "primary industries",
    "civic infrastructure": "works",
    "retirement income": "financial matters",
    "import regulation": "trade",
}

class RSFunctionsClient(RSClient):
    '''
    Harvests the functions that are used in the RS functions browse interface.
    '''

    def add_child(self, child, parents):
        '''
        Checks to see if the child's parent is in the supplied list of parents.
        If it is, a copy of the child (minus its 'parent' key) is added to the
        list of 'narrower' terms in the parent item.

        Parameters:
            child: a term dict with 'term' and 'parent' keys, and optionally
                a 'narrower' list. The child itself is never modified.
            parents: a list of term dicts. Matching items are modified in place.

        Returns the same list of parents that was passed in.
        Raises a KeyError if the child has no 'parent' key.
        '''
        this_child = deepcopy(child)
        this_parent = this_child.pop('parent')
        for parent in parents:
            if parent['term'] != this_parent:
                continue
            siblings = parent.setdefault('narrower', [])
            matches = [s for s in siblings if s['term'] == this_child['term']]
            if not matches:
                siblings.append(this_child)
            # This is necessary to keep narrower items that don't have their own entries
            elif 'narrower' in this_child:
                for match in matches:
                    match['narrower'] = this_child['narrower']
        return parents

    @staticmethod
    def _link_text(link):
        '''
        Returns the lower-cased text of a link, or None if the link is missing
        or has no single text string.
        '''
        text = link.string if link is not None else None
        return text.lower() if text else None

    def _parse_row(self, row):
        '''
        Converts one row of the functions list into a term dict.

        The dict always has a 'term' key. It gets a 'parent' key if the row lists
        a broader term, and a 'narrower' list of {'term': ...} dicts if the row
        lists narrower terms.

        Returns None if the row isn't a preferred term, or has no usable text.
        '''
        link = row.find('a')
        # Non-preferred terms don't have links on them, so we can use this to
        # only include preferred terms.
        if link is None or link.get('href') is None:
            return None
        name = self._link_text(link)
        if name is None:
            return None
        # Grab the term!
        term = {'term': name}
        table = row.find('table')
        if table is None:
            return term
        # Loop through the rows underneath the term heading to get info about related terms
        for related in table.find_all('tr'):
            cells = related.find_all('td')
            if len(cells) < 2:
                continue
            label = cells[0].string or ''
            # There is only ever one broader term, it moves you up the hierarchy
            # It's the parent of the current term
            if 'Broad term' in label:
                parent = self._link_text(cells[1].find('a'))
                if parent is None:
                    print('No usable broader term found for: {}'.format(name))
                # These are all problems in the RS interface -- the hierarchies don't match up
                # So we're rewriting the broader (parent) function to be the one that's actually in use
                parent = BROKEN_HIERARCHIES.get(parent, parent)
                if parent:
                    term['parent'] = parent
            # We're grabbing these narrower terms because I've noticed that not all of them
            # have their own top-level entry. So if I don't get them here, we'll lose them.
            # This means, however, that we'll have duplicates that we'll need to deal with later.
            if 'Narrow terms' in label:
                for narrow_link in cells[1].find_all('a'):
                    narrow = self._link_text(narrow_link)
                    if narrow:
                        term.setdefault('narrower', []).append({'term': narrow})
            # A non-preferred term shouldn't have a link on the term,
            # so we should never actually end up here. But just in case...
            if 'Preferred term' in label:
                print(term)
        return term

    def _build_hierarchy(self, terms, levels=4):
        '''
        Nests a flat list of term dicts into a hierarchy, using their 'parent' keys.

        Terms without a 'parent' form the top level. On each pass every child term
        is offered to the terms one level further down than the previous pass.
        Children whose parent isn't found on any pass are left out.

        Returns the list of top-level terms.
        '''
        # We want to build a hierarchy starting with the top-level functions (those without parents)
        functions = [t for t in terms if 'parent' not in t]
        children = [t for t in terms if 'parent' in t]
        # Each item in current_level is a list of sibling terms at the depth being processed
        current_level = [functions]
        for _ in range(levels):
            for child in children:
                for siblings in current_level:
                    self.add_child(child, siblings)
            # Move down: collect the 'narrower' lists of everything at this depth
            next_level = []
            for siblings in current_level:
                for node in siblings:
                    if 'narrower' in node:
                        next_level.append(node['narrower'])
            current_level = next_level
        return functions

    def harvest_functions(self, levels=4):
        '''
        Harvests terms from each of the 26 letter pages of the functions browse
        interface and assembles them into a hierarchy.

        Parameters:
            levels: the number of passes made down the hierarchy when attaching
                child terms to their parents. Each pass attaches children one
                level further down than the last.

        Returns a list of top-level term dicts, with child terms nested in
        'narrower' lists.
        '''
        terms = []
        self.br.session.headers.update({'Referer': 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/BasicSearch.aspx'})
        self.br.open('http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/AdvSearchFunctionsBrowsing.aspx')
        for letter in tqdm_notebook(range(0, 26)):
            form = self.br.get_form(id='formSNRMaster')
            form['__EVENTTARGET'] = 'ctl00$ContentPlaceHolderSNR$ctl{}'.format(str(letter).zfill(2))
            submit = robobrowser.forms.fields.Input('<input type="submit" value="Submit" name="submit">Submit</input>')
            form.add_field(submit)
            self.br.submit_form(form, submit=form['submit'])
            listing = self.br.find(id='ContentPlaceHolderSNR_dlFunctions')
            if listing is None:
                # No list of functions on this page -- presumably no terms start with this letter
                continue
            for row in listing.find_all('tr', recursive=False):
                term = self._parse_row(row)
                if term is not None:
                    terms.append(term)
        # To build up the hierarchy we start at the top, inserting direct children in the 'narrower' list,
        # then continue down through each level.
        return self._build_hierarchy(terms, levels)

In [95]:
# Create a client and harvest the hierarchy of functions from RecordSearch.
# This makes a request for each of the 26 letter pages in the browse interface.
c = RSFunctionsClient()
functions = c.harvest_functions()

HBox(children=(IntProgress(value=0, max=26), HTML(value='')))




In [112]:
def _function_lines(functions, depth=0):
    '''
    Yields one line of text for each term in the hierarchy, parents before children.

    Top-level terms are upper-cased with no prefix. Narrower terms are title-cased
    and prefixed with two spaces and one dash for each level of nesting.
    '''
    for function in functions:
        if depth == 0:
            yield '{}\n'.format(function['term'].upper())
        else:
            yield '{}{} {}\n'.format('  ' * depth, '-' * depth, function['term'].title())
        # 'narrower' is only present on terms that have children
        yield from _function_lines(function.get('narrower', []), depth + 1)


def write_functions(functions):
    '''
    Saves the harvested list of functions in text and json.

    Parameters:
        functions: a list of term dicts, each with a 'term' key and optionally
            a 'narrower' list of child term dicts (nested to any depth).

    Writes data/functions.txt (an indented outline) and data/functions.json,
    then displays links to both files. The 'data' directory must already exist.
    '''
    with open('data/functions.txt', 'w', encoding='utf-8') as text_file:
        text_file.writelines(_function_lines(functions))
    with open('data/functions.json', 'w', encoding='utf-8') as json_file:
        json.dump(functions, json_file, indent=4)
    display(FileLink('data/functions.txt'))
    display(FileLink('data/functions.json'))
    

In [113]:
# Save the harvested hierarchy to data/functions.txt and data/functions.json
write_functions(functions)