# Harvesting digitised books

This notebook harvests metadata and OCRd text from digitised books in Trove. There are three main steps:

* Harvest metadata of digitised books using the Trove API
* Extract the number of pages for each book from the Trove web interface (the number of pages is necessary to download the OCRd text)
* Download the OCRd text for each book

It's not easy to identify all the digitised books in Trove. I'm starting with a [search in the book zone](https://trove.nla.gov.au/book/result?q=%22nla.obj%22&l-availability=y) for records that include the phrase `"nla.obj"` and are available online. This currently returns 21,699 results. However, this includes works from other zones, such as maps and music. It also includes books where access to the digital copy is 'restricted'. I think these are mostly recent books submitted in digital form under legal deposit. I've filtered the 21,699 results to remove records where the digital copy is not available, and where the primary format is not 'book'. This currently reduces the total to 7,719 results. It's possible that I might have filtered out too many titles.

You can download a [CSV file with the harvested metadata](trove_digitised_books.csv).

I've currently downloaded the OCRd text for about 2,000 books. You can view the files in the [text](text) directory of this repository.

## Setting things up

In [124]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm_notebook
from IPython.display import display, FileLink
import pandas as pd
import json
import re
import time
import os

In [85]:
# Create a session that retries (up to 5 times, with backoff) any request
# that comes back with a 502, 503, or 504 status.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
# Use the same retry policy for both secure and plain http urls.
for prefix in ('https://', 'http://'):
    s.mount(prefix, HTTPAdapter(max_retries=retries))

In [25]:
# Add your Trove API key below
# These parameters are copied by the harvesting functions for each API request.
params = {
    'key': '',  # your Trove API key (blank until you add it)
    'zone': 'book',  # search the book zone
    'q': 'nla.obj',  # records that include the phrase "nla.obj"
    'bulkHarvest': 'true',
    'n': 100,  # results per request (get_total_results() overrides this with 0)
    'encoding': 'json',
    'l-availability': 'y'  # only records that are available online
}

## Harvest metadata using the API

In [76]:
def get_total_results():
    '''
    Get the total number of results for a search.

    Sends the search defined in the module-level `params` to the Trove API,
    with `n` set to 0 so that only the totals are requested.

    Returns:
        The total number of records reported for the first zone
        in the response, as an int.
    '''
    these_params = params.copy()
    these_params['n'] = 0
    # The response has to be kept so that the JSON can be read from it below.
    response = s.get('https://api.trove.nla.gov.au/v2/result', params=these_params)
    data = response.json()
    return int(data['response']['zone'][0]['records']['total'])


def get_fulltext_url(links):
    '''
    Find the link to the full text version of a book amongst its identifiers.

    Returns the 'value' of the first identifier whose 'linktype' is 'fulltext'
    and whose value contains 'nla.obj', or None if there isn't one.
    '''
    candidates = (
        link['value']
        for link in links
        if link['linktype'] == 'fulltext' and 'nla.obj' in link['value']
    )
    # The generator is lazy, so nothing after the first match is examined.
    return next(candidates, None)


def harvest_books():
    '''
    Harvest metadata relating to digitised books.

    Pages through the search defined in the module-level `params`, following
    the `nextStart` value in each response until there isn't one.
    A record is kept only if it has a full text link (see get_fulltext_url())
    and 'Book' is the first value in its list of types.

    Returns:
        A list of dicts with the keys 'title', 'url', 'contributors',
        'date', and 'fulltext_url'.
    '''
    books = []
    total = get_total_results()
    start = '*'
    these_params = params.copy()
    with tqdm_notebook(total=total) as pbar:
        while start:
            these_params['s'] = start
            response = s.get('https://api.trove.nla.gov.au/v2/result', params=these_params)
            data = response.json()
            records = data['response']['zone'][0]['records']
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get('nextStart')
            # A page might not include any works at all.
            works = records.get('work', [])
            for record in works:
                # See if there's a link to the full text version.
                # Records without any identifiers can't have one.
                fulltext_url = get_fulltext_url(record.get('identifier') or [])
                # I'm making the assumption that if this is a booky book (not a map or music etc),
                # then 'Book' will appear first in the list of types.
                # This might not be a valid assumption.
                try:
                    format_type = record.get('type')[0]
                except (IndexError, TypeError):
                    format_type = None
                # Save the record if there's a full text link and it's a booky book.
                if fulltext_url and format_type == 'Book':
                    # The 'contributor' field may have a single value or an array.
                    # If it's an array, join the values into a string.
                    # A single string is left as is -- joining it would put
                    # a '|' between every character.
                    contributors = record.get('contributor')
                    if not isinstance(contributors, str):
                        try:
                            contributors = '|'.join(contributors)
                        except TypeError:
                            # Missing (None) or not a list of strings, so keep the raw value.
                            pass
                    # Get the basic metadata.
                    book = {
                        'title': record.get('title'),
                        'url': record.get('troveUrl'),
                        'contributors': contributors,
                        'date': record.get('issued'),
                        'fulltext_url': fulltext_url
                    }
                    books.append(book)
            # Advance by the number of records actually received,
            # so the last (short) page doesn't push the bar past the total.
            pbar.update(len(works))
    return books

In [77]:
# Do the harvest!
# `books` will be a list of metadata dicts, one for each record kept by harvest_books().
books = harvest_books()

HBox(children=(IntProgress(value=0, max=21669), HTML(value='')))

## Get the number of pages in each book

In [None]:
def get_work_data(url):
    '''
    Fetch a work's HTML page and extract the work data embedded in it as JSON.

    Returns the parsed data, or an empty dict if the page doesn't contain
    the expected `var work = JSON.parse(JSON.stringify({...` snippet.
    '''
    page = s.get(url)
    found = re.search(r'var work = JSON\.parse\(JSON\.stringify\((\{.*\})', page.text)
    # If the snippet isn't in the page, fall back to an empty JSON object.
    raw_json = found.group(1) if found else '{}'
    return json.loads(raw_json)


def add_pages(books):
    '''
    Add the number of pages to the metadata for each book.

    Looks up each book's work data using its 'fulltext_url' and stores the
    page count under a new 'pages' key. The dicts in `books` are updated
    in place, and also returned in a new list.
    '''
    updated = []
    for record in tqdm_notebook(books):
        work = get_work_data(record['fulltext_url'])
        try:
            page_count = len(work['children']['page'])
        except KeyError:
            # Works without children are assumed to have a single page.
            # That's not certain, but it seems to hold for most of the ones checked.
            # (One record was found to include 2 versions of the same book --
            # that case isn't handled.)
            page_count = 1
        record['pages'] = page_count
        updated.append(record)
        # Pause briefly between requests.
        time.sleep(0.2)
    return updated

In [93]:
# Add number of pages to the book metadata
# This requests the web page of every book in turn, pausing briefly between each.
books_with_pages = add_pages(books)

HBox(children=(IntProgress(value=0, max=7719), HTML(value='')))

In [95]:
# Load the metadata into a dataframe and preview the first few rows
df = pd.DataFrame(books_with_pages)
df.head()

Unnamed: 0,contributors,date,fulltext_url,pages,title,url
0,"Taplin, George",1878-1880,http://nla.gov.au/nla.obj-688657424,24,Grammar of the Narrinyeri tribe of Australian ...,https://trove.nla.gov.au/work/10029401
1,Miriam Agatha,1914-1923,http://nla.gov.au/nla.obj-24357566,246,Nellie Doran : a story of Australian home and ...,https://trove.nla.gov.au/work/10049667
2,,1915,http://nla.gov.au/nla.obj-509324870,33,Le Siege de Berlin : Drame en un Acte / Charle...,https://trove.nla.gov.au/work/10069391
3,"Willshire, W. H. (William Henry), 1852-1925",1888,http://nla.gov.au/nla.obj-188910904,33,The Aborigines of Central Australia : with a v...,https://trove.nla.gov.au/work/10076872
4,"Bent, Andrew, 1790-1851",1827,http://nla.gov.au/nla.obj-76416324,2,General power of attorney,https://trove.nla.gov.au/work/10100860


In [101]:
# Check the number of rows and columns in the dataframe
df.shape

(7719, 6)

In [117]:
# Extract the id (eg 'nla.obj-688657424') from the full text url and save to a new column
df['book_id'] = df['fulltext_url'].str.extract(r'(nla\.obj\-\d+)', expand=False)

In [125]:
# Save the metadata as a CSV file (without the index column) and display a link to it
df.to_csv('trove_digitised_books.csv', index=False)
display(FileLink('trove_digitised_books.csv'))

## Download the OCRd texts

In [119]:
def save_ocr(books):
    '''
    Download the OCRd text for each book.

    Each book is a dict with 'title' and 'pages' values, plus either a
    'book_id' (eg 'nla.obj-688657424') or a 'fulltext_url' that the id can be
    extracted from. The text of each book is saved to 'text/<book_id>.txt'.
    Books that already have a non-empty file are skipped, so an interrupted
    harvest can be restarted. The ids of books whose download was empty are
    printed once all the books have been processed.
    '''
    empty = []
    # Prepare a directory to save the texts into
    output_dir = 'text'
    os.makedirs(output_dir, exist_ok=True)
    # Loop through the books
    for book in tqdm_notebook(books):
        print(book['title'])
        # Use the book's id if it has one, otherwise extract it from the full text url.
        book_id = book.get('book_id')
        if not isinstance(book_id, str) or not book_id:
            match = re.search(r'nla\.obj-\d+', str(book.get('fulltext_url', '')))
            if match is None:
                # Without an id there's no way to build the download url.
                print('No book id found')
                continue
            book_id = match.group(0)
        # The index value for the last page of a book will be the total pages - 1
        last_page = book['pages'] - 1
        filename = '{}/{}.txt'.format(output_dir, book_id)
        # Check to see if the file has already been harvested
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            print('Already saved')
        else:
            url = 'https://trove.nla.gov.au/{}/download?downloadOption=ocr&firstPage=0&lastPage={}'.format(book_id, last_page)
            print(url)
            # Get the file
            r = s.get(url)
            # Check there was no error
            if r.status_code == requests.codes.ok:
                # Check that the file's not empty
                if len(r.content) > 0:
                    # If everything's ok, save the file
                    with open(filename, 'wb') as text_file:
                        text_file.write(r.content)
                    print('Saved')
                else:
                    print('Empty')
                    # Store the ids of empty files for later
                    empty.append(book_id)
                # Pause between downloads
                time.sleep(1)
            else:
                print('There was a problem: {}'.format(r.status_code))
    print(empty)

In [None]:
# Download the OCRd text of each book into the 'text' directory
save_ocr(books_with_pages)