# Summarise index details

This notebook counts the number of rows in each index and calculates the total for the whole repository. It formats the results in nice HTML and Markdown tables for easy browsing.

In [54]:
from IPython.display import display, HTML
import os
from urllib.parse import urljoin
import pandas as pd
from tabulate import tabulate
from slugify import slugify

## Add links and totals to the list of indexes

In [34]:
# Read the list of harvested indexes, then order it by title.
# (The sort is case-sensitive, so 'CSreLand' comes before 'Child Care'.)
df = pd.read_csv('indexes.csv')
df = df.sort_values(by='title')

In [58]:
def make_download_link(title):
    '''
    Build an HTML anchor pointing at an index's CSV file on GitHub.

    The file name is the slugified title plus a `.csv` extension, resolved
    against the raw-content URL of the repository's `data/` directory.
    Returns the anchor as a string with the link text "CSV file".
    '''
    base_url = 'https://raw.githubusercontent.com/wragge/srnsw-indexes/master/data/'
    csv_name = slugify(title) + '.csv'
    return '<a href="{}">CSV file</a>'.format(urljoin(base_url, csv_name))

# Turn the URL and title columns into ready-made HTML anchors for the summary tables.

# Link to background information about the index
df['more_info'] = df['more_info_url'].map('<a href="{}">More info</a>'.format)

# Link to the searchable index on the NSWSA site
df['web'] = df['url'].map('<a href="{}">Browse index</a>'.format)

# Link to the harvested CSV file on GitHub
df['download'] = df['title'].map(make_download_link)

In [59]:
def count_rows(title):
    '''
    Return the number of data rows in the CSV file for the given index title.

    The file is looked up in the local `csv` directory under the slugified title.
    '''
    csv_path = os.path.join('csv', '{}.csv'.format(slugify(title)))
    # Read every column as object; only the row count is needed
    index_rows = pd.read_csv(csv_path, dtype=object)
    return len(index_rows)

# Record the size of each index's CSV file alongside its details
df['rows'] = df['title'].map(count_rows)

In [60]:
# How many rows are there in the whole repository?
# (Sums the per-index row counts added to the 'rows' column.)
df['rows'].sum()

1499259

In [61]:
# Which index has the largest number of rows?
largest_label = df['rows'].idxmax()
df.loc[largest_label]

id                                                              15
more_info_url    https://www.records.nsw.gov.au/archives/collec...
status                                               Not digitised
title                                             Deceased Estates
url              https://www.records.nsw.gov.au/searchhits_noco...
more_info        <a href="https://www.records.nsw.gov.au/archiv...
web              <a href="https://www.records.nsw.gov.au/search...
rows                                                        257524
download         <a href="https://raw.githubusercontent.com/wra...
Name: 29, dtype: object

## Summarise the results of the harvest

In [62]:
# Report how many indexes were harvested and their combined row count
index_count = len(df)
total_rows = df['rows'].sum()
'Currently: {} indexes harvested with {:,} rows of data.'.format(index_count, total_rows)

'Currently: 64 indexes harvested with 1,499,259 rows of data.'

Make a nicely formatted table in both HTML and Markdown.

In [63]:
# Select the columns that we want, in the order they should be displayed
columns = df[['title', 'status', 'rows', 'download', 'web', 'more_info']]

# Create a list of headers, one for each selected column
headers = ['Title', 'Status', 'Number of rows', 'Download data', 'View at NSWSA', 'More info']

# Use Tabulate to generate a HTML table.
# The link columns already hold HTML markup. Recent versions of tabulate escape
# cell contents in the plain 'html' format, which would show the <a> tags as
# literal text, so use 'unsafehtml' to keep the links clickable.
display(HTML(tabulate(columns, headers=headers, showindex=False, tablefmt='unsafehtml')))

# Write a GitHub Markdown formatted version of the table to a file.
# The encoding is set explicitly so the output does not depend on the
# platform's default encoding.
with open('indexes.md', 'w', encoding='utf-8') as md_file:
    md_file.write(tabulate(columns, headers=headers, showindex=False, tablefmt='github'))

Title,Status,Number of rows,Download data,View at NSWSA,More info
Assisted Immigrants,Fully digitised,191688,CSV file,Browse index,More info
Australian Railway Supply Detachment,Fully digitised,65,CSV file,Browse index,More info
Bankruptcy Index,Not digitised,28880,CSV file,Browse index,More info
"Bench of Magistrates cases, 1788-1820",Not digitised,4442,CSV file,Browse index,More info
Botanic Gardens and Government Domains Employees Index,Not digitised,916,CSV file,Browse index,More info
Bubonic Plague Index,Fully digitised,592,CSV file,Browse index,More info
CSreLand,Not digitised,10849,CSV file,Browse index,More info
Child Care and Protection,Not digitised,21980,CSV file,Browse index,More info
"Closer Settlement Transfer Registers, NRS 8082",Not digitised,4957,CSV file,Browse index,More info
Closer and Soldier Settlement Transfer Files,Not digitised,9656,CSV file,Browse index,More info


----

Created by [Tim Sherratt](https://timsherratt.org/).

Part of the [GLAM Workbench](https://glam-workbench.github.io/) project.