# Harvesting Australian government domains from the CDX index

<p class="alert alert-warning">Work in progress – this notebook isn't finished yet. Check back later for more...</p>

Note that collapse doesn't seem to work with domain matching.

In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
import pandas as pd
import requests_cache
import time
from requests_cache import CachedSession
from tinydb import TinyDB

# Session that caches responses, so repeated requests don't hit the server again
s = CachedSession()
# Retry failed requests (502, 503 & 504 responses) up to 10 times, backing off between attempts
retries = Retry(total=10, backoff_factor=1, status_forcelist=[502, 503, 504])
# Each scheme gets its own adapter using the same retry settings
for scheme in ('https://', 'http://'):
    s.mount(scheme, HTTPAdapter(max_retries=retries))

In [6]:
def check_for_resumption_key(results):
    '''
    Look for a resumption key at the end of a list of results.

    If the second-last row is empty, the first value in the last row
    is returned as the resumption key. Returns None if the second-last row
    has content, or if there aren't enough rows/values to contain a key.
    '''
    try:
        marker_row, key_row = results[-2], results[-1]
        return None if marker_row else key_row[0]
    except IndexError:
        return None

def get_total_pages(params):
    '''
    Gets the total number of pages in a set of results.

    Sends a copy of the supplied parameters to the CDX API with
    showNumPages=true added (the original dict isn't modified),
    and returns the body of the response converted to an integer.

    Raises an HTTPError if the API responds with an error status.
    '''
    these_params = params.copy()
    these_params['showNumPages'] = 'true'
    response = s.get('http://web.archive.org/cdx/search/cdx', params=these_params, headers={'User-Agent': ''})
    # As in get_cdx_data(), stop on an error response -- otherwise the error page
    # is passed to int() and the real problem is hidden behind a ValueError.
    response.raise_for_status()
    return int(response.text)

def prepare_params(url, use_resume_key=False, **kwargs):
    '''
    Build the dictionary of parameters for a CDX API request.

    All keyword arguments are passed through as parameters, with
    'url' and 'output' (always 'json') added. If use_resume_key is True,
    'showResumeKey' is set to 'true' as well.

    'from' is a reserved word in Python, so supply it as 'from_' --
    it's renamed to 'from' here.
    '''
    required = {'url': url, 'output': 'json'}
    if use_resume_key:
        required['showResumeKey'] = 'true'
    # Required values are applied after (and so override) the keyword arguments
    params = {**kwargs, **required}
    if 'from_' in params:
        params['from'] = params.pop('from_')
    return params

def get_cdx_data(params):
    '''
    Request a set of results from the CDX API using the supplied parameters.

    Returns a tuple containing the resumption key (None if there isn't one)
    and the list of results. If a resumption key is found, the last two rows
    (the empty marker row and the key itself) are dropped from the results.

    Raises an HTTPError if the API responds with an error status.
    '''
    response = s.get(
        'http://web.archive.org/cdx/search/cdx',
        params=params,
        headers={'User-Agent': ''}
    )
    response.raise_for_status()
    rows = response.json()
    key = check_for_resumption_key(rows)
    # Drop the marker row and key from the end of the results
    rows = rows[:-2] if key else rows
    # Only pause if the request actually went to the server
    if not response.from_cache:
        time.sleep(0.2)
    return key, rows

def convert_lists_to_dicts(results):
    '''
    Convert a list of rows into a list of dicts.

    The first row is used as the keys for every following row.
    Empty results are handed back unchanged.
    '''
    if not results:
        return results
    header, *rows = results
    return [dict(zip(header, row)) for row in rows]

def query_cdx_by_page(url, **kwargs):
    '''
    Harvest results from the CDX API one page at a time, saving them to db.json.

    Any records already in db.json are removed first. The total number of pages
    is requested up front, then each page is retrieved in turn and its rows are
    saved to the database as dicts (using the first row of each page as keys).

    Parameters are the same as prepare_params() -- the url to search for, followed
    by any CDX parameters as keyword arguments (use from_ instead of from).

    Nothing is returned -- load the harvested data from db.json.
    '''
    db = TinyDB('db.json')
    try:
        # purge() was renamed to truncate() in TinyDB 4 -- use whichever this version has
        clear_db = getattr(db, 'truncate', None) or db.purge
        clear_db()
        params = prepare_params(url, **kwargs)
        total_pages = get_total_pages(params)
        # The first bar counts pages, the second counts the rows saved
        with tqdm(total=total_pages) as pbar1, tqdm() as pbar2:
            for page in range(total_pages):
                params['page'] = page
                _, results = get_cdx_data(params)
                captures = convert_lists_to_dicts(results)
                db.insert_multiple(captures)
                pbar1.update(1)
                # Count the rows actually saved. Using len(results) - 1 would
                # subtract one from the running total whenever a page is empty.
                pbar2.update(len(captures))
    finally:
        # Make sure the database file is closed, even if a request fails
        db.close()

def query_cdx_with_key(url, **kwargs):
    '''
    Harvest results from the CDX API using the supplied parameters.
    Uses showResumeKey to check if there is more than one page of results,
    and if so loops through the pages until all results are downloaded.

    Parameters are the same as prepare_params() -- the url to search for, followed
    by any CDX parameters as keyword arguments (use from_ instead of from).

    Returns a single list of rows. The first row is the header row from
    the first request; header rows from later requests are dropped.
    '''
    params = prepare_params(url, use_resume_key=True, **kwargs)
    with tqdm() as pbar:
        # This will include the header row
        resumption_key, all_results = get_cdx_data(params)
        # Don't count the header row -- and don't go below zero if nothing came back
        pbar.update(max(len(all_results) - 1, 0))
        while resumption_key is not None:
            params['resumeKey'] = resumption_key
            resumption_key, results = get_cdx_data(params)
            # Remove the header row and add
            all_results.extend(results[1:])
            pbar.update(max(len(results) - 1, 0))
    return all_results

In [None]:
# Get an overview of the domain -- unique urlkeys only
# Then get all the 3rd level domains and harvest them separately, collapsed on year, to show change over time
# Trying to balance speed, performance, file size, memory etc
# Note that harvesting a domain involves the same number of pages (ie requests) no matter what filters are applied -- it's just that some pages will be empty.
# So repeating a domain harvest with different filters will mean less data, but the same number of requests.
# What's most efficient? I dunno.
# NOTE(review): query_cdx_by_page() saves its results to db.json and has no return statement, so `results` will be None here.
results = query_cdx_by_page('*.gov.au', filter=['statuscode:200', 'mimetype:text/html'], collapse='urlkey', fl='urlkey', pageSize=5)

In [8]:
# Load all the records saved in db.json into a dataframe.
# Opening the database in a `with` block closes the file once the data has been read.
with TinyDB('db.json') as db:
    df = pd.DataFrame(db.all())

In [10]:
# Preview the first few rows of the dataframe
df.head()

Unnamed: 0,urlkey
0,"au,gov,naa)/"
1,"au,gov,naa)/../the_collection/cabinet/1967_cab..."
2,"au,gov,naa)/?c=a.checked:b==="
3,"au,gov,naa)/?sssdmh=dm13.167154"
4,"au,gov,naa)/?sssdmh=dm13.167154/"


In [None]:
# Scratch data -- a list containing a single row, for trying out d[-2] below
d = [[1]]

In [None]:
# If `d` has fewer than two rows (as with d = [[1]]), d[-2] raises an IndexError
# instead of returning something falsy -- this is why check_for_resumption_key()
# wraps its test in a try/except. Catch it here so the cell shows what happened
# rather than stopping the notebook.
try:
    print('y' if d[-2] else 'n')
except IndexError:
    print('IndexError')

In [None]:
# Build a dataframe from a list of rows, using the first row as the column names.
# NOTE(review): this replaces the `df` loaded from db.json, and needs `results` to be a list of rows
# (as returned by query_cdx_with_key) -- but `results` was assigned from query_cdx_by_page() above,
# which has no return statement. TODO confirm which harvest this cell belongs to.
df = pd.DataFrame(results[1:], columns=results[0])
df.head()

In [None]:
# Number of rows and columns in the dataframe
df.shape

In [None]:
# Number of unique urlkeys (displayed as a one-element tuple)
df['urlkey'].unique().shape

In [None]:
# Keep just the domain part of each urlkey -- everything before the first ')'.
# n=1 stops splitting after the first ')', so urlkeys with brackets in their
# paths don't create extra columns that are never used.
df_domains = df['urlkey'].str.split(')', n=1, expand=True)[0].to_frame()

In [None]:
# Preview the first 10 domain values
df_domains.head(10)

In [None]:
# Number of rows for each domain
df_domains[0].value_counts()

In [None]:
# Keep one row per domain and renumber the index.
# Reassigning rather than using inplace=True keeps the cell's effect explicit.
df_domains = df_domains.drop_duplicates(ignore_index=True)

In [None]:
# Display the domains
df_domains

In [None]:
# Split each domain on commas so every component gets its own column, eg 'au,gov,naa' -> 'au', 'gov', 'naa'
df_sub = df_domains[0].str.split(',', expand=True)

In [None]:
# Count the values in column 2 -- the third component of each domain
df_sub[2].value_counts()

In [None]:
# Count the values in column 4 -- the fifth component of each domain, where there is one
df_sub[4].value_counts()

In [None]:
# Show the domains whose fifth component is 'archcms01'
df_sub.loc[df_sub[4] == 'archcms01']

In [None]:
# NOTE(review): harvest_by_year() isn't defined in the visible part of this notebook,
# so this will raise a NameError on a fresh kernel -- TODO define it or remove this cell.
harvest_by_year()

In [None]:
# Work through every page of results, collecting each distinct domain
# (the part of the urlkey before the first ')') in the order it's first seen.
# NOTE(review): `params` isn't defined at the top level of the visible notebook
# (it only exists inside the functions above) -- TODO define it before running this cell.
gov_domains = []
# Set used for fast 'have we seen this?' checks -- gov_domains keeps the order
seen_domains = set()
total_pages = get_total_pages(params)
page = 0
with tqdm(total=total_pages-page) as pbar:
    these_params = params.copy()
    these_params['output'] = 'json'
    while page < total_pages:
        these_params['page'] = page
        response = s.get('http://web.archive.org/cdx/search/cdx', params=these_params)
        # Stop on an error response rather than trying to parse it as JSON
        response.raise_for_status()
        # Skip the header row
        for capture in response.json()[1:]:
            urlkey = capture[0]
            # partition() returns the whole urlkey if it has no ')',
            # whereas slicing with find() would chop off its last character
            domain = urlkey.partition(')')[0]
            if domain not in seen_domains:
                seen_domains.add(domain)
                gov_domains.append(domain)
        page += 1
        pbar.update(1)

In [None]:
# Use the first row of the most recent response as the column names.
# NOTE(review): `captures` isn't defined in the visible part of this notebook,
# so this will raise a NameError on a fresh kernel -- TODO confirm where it should come from.
columns = response.json()[0]
df = pd.DataFrame(captures, columns=columns)

In [None]:
import json

In [None]:
# Save the list of domains as a JSON file
with open('gov_domains.json', 'w') as json_file:
    json.dump(gov_domains, json_file)

In [None]:
# Preview the first few rows of the dataframe
df.head()

In [None]:
# NOTE(review): `domains` isn't defined in the visible part of this notebook --
# possibly this was meant to be `gov_domains`? TODO confirm.
len(domains)

Add `id_` to the timestamp in a web archive URL to get the original HTML (ie not the replay version), eg: http://wayback.archive-it.org/all/20190630231630id_/http://discontents.com.au/

In [None]:
# Number of domains collected
len(gov_domains)

In [None]:
# Put the list of domains into a dataframe
df_gov = pd.DataFrame(gov_domains)

In [None]:
# Split each domain on commas so every component gets its own column
df_gov_split = df_gov[0].str.split(',', expand=True)

In [None]:
# Preview the first few rows of the split domains
df_gov_split.head()

In [None]:
# The 25 most common values in column 2 -- the third component of each domain
df_gov_split[2].value_counts()[:25]

In [None]:
# Show the domains whose third component is 'dfat'
df_gov_split.loc[df_gov_split[2] == 'dfat']

In [None]:
# Save the split domains as a CSV file, leaving out the index
df_gov_split.to_csv('gov_domains.csv', index=False)

In [None]:
# Ask for *.nsw.gov.au captures in a single request, filtered to html pages with a status of 200 and collapsed on urlkey.
# NOTE(review): this calls requests directly rather than using the session `s` set up at the top of the notebook,
# so it doesn't use that session's cache or retry settings, and no timeout is set.
response = requests.get('http://web.archive.org/cdx/search/cdx?url=*.nsw.gov.au&filter=statuscode:200&filter=mimetype:text/html&collapse=urlkey&output=json')

In [None]:
# Number of rows in the response (presumably including a header row, as in the harvests above -- TODO confirm)
len(response.json())