# Harvesting Australian government domains from the CDX index

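Note that the `collapse` parameter doesn't seem to work with domain matching (`matchType=domain`), so the harvesting loop below filters out duplicate domains itself.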

In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
import pandas as pd
import requests_cache

# Shared HTTP session: requests that come back with a 502, 503 or 504 status are
# retried (up to 10 attempts in total, backoff factor 1) for both URL schemes.
s = requests.Session()
retries = Retry(
    total=10,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
)
for prefix in ('https://', 'http://'):
    s.mount(prefix, HTTPAdapter(max_retries=retries))

In [15]:
# Ready-made CDX query URL for *.nsw.gov.au, restricted to HTML captures
# that returned a 200 status.
nsw = 'http://web.archive.org/cdx/search/cdx?url=*.nsw.gov.au&filter=statuscode:200&filter=mimetype:text/html'

# Query parameters for the gov.au harvest: match the whole domain, keep only
# successful HTML captures, and return just the fields listed in `fl`.
params = dict(
    url='gov.au',
    matchType='domain',
    filter=['statuscode:200', 'mimetype:text/html'],
    collapse='urlkey',
    fl='urlkey,mimetype,statuscode',
)

def get_total_pages(params):
    """
    Ask the CDX API how many pages of results a query will return.

    Parameters:
        params (dict): CDX query parameters. The dict itself is left
            untouched; a copy is made and `showNumPages=true` is added to it.

    Returns:
        int: the page count reported in the response body.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        ValueError: if the response body is not an integer.
    """
    these_params = params.copy()
    these_params['showNumPages'] = 'true'
    response = s.get('http://web.archive.org/cdx/search/cdx', params=these_params)
    # Surface HTTP errors directly; otherwise an error page would only show up
    # as a confusing ValueError when its body is passed to int().
    response.raise_for_status()
    return int(response.text)

In [16]:
# Check how many pages of results the gov.au query will produce.
get_total_pages(params)

46826

In [20]:
# Page to start harvesting from. The non-zero value presumably resumes an
# earlier, interrupted run; set it to 0 to harvest every page from scratch.
start_page = 43875

# Keep any domains collected by a previous run of this cell (so a harvest can be
# resumed), but make sure the list exists when running on a fresh kernel.
if 'gov_domains' not in globals():
    gov_domains = []
# Set used only for fast membership tests; gov_domains keeps first-seen order.
seen_domains = set(gov_domains)

total_pages = get_total_pages(params)
page = start_page
with tqdm(total=max(total_pages - page, 0)) as pbar:
    these_params = params.copy()
    these_params['output'] = 'json'
    while page < total_pages:
        these_params['page'] = page
        response = s.get('http://web.archive.org/cdx/search/cdx', params=these_params)
        # Stop here rather than try to parse an error page as JSON.
        response.raise_for_status()
        # The first row of the JSON output is skipped (it holds the field names).
        for capture in response.json()[1:]:
            urlkey = capture[0]
            # The part of the urlkey before ')' is the comma-separated domain
            # (e.g. 'au,gov,dfat'). partition() keeps the whole key if there is
            # no ')', where slicing on find() == -1 would drop the last character.
            domain = urlkey.partition(')')[0]
            if domain not in seen_domains:
                seen_domains.add(domain)
                gov_domains.append(domain)
        page += 1
        pbar.update(1)

HBox(children=(IntProgress(value=0, max=2951), HTML(value='')))




In [None]:
# `captures` is not defined by any other cell, so build the frame from the most
# recently fetched response: row 0 holds the field names, the remaining rows
# are the captures. Note this covers only that one response, not a whole harvest.
rows = response.json()
columns = rows[0]
captures = rows[1:]
df = pd.DataFrame(captures, columns=columns)

In [7]:
import json

In [21]:
# Persist the harvested domain list to disk as JSON.
with open('gov_domains.json', 'w') as out_file:
    out_file.write(json.dumps(gov_domains))

In [None]:
# Preview the first few rows of the captures frame.
df.head()

In [14]:
# NOTE(review): `domains` is not defined in any cell of this notebook, so this
# fails on a fresh kernel — probably left over from an earlier session; confirm
# whether it should be removed or replaced with `gov_domains`.
len(domains)

7109

Add `id_` to the timestamp of a web archive URL to get the original HTML (i.e. not the replay version), e.g.: http://wayback.archive-it.org/all/20190630231630id_/http://discontents.com.au/

In [22]:
# Number of unique domains harvested.
len(gov_domains)

25488

In [23]:
# Load the harvested domain strings into a single-column DataFrame (column 0).
df_gov = pd.DataFrame(gov_domains)

In [29]:
# Split each comma-separated domain into its labels, one label per column.
df_gov_split = df_gov[0].str.split(',', expand=True)

In [30]:
# Preview the first few rows of the split domain labels.
df_gov_split.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,au,gov,,,,,,,,
1,au,gov,000,,,,,,,
2,au,gov,2004wheatreview,,,,,,,
3,au,gov,2commando,,,,,,,
4,au,gov,aa,,,,,,,


In [41]:
# The 25 most common values in column 2 (the label that follows 'au' and 'gov').
df_gov_split[2].value_counts().head(25)

nsw         7109
vic         3334
qld         2709
wa          2651
sa          1633
tas          944
nt           747
act          362
embassy      150
nla          147
govspace     111
deewr         77
treasury      70
health        70
dest          69
ato           66
ga            66
bom           60
abs           56
naa           52
tspace        49
dfat          47
business      46
govcms        44
aph           44
Name: 2, dtype: int64

In [45]:
# Show every harvested domain whose third label is 'dfat'.
is_dfat = df_gov_split[2] == 'dfat'
df_gov_split.loc[is_dfat]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2783,au,gov,dfat,,,,,,,
2784,au,gov,dfat,2006yoe,,,,,,
2785,au,gov,dfat,aid,,,,,,
2786,au,gov,dfat,asialine,,,,,,
2787,au,gov,dfat,asno,,,,,,
2788,au,gov,dfat,ausconnect,,,,,,
2789,au,gov,dfat,blog,,,,,,
2790,au,gov,dfat,cfmt,,,,,,
2791,au,gov,dfat,diplomaticacademy,,,,,,
2792,au,gov,dfat,forms,,,,,,


In [35]:
# Save the split domain labels as CSV, leaving out the DataFrame index.
df_gov_split.to_csv('gov_domains.csv', index=False)

In [3]:
# Fetch the NSW government HTML captures (collapsed on urlkey) in one request.
# Go through the shared session so 502/503/504 responses are retried like every
# other CDX request here, and fail on an error status instead of parsing it.
response = s.get('http://web.archive.org/cdx/search/cdx?url=*.nsw.gov.au&filter=statuscode:200&filter=mimetype:text/html&collapse=urlkey&output=json')
response.raise_for_status()

In [5]:
# Number of rows in the JSON response.
# NOTE(review): presumably this count includes the header row of field names — confirm.
len(response.json())

165424