# Exploring the Internet Archive's CDX API

In [51]:
import requests
import pandas as pd
from io import BytesIO
import altair as alt
import os
from base64 import b32encode
from hashlib import sha1

alt.data_transformers.enable('json', urlpath='files')
s = requests.Session()

## Documentation

* [Wayback Machine APIs](https://archive.org/help/wayback_api.php)
* [Wayback CDX API](https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server)
* [Archive-it's CDX/C API](https://support.archive-it.org/hc/en-us/articles/115001790023-Access-Archive-It-s-Wayback-index-with-the-CDX-C-API) – includes useful general documentation of CDX format
* [Hashes](https://blogs.loc.gov/thesignal/2011/11/hashing-out-digital-trust/?loclr=blogsig)

## Captures of a single page

Let's have a look at the sort of data the CDX server gives us.

* `url`
* `limit`

In [24]:
# 8 April 2020 -- without the 'User-Agent' header parameter I get a 445 error
response = requests.get('http://web.archive.org/cdx/search/cdx?url=nla.gov.au&limit=10', headers={'User-Agent': ''})
results = response.text
print(results)

au,gov,nla)/ 19961019064223 http://www.nla.gov.au:80/ text/html 200 M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI 1135
au,gov,nla)/ 19961221102755 http://www.nla.gov.au:80/ text/html 200 TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE 1138
au,gov,nla)/ 19961221132358 http://nla.gov.au:80/ text/html 200 65SH4ZQ7ZYTTPYSVFQUSKZXJPZKSI6XA 603
au,gov,nla)/ 19961223031839 http://www2.nla.gov.au:80/ text/html 200 6XHDP66AXEPMVKVROHHDN6CPZYHZICEX 457
au,gov,nla)/ 19970212053405 http://www.nla.gov.au:80/ text/html 200 TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE 1141
au,gov,nla)/ 19970215222554 http://nla.gov.au:80/ text/html 200 65SH4ZQ7ZYTTPYSVFQUSKZXJPZKSI6XA 603
au,gov,nla)/ 19970315230640 http://www.nla.gov.au:80/ text/html 200 NOUNS3AYAIAOO4LRFD23MQWW3QIGDMFB 1126
au,gov,nla)/ 19970315230640 http://www.nla.gov.au:80/ text/html 200 TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE 1140
au,gov,nla)/ 19970413005246 http://nla.gov.au:80/ text/html 200 65SH4ZQ7ZYTTPYSVFQUSKZXJPZKSI6XA 603
au,gov,nla)/ 19970418074154 http://www.nla.gov.au:80/ text/ht

Fields:

* `urlkey` – the page url expressed as a [SURT](http://crawler.archive.org/articles/user_manual/glossary.html#surt) (Sort-friendly URI Reordering Transform)
* timestamp
* original
* mimetype
* statuscode
* digest
* length

Why do some captures have the same digest, but different sizes?

In [5]:
'NOUNS3AYAIAOO4LRFD23MQWW3QIGDMFB' == 'NOUNS3AYAIAOO4LRFD23MQWW3QIGDMFB'

True

Because headers aren't included in the checksum.

## Requesting a particular capture

We can use the `timestamp` to request a the original HTML content of the archived page. If you don't include the `id_`, you get the page with all the Internet Archive navigation elements included.

```
https://web.archive.org/web/[timestamp]id_/[url]
```

## How digests are made


In [52]:
print(b32encode(sha1('This is a string.'.encode()).digest()).decode())
print(b32encode(sha1('This is a string!'.encode()).digest()).decode())

3VQDI552JQRW5ROPWTSKINAWFWGWQ6CQ
MWTI7PY7WJDIBYQKZ2P2Y5UA75UWOSYR


In [91]:
for result in results.splitlines():
    values = result.split()
    print(values[1])
    snapshot_url = f'https://web.archive.org/web/{values[1]}id_/http://www.nla.gov.au/'
    response = requests.get(snapshot_url, headers={'User-Agent': ''})
    checksum = b32encode(sha1(response.content).digest())
    print(checksum.decode() == values[5])

19961019064223
True
19961221102755
True
19961221132358
True
19961223031839
True
19970212053405
True
19970215222554
True
19970315230640
True
19970315230640
False
19970413005246
True
19970418074154
True


In [None]:
https://web.archive.org/web/19970315230640id_/http://www.nla.gov.au/

As JSON

In [3]:
params = {
    'url': 'nla.gov.au',
    'output': 'json',
    'limit': 5
}
response = requests.get('http://web.archive.org/cdx/search/cdx', params=params, headers={'User-Agent': ''})
response.json()

[['urlkey',
  'timestamp',
  'original',
  'mimetype',
  'statuscode',
  'digest',
  'length'],
 ['au,gov,nla)/',
  '19961019064223',
  'http://www.nla.gov.au:80/',
  'text/html',
  '200',
  'M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI',
  '1135'],
 ['au,gov,nla)/',
  '19961221102755',
  'http://www.nla.gov.au:80/',
  'text/html',
  '200',
  'TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE',
  '1138'],
 ['au,gov,nla)/',
  '19961221132358',
  'http://nla.gov.au:80/',
  'text/html',
  '200',
  '65SH4ZQ7ZYTTPYSVFQUSKZXJPZKSI6XA',
  '603'],
 ['au,gov,nla)/',
  '19961223031839',
  'http://www2.nla.gov.au:80/',
  'text/html',
  '200',
  '6XHDP66AXEPMVKVROHHDN6CPZYHZICEX',
  '457'],
 ['au,gov,nla)/',
  '19970212053405',
  'http://www.nla.gov.au:80/',
  'text/html',
  '200',
  'TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE',
  '1141']]

In [4]:
def query_cdx(url, **kwargs):
    params = kwargs
    params['url'] = url
    params['output'] = 'json'
    response = requests.get('http://web.archive.org/cdx/search/cdx', params=params, headers={'User-Agent': ''})
    response.raise_for_status()
    return response.json()

In [5]:
query_cdx('nla.gov.au', limit=2)

[['urlkey',
  'timestamp',
  'original',
  'mimetype',
  'statuscode',
  'digest',
  'length'],
 ['au,gov,nla)/',
  '19961019064223',
  'http://www.nla.gov.au:80/',
  'text/html',
  '200',
  'M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI',
  '1135'],
 ['au,gov,nla)/',
  '19961221102755',
  'http://www.nla.gov.au:80/',
  'text/html',
  '200',
  'TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE',
  '1138']]

## The complete capture history

In [72]:
# No limit!
data = query_cdx('nla.gov.au')
len(data)

2789

In [73]:
df_nla = pd.DataFrame(data[1:], columns=data[0])
df_nla.head()

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length
0,"au,gov,nla)/",19961019064223,http://www.nla.gov.au:80/,text/html,200,M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI,1135
1,"au,gov,nla)/",19961221102755,http://www.nla.gov.au:80/,text/html,200,TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE,1138
2,"au,gov,nla)/",19961221132358,http://nla.gov.au:80/,text/html,200,65SH4ZQ7ZYTTPYSVFQUSKZXJPZKSI6XA,603
3,"au,gov,nla)/",19961223031839,http://www2.nla.gov.au:80/,text/html,200,6XHDP66AXEPMVKVROHHDN6CPZYHZICEX,457
4,"au,gov,nla)/",19970212053405,http://www.nla.gov.au:80/,text/html,200,TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE,1141


In [74]:
df_nla['mimetype'].value_counts()

text/html       2619
warc/revisit     169
Name: mimetype, dtype: int64

In [75]:
df_nla['statuscode'].value_counts()

200    2066
301     287
302     264
-       169
503       2
Name: statuscode, dtype: int64

In [76]:
# Convert the timestamp string into a datetime object
df_nla['date'] = pd.to_datetime(df_nla['timestamp'])

# Convert the length from a string into an integer
df_nla['length'] = df_nla['length'].astype('int')

In [77]:
print(f'{len(df_nla["digest"].unique()) / df_nla.shape[0]:.2%} unique')

51.76% unique


In [82]:
df_nla['duplicated'] = df_nla['digest'].duplicated()


In [11]:
# This is wrong! The regex is picking up things like http://Trove@nla.gov.au/ and saving Trove as a subdomain!
df_nla['subdomain'] = df_nla['original'].str.extract(r'https*:\/\/(\w*).*nla')

In [12]:
df_nla['subdomain'].value_counts()

www       2241
           529
www2        10
Trove        6
mailto       2
Name: subdomain, dtype: int64

In [13]:
df_nla.dtypes

urlkey                object
timestamp             object
original              object
mimetype              object
statuscode            object
digest                object
length                 int64
date          datetime64[ns]
subdomain             object
dtype: object

### Visualise

In [85]:
alt.Chart(df_nla).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode',
).properties(width=700, height=200)

In [86]:
alt.Chart(df_nla).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='duplicated',
).properties(width=700, height=200)

In [89]:
df_nla.loc[(df_nla['date'] > '2004-01-01') & (df_nla['date'] < '2005-12-31')]['duplicated'].value_counts()

True     452
False    100
Name: duplicated, dtype: int64

In [88]:
alt.Chart(df_nla.loc[(df_nla['date'] > '2004-01-01') & (df_nla['date'] < '2005-12-31')]).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='duplicated',
).properties(width=700, height=200)

### Facet by subdomain

In [15]:
alt.Chart(df_nla).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode',
    row='subdomain'
).properties(width=700, height=200)

### Limit to www

In [16]:
alt.Chart(df_nla.loc[df_nla['subdomain'] == 'www']).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode'
).properties(width=700, height=200)

### Compare pages

In [69]:
df_nla

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length,subdomain,date,org
0,"au,gov,nla)/",19961019064223,http://www.nla.gov.au:80/,text/html,200,M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI,1135,www,1996-10-19 06:42:23,NLA
1,"au,gov,nla)/",19961221102755,http://www.nla.gov.au:80/,text/html,200,TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE,1138,www,1996-12-21 10:27:55,NLA
2,"au,gov,nla)/",19961221132358,http://nla.gov.au:80/,text/html,200,65SH4ZQ7ZYTTPYSVFQUSKZXJPZKSI6XA,603,,1996-12-21 13:23:58,NLA
3,"au,gov,nla)/",19961223031839,http://www2.nla.gov.au:80/,text/html,200,6XHDP66AXEPMVKVROHHDN6CPZYHZICEX,457,www2,1996-12-23 03:18:39,NLA
4,"au,gov,nla)/",19970212053405,http://www.nla.gov.au:80/,text/html,200,TM4WSQIGWXAXMB36G4GVOY7MVPTO6CSE,1141,www,1997-02-12 05:34:05,NLA
...,...,...,...,...,...,...,...,...,...,...
2719,"au,gov,nla)/",20200308213139,http://nla.gov.au/,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,429,,2020-03-08 21:31:39,NLA
2720,"au,gov,nla)/",20200309051512,http://www.nla.gov.au/,warc/revisit,-,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,456,www,2020-03-09 05:15:12,NLA
2721,"au,gov,nla)/",20200309051514,https://www.nla.gov.au/,text/html,200,XEIGABHTAXGYLNFIJC7GZJRBVXOPGZR5,15063,www,2020-03-09 05:15:14,NLA
2722,"au,gov,nla)/",20200309095958,http://www.nla.gov.au/,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,428,www,2020-03-09 09:59:58,NLA


In [70]:
data = query_cdx('natlib.govt.nz')
df_nlnz = pd.DataFrame(data[1:], columns=data[0])

# Convert the timestamp string into a datetime object
df_nlnz['date'] = pd.to_datetime(df_nlnz['timestamp'])

# Convert the length from a string into an integer
df_nlnz['length'] = df_nlnz['length'].astype('int')

df_nlnz['subdomain'] = df_nlnz['original'].str.extract(r'https*:\/\/(\w*).*natlib')

df_nlnz['org'] = 'NLNZ'

df_nlnz.head()

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length,date,subdomain,org
0,"nz,govt,natlib)/",19970715162656,http://www.natlib.govt.nz:80/,text/html,200,URN557D5JSDC462K2K2ITOICIIZIESEV,986,1997-07-15 16:26:56,www,NLNZ
1,"nz,govt,natlib)/",19970715163225,http://www.natlib.govt.nz:80/,text/html,200,URN557D5JSDC462K2K2ITOICIIZIESEV,986,1997-07-15 16:32:25,www,NLNZ
2,"nz,govt,natlib)/",19971022151931,http://www.natlib.govt.nz:80/,text/html,200,4ODQTB6HPH6V5ROKTL7CTDRI3VMMII32,1005,1997-10-22 15:19:31,www,NLNZ
3,"nz,govt,natlib)/",19971210141320,http://www.natlib.govt.nz:80/,text/html,200,4ODQTB6HPH6V5ROKTL7CTDRI3VMMII32,1004,1997-12-10 14:13:20,www,NLNZ
4,"nz,govt,natlib)/",19980130054835,http://www.natlib.govt.nz:80/,text/html,200,4ODQTB6HPH6V5ROKTL7CTDRI3VMMII32,1004,1998-01-30 05:48:35,www,NLNZ


In [73]:
alt.Chart(df_nlnz).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode',
    row='subdomain'
).properties(width=700, height=200)

In [74]:
df_nlnz['length'].max()

1358641

In [91]:
alt.Chart(df_nlnz.loc[(df_nlnz['subdomain'] == 'www') | (df_nlnz['subdomain'] == '')]).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode'
).properties(width=700, height=200)

In [102]:
data = query_cdx('bl.uk')
df_bl = pd.DataFrame(data[1:], columns=data[0])

# Convert the timestamp string into a datetime object
df_bl['date'] = pd.to_datetime(df_bl['timestamp'])

# Convert the length from a string into an integer
df_bl['length'] = df_bl['length'].astype('int')

df_bl['subdomain'] = df_bl['original'].str.extract(r'https*:\/\/(\w*).*bl')

df_bl['port'] = df_bl['original'].str.extract(r'bl\.uk:(\d*)')

df_bl['org'] = 'BL'

df_bl.head()

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length,date,subdomain,port,org
0,"uk,bl)/",19970218190613,http://www.bl.uk:80/,text/html,200,Z42UMUL76GODKO3EMNSLXDTCST66VDAX,1208,1997-02-18 19:06:13,www,80,BL
1,"uk,bl)/",19970613221424,http://www.bl.uk:80/,text/html,200,Z42UMUL76GODKO3EMNSLXDTCST66VDAX,1214,1997-06-13 22:14:24,www,80,BL
2,"uk,bl)/",19970625013236,http://www.bl.uk:80/,text/html,200,Z42UMUL76GODKO3EMNSLXDTCST66VDAX,1212,1997-06-25 01:32:36,www,80,BL
3,"uk,bl)/",19971210090519,http://www.bl.uk:80/,text/html,200,VTGDRCNEGYZFPNF6Y5Z66YBEQSMOD6CF,1800,1997-12-10 09:05:19,www,80,BL
4,"uk,bl)/",19980128082238,http://www.bl.uk:80/,text/html,200,VTGDRCNEGYZFPNF6Y5Z66YBEQSMOD6CF,1802,1998-01-28 08:22:38,www,80,BL


In [103]:
df_bl.loc[df_bl['port'].isnull()]

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length,date,subdomain,port,org
89,"uk,bl)/",20040104234949,http://www.bl.uk/,text/html,200,EDUGXIGFMA4EABQQ7LF66WZ4USUK5O6R,4421,2004-01-04 23:49:49,www,,BL
91,"uk,bl)/",20040117003018,http://www.bl.uk/,text/html,200,Y3JGAZZVJLR7RPNVN4FXDEHBIRE3UPG2,4419,2004-01-17 00:30:18,www,,BL
155,"uk,bl)/",20040722102216,http://www.bl.uk/,text/html,200,XMYB6HAHJILWQMKSC3FKSODR6KLSNNCM,4546,2004-07-22 10:22:16,www,,BL
265,"uk,bl)/",20041110051001,http://www.bl.uk/,text/html,200,TOTLB5QWTDQ42QJBH42JFR6WN7SRY6QT,4546,2004-11-10 05:10:01,www,,BL
266,"uk,bl)/",20041110113540,http://www.bl.uk/,text/html,200,TOTLB5QWTDQ42QJBH42JFR6WN7SRY6QT,4548,2004-11-10 11:35:40,www,,BL
...,...,...,...,...,...,...,...,...,...,...,...
13155,"uk,bl)/",20200308155104,https://www.bl.uk/,text/html,429,6MG25PFYD6KK6XTN7WPHAWC3RECNOKWB,407,2020-03-08 15:51:04,www,,BL
13156,"uk,bl)/",20200309084629,http://www.bl.uk/,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,497,2020-03-09 08:46:29,www,,BL
13157,"uk,bl)/",20200309084629,http://www.bl.uk/,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,497,2020-03-09 08:46:29,www,,BL
13158,"uk,bl)/",20200309084631,https://www.bl.uk/,text/html,200,7JTAZP6IIXMIAJRTQJI6WMLAXKDEPNPW,15488,2020-03-09 08:46:31,www,,BL


In [104]:
df_bl.shape

(13160, 11)

In [105]:
alt.Chart(df_bl).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode',
    row='subdomain'
).properties(width=700, height=200)

In [106]:
df_bl.loc[df_bl['statuscode'] == '-']

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length,date,subdomain,port,org
1751,"uk,bl)/",20090416083059,http://www.bl.uk/,warc/revisit,-,EISBVSNARKQBMQPTFRXACFR2PGEMNYOM,400,2009-04-16 08:30:59,www,,BL
1752,"uk,bl)/",20090416083059,http://www.bl.uk/,warc/revisit,-,EISBVSNARKQBMQPTFRXACFR2PGEMNYOM,400,2009-04-16 08:30:59,www,,BL
1771,"uk,bl)/",20090525040929,http://www.bl.uk/,warc/revisit,-,7EV6KKJLOWZNJKUKEODDV3CFUN7LHCOS,399,2009-05-25 04:09:29,www,,BL
1772,"uk,bl)/",20090525040929,http://www.bl.uk/,warc/revisit,-,7EV6KKJLOWZNJKUKEODDV3CFUN7LHCOS,399,2009-05-25 04:09:29,www,,BL
1976,"uk,bl)/",20110201074514,http://www.bl.uk/,warc/revisit,-,IAKFV2NB3HEXPAHHF3THDK23N6HGNP6G,426,2011-02-01 07:45:14,www,,BL
...,...,...,...,...,...,...,...,...,...,...,...
13118,"uk,bl)/",20200214211515,https://www.bl.uk/,warc/revisit,-,ULJZMM3SEGEPW2HQ4K4L2TTN2VNRX5P6,622,2020-02-14 21:15:15,www,,BL
13130,"uk,bl)/",20200224154801,http://www.bl.uk/,warc/revisit,-,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,520,2020-02-24 15:48:01,www,,BL
13132,"uk,bl)/",20200227114349,http://www.bl.uk/,warc/revisit,-,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,588,2020-02-27 11:43:49,www,,BL
13148,"uk,bl)/",20200307094712,http://www.bl.uk/,warc/revisit,-,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,489,2020-03-07 09:47:12,www,,BL


In [107]:
df_bl['statuscode'].value_counts()

200    9202
-      2112
301    1254
302     446
429     146
Name: statuscode, dtype: int64

In [108]:
alt.Chart(df_bl).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='subdomain',
).properties(width=700, height=200)

In [111]:
alt.Chart(df_bl).mark_point().encode(
    x='date:T',
    y='length:Q',
    color='statuscode',
).properties(width=700, height=300)

In [112]:
response = requests.get('http://web.archive.org/cdx/search/cdx?url=nla.gov.au/about&limit=10')

In [113]:
print(response.text)

au,gov,nla)/about 20121222002828 http://www.nla.gov.au:80/about text/html 200 GX2X2CG6V2G26DVG7AB7G3QUQZOPZV74 5730
au,gov,nla)/about 20130114022145 http://www.nla.gov.au/about text/html 200 45JKZHNK5GNTIYBWLG355G42SMFZBLW5 5884
au,gov,nla)/about 20130116094902 http://www.nla.gov.au/about text/html 200 UL7BGFFKDIURDBV4H4I3S4GYDF36IBH2 5849
au,gov,nla)/about 20130117164125 http://www.nla.gov.au:80/about text/html 200 2E6AYXGI24TBVL6NDKYCHZ3UACRUHTMA 5705
au,gov,nla)/about 20130221103132 http://www.nla.gov.au:80/about text/html 200 FSGYYVDXG7A7HC4IDA3PYUWVQGW4P34R 5459
au,gov,nla)/about 20130224090501 http://www.nla.gov.au:80/about text/html 200 MF7ZZPEAFXMJRENL2ONML6OJ2TMDOB4X 5444
au,gov,nla)/about 20130404045509 http://www.nla.gov.au:80/about text/html 200 2OQ3DKAZWARRW2UK2SDKX4N4SISRN3NB 5608
au,gov,nla)/about 20130419124433 http://www.nla.gov.au/about text/html 200 HUVGGGW3ZU32DLVJXRNWKFPLPU34JPUX 5747
au,gov,nla)/about 20130501000821 http://www.nla.gov.au/about text/html 200 NO5PPP

In [114]:
df_bl['subdomain'].value_counts()

www          12204
               946
mailto           4
paul             2
Angela           2
visitor          1
webeditor        1
Name: subdomain, dtype: int64