# Using screenshots to visualise change in a page over time

![Screenshots showing changes to the ABC Australia home page over time](images/abc-net-au.png)

<p class="alert alert-warning">Work in progress – this notebook isn't finished yet. Check back later for more...</p>

In [1]:
import requests
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import PIL
from PIL import Image, ImageDraw, ImageFont
import io
import base64
import time
import re
from slugify import slugify
from webdriverdownloader import GeckoDriverDownloader
from pathlib import Path

# Download and install geckodriver, pinned to v0.26.0, so Selenium can drive Firefox.
gdd = GeckoDriverDownloader()
# Only element 1 of the returned value is kept; it is later passed to webdriver.Firefox
# as executable_path. (Presumably this is the path to the installed driver --
# TODO confirm against the webdriverdownloader documentation.)
geckodriver = gdd.download_and_install("v0.26.0")[1]

# See https://github.com/ouseful-template-repos/binder-selenium-demoscraper

In [4]:
def query_cdx(url, **kwargs):
    '''
    Query the IA CDX API for the supplied url.

    By default results are returned as JSON and limited to captures with
    a status code of 200 and a mimetype of text/html.
    You can optionally provide any of the parameters accepted by the API
    as keyword arguments -- they're added to the query, and override the
    defaults if they use the same name (eg: filter='statuscode:404').

    Returns the decoded JSON response.
    Raises requests.HTTPError if the API responds with an error status.
    '''
    params = {
        'url': url,
        'output': 'json',
        'filter': ['statuscode:200', 'mimetype:text/html']
    }
    # Previously the keyword arguments were accepted but silently ignored
    params.update(kwargs)
    # User-Agent value is necessary or else IA gives an error
    response = requests.get('http://web.archive.org/cdx/search/cdx', params=params, headers={'User-Agent': ''})
    response.raise_for_status()
    return response.json()

def get_full_page_screenshot(url, save_width=200):
    '''
    Saves a full page screenshot of an archived page as a resized PNG.

    The url needs to be a Wayback Machine 'if_' url that ends with a slash, ie:
    https://web.archive.org/web/[timestamp]if_/http://[site]/
    The image is saved as screenshots/[slugified site]/[slugified site]-[timestamp].png
    If that file already exists nothing is captured, so an interrupted run can be resumed.

    The screenshot is scaled by save_width / browser window width, so a page that
    fills the window is saved about 200px wide by default.
    Provide a 'save_width' value to change this.

    Raises a ValueError if the url isn't in the expected format.
    '''
    print(url)
    match = re.search(r'\/web\/(\d+)if_\/https?:\/\/(.+\/)', url)
    if match is None:
        raise ValueError(f'Expected a Wayback "if_" url ending with a slash, got: {url}')
    date_str, site = match.groups()
    ss_file = Path('screenshots', slugify(site), f'{slugify(site)}-{date_str}.png')
    if ss_file.exists():
        # Already captured, nothing to do
        return
    # Make sure the output directory exists so this function also works when called on its own
    ss_file.parent.mkdir(parents=True, exist_ok=True)
    options = webdriver.FirefoxOptions()
    options.headless = True
    driver = webdriver.Firefox(executable_path=geckodriver, options=options)
    try:
        driver.get(url)
        # Give some time for everything to load
        time.sleep(5)
        # Can just use maximize_window() instead of setting the window size manually (only in Geckodriver?)
        driver.maximize_window()
        current_width = driver.get_window_size()['width']
        try:
            ss = driver.find_element_by_tag_name('body').screenshot_as_base64
        except NoSuchElementException:
            # Pages built with frames have a frameset instead of a body
            ss = driver.find_element_by_tag_name('frameset').screenshot_as_base64
    finally:
        # Always shut the browser down, even if loading or capturing the page fails,
        # otherwise headless Firefox processes are left running
        driver.quit()
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    ratio = save_width / current_width
    # Never let rounding produce a zero-sized image
    width = max(1, round(img.width * ratio))
    height = max(1, round(img.height * ratio))
    resized_img = img.resize((width, height), PIL.Image.LANCZOS)
    resized_img.save(ss_file)
        
def get_screenshots(domain, num=1):
    '''
    Generate up to the specified number of screenshots for each year.
    Queries CDX API for snapshots of the given url,
    then gets the first 'num' timestamps for each year.

    Captures with a digest that has already been seen are ignored,
    so only the first capture of each version of the page is considered.
    If the CDX API returns no captures, a message is printed and nothing is saved.
    '''
    # This only gets the first pages of results -- change to page through, or could I collapse the CDX results?
    data = query_cdx(domain)
    # The first row of the results holds the column names, so there must be at least two rows
    if len(data) < 2:
        print(f'No captures found for {domain}')
        return
    df = pd.DataFrame(data[1:], columns=data[0])
    # Convert the 14 digit timestamp string into a datetime object
    df['date'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H%M%S')
    # Sort by date, then only keep the first instance of each digest
    df = df.sort_values(by='date').drop_duplicates(subset=['digest'])
    # Extract year from date
    df['year'] = df['date'].dt.year
    # Get the first 'num' instances from each year
    # (you only need one, but if there are failures, you might want a backup)
    timestamps = df.groupby('year').head(num)['timestamp'].to_list()
    Path('screenshots', slugify(domain)).mkdir(parents=True, exist_ok=True)
    # if_ gives you pages without the IA nav, but with the CSS links etc rewritten!
    for timestamp in timestamps:
        url = f'https://web.archive.org/web/{timestamp}if_/http://{domain}/'
        get_full_page_screenshot(url)
        
def make_composite(domain, thumb_width=200, gap=10, font_path="/Library/Fonts/Microsoft/Gill Sans MT Bold.ttf", font_size=36):
    '''
    Combine single screenshots into a composite image.
    Loops through images in a directory with the given (slugified) domain,
    pasting them side by side in filename order with the year written above each one.
    The result is saved as screenshots/[slugified domain].png

    Parameters:
        domain -- the domain used when the screenshots were created
        thumb_width -- width in pixels of the column allocated to each screenshot
        gap -- space in pixels between columns
        font_path -- TrueType font used for the year labels, change this to suit your system
                     (if it can't be loaded, Pillow's built-in default font is used instead)
        font_size -- size of the year labels

    Raises a ValueError if there are no screenshots for the domain.
    '''
    # Space reserved above the screenshots for the year labels
    header_height = 50
    pngs = sorted(Path('screenshots', slugify(domain)).glob('*.png'))
    if not pngs:
        raise ValueError(f'No screenshots found for {domain} -- run get_screenshots() first')
    # The canvas needs to be as tall as the tallest screenshot
    heights = []
    for png in pngs:
        # Use a context manager so the file handle is closed straight away
        with Image.open(png) as img:
            heights.append(img.height)
    max_height = max(heights)
    column = thumb_width + gap
    # No gap is needed after the last column
    comp = Image.new('RGB', (len(pngs) * column - gap, max_height + header_height), (90, 90, 90))
    # Canvas to write in the dates
    draw = ImageDraw.Draw(comp)
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        # The font isn't available on this system, so fall back to the built-in one
        font = ImageFont.load_default()
    for i, png in enumerate(pngs):
        # Filenames end with -[14 digit timestamp].png, the first 4 digits are the year
        match = re.search(r'-(\d{4})\d+\.png$', png.name)
        if match:
            draw.text((i * column, 10), match.group(1), (255, 255, 255), font=font)
        with Image.open(png) as img:
            comp.paste(img, (i * column, header_height))
    comp.save(Path('screenshots', f'{slugify(domain)}.png'))

In [6]:
# Capture the ABC home page with the default of one screenshot per year.
# Each new capture opens a headless browser and waits 5 seconds for the page to load;
# screenshots that already exist on disk are skipped.
get_screenshots('abc.net.au')

https://web.archive.org/web/19961017233008if_/http://abc.net.au/
https://web.archive.org/web/19970103063844if_/http://abc.net.au/
https://web.archive.org/web/19980131014243if_/http://abc.net.au/
https://web.archive.org/web/19990116225807if_/http://abc.net.au/
https://web.archive.org/web/20000229082857if_/http://abc.net.au/
https://web.archive.org/web/20010305163334if_/http://abc.net.au/
https://web.archive.org/web/20020122073806if_/http://abc.net.au/
https://web.archive.org/web/20030129150102if_/http://abc.net.au/
https://web.archive.org/web/20040101013727if_/http://abc.net.au/
https://web.archive.org/web/20050101010752if_/http://abc.net.au/
https://web.archive.org/web/20060101013429if_/http://abc.net.au/
https://web.archive.org/web/20070105191459if_/http://abc.net.au/
https://web.archive.org/web/20080101014408if_/http://abc.net.au/
https://web.archive.org/web/20090103033228if_/http://abc.net.au/
https://web.archive.org/web/20100107000734if_/http://abc.net.au/
https://web.archive.org/w

In [37]:
# Combine the saved screenshots into a single image, saved in the screenshots directory
make_composite('abc.net.au')

----

Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/)