# Using screenshots to visualise change in a page over time

In [34]:
import requests
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import PIL
from PIL import Image, ImageDraw, ImageFont
import io
import base64
import time
import re
from slugify import slugify
from webdriverdownloader import GeckoDriverDownloader
from pathlib import Path

# Fetch the pinned geckodriver release (v0.26.0). The second item of the
# returned tuple is kept and later passed to webdriver.Firefox as its
# executable_path -- presumably the path to the installed binary.
gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install("v0.26.0")[1]

# See https://github.com/ouseful-template-repos/binder-selenium-demoscraper
# Note that this currently isn't working on Binder -- perhaps related to https://github.com/mozilla/geckodriver/issues/1559

In [35]:
def query_cdx(url, **kwargs):
    '''
    Ask the Internet Archive CDX API about captures of the supplied url.

    Any extra keyword arguments are forwarded as query parameters.
    The 'url' and 'output' parameters are always set by this function,
    so results come back as JSON and are returned already decoded.
    Raises an HTTPError (via raise_for_status) on an error response.
    '''
    # Later keys win, so 'output' is forced to 'json' even if supplied by the caller
    query = {**kwargs, 'url': url, 'output': 'json'}
    # A User-Agent header (even an empty one) is needed or IA returns an error
    headers = {'User-Agent': ''}
    response = requests.get(
        'http://web.archive.org/cdx/search/cdx',
        params=query,
        headers=headers
    )
    response.raise_for_status()
    return response.json()

def get_full_page_screenshot(url, save_width=200):
    '''
    Save a resized full page screenshot of an archived page.

    The url must be a Wayback Machine 'if_' url, such as
    https://web.archive.org/web/19961017233008if_/http://abc.net.au/
    The capture timestamp and site are extracted from it to build the
    output path: screenshots/<site-slug>/<site-slug>-<timestamp>.png
    If that file already exists nothing is done.

    The screenshot is scaled by save_width / browser window width
    (save_width defaults to 200), so the saved image is save_width pixels
    wide when the captured element is as wide as the browser window.

    Raises ValueError if the url is not in the expected format.
    '''
    print(url)
    match = re.search(r'\/web\/(\d+)if_\/https?:\/\/(.+\/)', url)
    if match is None:
        raise ValueError(f'Not a Wayback "if_" url with a trailing slash: {url}')
    date_str, site = match.groups()
    ss_file = Path('screenshots', slugify(site), f'{slugify(site)}-{date_str}.png')
    if ss_file.exists():
        # Already captured -- avoid starting a browser again
        return
    options = webdriver.FirefoxOptions()
    options.headless = True
    driver = webdriver.Firefox(executable_path=geckodriver, options=options)
    try:
        driver.get(url)
        # Give some time for everything to load
        time.sleep(5)
        # maximize_window() is used instead of setting the window size from
        # the document's scroll height (possibly only works in Geckodriver)
        driver.maximize_window()
        current_width = driver.get_window_size()['width']
        try:
            ss = driver.find_element_by_tag_name('body').screenshot_as_base64
        except NoSuchElementException:
            # Old framed pages have a <frameset> in place of a <body>
            ss = driver.find_element_by_tag_name('frameset').screenshot_as_base64
    finally:
        # Always shut the browser down, even if loading or capturing failed
        driver.quit()
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    ratio = save_width / current_width
    (width, height) = (round(img.width * ratio), round(img.height * ratio))
    resized_img = img.resize((width, height), PIL.Image.LANCZOS)
    # Make sure the output folder exists when this function is called directly
    ss_file.parent.mkdir(parents=True, exist_ok=True)
    resized_img.save(ss_file)
        
def get_screenshots(domain, num=1):
    '''
    Generate up to the specified number of screenshots for each year.

    Queries the CDX API for captures of the domain, keeps those with a
    status code of 200, drops captures whose digest has already been seen,
    and then screenshots the earliest 'num' remaining captures of each year.
    Screenshots are saved under screenshots/<domain-slug>/.
    If the API returns no captures a message is printed and nothing is saved.
    '''
    # 'num' controls how many captures per year are kept below;
    # it is not a CDX API parameter so it is not sent with the query
    data = query_cdx(domain)
    # The first row holds the column names, so fewer than two rows means no captures
    if len(data) < 2:
        print(f'No captures found for {domain}')
        return
    df = pd.DataFrame(data[1:], columns=data[0])
    # Convert the 14 digit timestamp string into a datetime object
    df['date'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H%M%S')
    df_200 = (
        # Filter to those with a status code of 200
        df.loc[df['statuscode'] == '200']
        # Sort by date
        .sort_values(by=['date'])
        # Only keep the first instance of each digest
        .drop_duplicates(subset=['digest'])
        .copy()
    )
    # Extract year from date
    df_200['year'] = df_200['date'].dt.year
    # Get the first 'num' instances from each year (ask for more than one so you can select the best)
    df_years = df_200.groupby('year', as_index=False).head(num)
    timestamps = df_years['timestamp'].to_list()
    Path('screenshots', slugify(domain)).mkdir(parents=True, exist_ok=True)
    # if_ gives you pages without the IA nav, but with the CSS links etc rewritten!
    for timestamp in timestamps:
        url = f'https://web.archive.org/web/{timestamp}if_/http://{domain}/'
        get_full_page_screenshot(url)
        
def make_composite(domain, thumb_width=200, gap=10,
                   font_path="/Library/Fonts/Microsoft/Gill Sans MT Bold.ttf"):
    '''
    Combine the saved screenshots of a domain into a single image.

    Reads every PNG in screenshots/<domain-slug>/ in filename order and
    pastes them side by side on a grey background, with the capture year
    (taken from the filename) written above each one.
    The result is saved as screenshots/<domain-slug>.png

    thumb_width is the column width allowed for each screenshot and gap is
    the space between columns; the defaults match the 200px default
    save_width of get_full_page_screenshot.
    font_path is the TrueType font used for the year labels. If it can't
    be loaded, Pillow's built-in default font is used instead.

    Raises ValueError if there are no screenshots for the domain.
    '''
    pngs = sorted(Path('screenshots', slugify(domain)).glob('*.png'))
    if not pngs:
        # Without this the canvas width below would be negative
        raise ValueError(f'No screenshots found for {domain}')
    header_height = 50
    step = thumb_width + gap
    max_height = 0
    for png in pngs:
        # Use a context manager so the file handle is released
        with Image.open(png) as img:
            max_height = max(max_height, img.height)
    comp_size = (
        (len(pngs) * thumb_width) + ((len(pngs) - 1) * gap),
        max_height + header_height
    )
    comp = Image.new('RGB', comp_size, (90, 90, 90))
    draw = ImageDraw.Draw(comp)
    try:
        font = ImageFont.truetype(font_path, 36)
    except OSError:
        # The font file isn't available on this system
        font = ImageFont.load_default()
    for i, png in enumerate(pngs):
        # Filenames look like <site-slug>-<14 digit timestamp>.png
        match = re.search(r'-(\d{4})\d+\.png', png.name)
        if match:
            draw.text((i * step, 10), match.group(1), (255, 255, 255), font=font)
        with Image.open(png) as img:
            comp.paste(img, (i * step, header_height))
    comp.save(Path('screenshots', f'{slugify(domain)}.png'))

In [36]:
# Screenshot archived captures of abc.net.au; each capture url is printed as it is processed
get_screenshots('abc.net.au')

https://web.archive.org/web/19961017233008if_/http://abc.net.au/
https://web.archive.org/web/19970103063844if_/http://abc.net.au/
https://web.archive.org/web/19970124174551if_/http://abc.net.au/
https://web.archive.org/web/19980131014243if_/http://abc.net.au/
https://web.archive.org/web/19980423133541if_/http://abc.net.au/
https://web.archive.org/web/19990116225807if_/http://abc.net.au/
https://web.archive.org/web/19990117055358if_/http://abc.net.au/
https://web.archive.org/web/20000229082857if_/http://abc.net.au/
https://web.archive.org/web/20000619190108if_/http://abc.net.au/
https://web.archive.org/web/20010305163334if_/http://abc.net.au/
https://web.archive.org/web/20010307095205if_/http://abc.net.au/
https://web.archive.org/web/20020122073806if_/http://abc.net.au/
https://web.archive.org/web/20020124154256if_/http://abc.net.au/
https://web.archive.org/web/20030129150102if_/http://abc.net.au/
https://web.archive.org/web/20030208120431if_/http://abc.net.au/
https://web.archive.org/w

In [37]:
# Combine the saved abc.net.au screenshots into a single composite image
make_composite('abc.net.au')