# Get a full page screenshot from an archived web page

In [82]:
from selenium import webdriver
import requests
import selenium
from PIL import Image
import PIL
import io
import base64
import time
import re
import arrow
from slugify import slugify
from webdriverdownloader import GeckoDriverDownloader
from pathlib import Path
from IPython.display import display, HTML
from urllib.parse import urlparse
import ipywidgets as widgets

# Fetch and install geckodriver (the driver Selenium needs to control Firefox), pinned to v0.26.0.
# The second item of the returned value is kept as the driver path; it is handed to
# webdriver.Firefox(executable_path=...) when a screenshot is taken.
gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install("v0.26.0")[1]

In [90]:
# Base url of each repository's Memento Timegate, keyed by the codes used as values in the 'Archive:' dropdown.
# The target url is appended directly to these, so each one ends with a slash.
TIMEGATES = {
    'nla': 'https://web.archive.org.au/awa/',
    'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',
    'bl': 'https://www.webarchive.org.uk/wayback/archive/',
    'ia': 'https://web.archive.org/web/'
}

# Domains (with any 'www.' removed) that need special handling when a screenshot is captured.
# pywb: UK and AU archives replay captures inside a frame, so the driver has to switch into it.
pywb = ['webarchive.org.uk', 'web.archive.org.au']
# wayback: NZ and IA archives inject content into the page, so the 'if_' form of the url is requested instead.
wayback = ['ndhadeliver.natlib.govt.nz', 'web.archive.org']

def format_date_for_headers(iso_date, tz):
    '''
    Build the value of an Accept-Datetime header from a date and a timezone name.

    The ISO date (YYYY-MM-DD) is read as noon in the timezone 'tz', shifted to UTC,
    and rendered in the form the header needs, eg:
    Fri, 23 Mar 2007 01:00:00 GMT
    '''
    noon_local = arrow.get(f'{iso_date} 12:00:00 {tz}', 'YYYY-MM-DD HH:mm:ss ZZZ')
    stamp = noon_local.to('utc').format("ddd, DD MMM YYYY HH:mm:ss")
    return f'{stamp} GMT'

def parse_links_from_headers(headers):
    '''
    Extract original, timegate, timemap, and memento links from 'Link' header.

    Parameters:
        headers -- a mapping of response headers (eg the 'headers' of a requests response).

    Returns a dict that maps each recognised rel value ('original', 'timegate', 'timemap',
    'memento', 'first memento', 'prev memento', 'next memento', 'last memento') to its url.
    If a rel value occurs more than once the last url wins.
    Returns an empty dict if there is no 'Link' header at all.
    '''
    # A response without a Link header simply has no links to report.
    link_header = headers.get('Link', '')
    # [^>]* keeps each url inside its own angle brackets. A lazy '.*?' can run past the closing '>'
    # of an entry whose rel isn't listed here and glue that entry onto the url of the next match.
    links = re.findall(
        r'<([^>]*)>; rel="(original|timegate|timemap|memento|first memento|prev memento|next memento|last memento)"',
        link_header
    )
    return {url_type: url for url, url_type in links}

def query_timegate(timegate, url, date=None, tz='Australia/Canberra', timeout=60):
    '''
    Send a request for 'url' to a repository's Timegate and return the links found in the response headers.

    Parameters:
        timegate -- a key of TIMEGATES ('nla', 'nlnz', 'bl', or 'ia').
        url -- the url of the page to look up.
        date -- optional ISO date (YYYY-MM-DD); if supplied it is sent as an Accept-Datetime header.
        tz -- timezone name used to interpret 'date'.
        timeout -- seconds to wait for the repository before giving up, so that an
                   unresponsive archive can't hang the notebook indefinitely.

    Returns the dict of links produced by parse_links_from_headers().
    '''
    headers = {}
    if date:
        headers['Accept-Datetime'] = format_date_for_headers(date, tz)
    # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt!
    tg_url = f'{TIMEGATES[timegate]}{url}/'
    response = requests.get(tg_url, headers=headers, timeout=timeout)
    return parse_links_from_headers(response.headers)

def get_memento():
    '''
    Query the selected repository's Timegate using the current widget values
    (archive, target url, target date) and return the url of the 'memento' link.
    '''
    memento_links = query_timegate(
        repository.value,
        target_url.value,
        target_date.value
    )
    return memento_links['memento']
    
def get_full_page_screenshot(url, save_width=200):
    '''
    Gets a full page screenshot of the supplied url.
    By default resizes the screenshot to a maximum width of 200px.
    Provide a 'save_width' value to change this.

    The url has to contain a 14 digit timestamp followed by the original http(s) url
    (with at least one '/' after the '://'), as these are used to name the file.
    The resized image is saved as 'screenshots/{slugified site}-{timestamp}-{save_width}.png'
    (the directory is created if necessary) and then displayed in the 'out' widget.

    NOTE the webdriver sometimes fails for unknown reasons. Just try again.
    The browser is always shut down, even after a failure, so retries don't leave
    stray Firefox processes behind.
    '''
    domain = urlparse(url)[1].replace('www.', '')
    # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls)
    if domain in wayback and 'if_' not in url:
        url = re.sub(r'/(\d{14})/http', r'/\1if_/http', url)
    date_str, site = re.search(r'/(\d{14})(?:if_|mp_)*/https*://(.+/)', url).groups()
    ss_file = Path('screenshots', f'{slugify(site)}-{date_str}-{save_width}.png')
    options = webdriver.FirefoxOptions()
    options.headless = True
    driver = webdriver.Firefox(executable_path=geckodriver, options=options)
    try:
        driver.get(url)
        # Give some time for everything to load
        time.sleep(5)
        driver.maximize_window()
        current_width = driver.get_window_size()['width']
        # UK and AU use pywb in framed replay mode, so we need to switch to the framed content
        if domain in pywb:
            driver.switch_to.frame(0)
        try:
            ss = driver.find_element_by_tag_name('body').screenshot_as_base64
        except selenium.common.exceptions.NoSuchElementException:
            # Pages without a body are captured via their frameset element instead
            ss = driver.find_element_by_tag_name('frameset').screenshot_as_base64
    finally:
        # Close the browser whatever happened above
        driver.quit()
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    # Scale by the ratio of the requested width to the browser window width
    ratio = save_width / current_width
    # Named so they don't shadow the module-level 'width' slider widget
    new_width, new_height = round(img.width * ratio), round(img.height * ratio)
    resized_img = img.resize((new_width, new_height), PIL.Image.LANCZOS)
    # Make sure the output directory exists before saving
    ss_file.parent.mkdir(parents=True, exist_ok=True)
    resized_img.save(ss_file)
    out.clear_output()
    with out:
        display(HTML(f'<img src="{ss_file}">'))
        
def start(e):
    '''
    Click handler for the 'Get screenshot' button.
    Clears the output area, shows a progress message, then finds the memento
    for the current widget values and captures it at the width set on the slider.
    The argument 'e' is supplied by the click event and is not used.
    '''
    out.clear_output()
    with out:
        print('Generating screenshot...')
    get_full_page_screenshot(get_memento(), save_width=width.value)

# Choice of web archive; the values are the keys of TIMEGATES.
repository = widgets.Dropdown(
    options=[('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],
    value='bl',
    description='Archive:',
    disabled=False,
)

# Url of the page to look for in the selected archive.
target_url = widgets.Text(description='Target URL:')

# Optional date to send to the Timegate; if left empty no Accept-Datetime header is sent.
target_date = widgets.DatePicker(
    description='Target date: ',
    disabled=False
)

# Width in pixels of the saved screenshot.
# The initial value is set to the minimum explicitly, so that it lies within the slider's own range.
width = widgets.IntSlider(
    value=200,
    min=200,
    max=1000,
    step=100,
    description='Width:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

# Progress messages and the finished screenshot are written to this output area.
out = widgets.Output()
ss_button = widgets.Button(description='Get screenshot')
ss_button.on_click(start)
display(widgets.HBox([widgets.VBox([repository, target_date]), widgets.VBox([target_url, width])]), ss_button, out)

HBox(children=(VBox(children=(Dropdown(description='Archive:', options=(('UK Web Archive', 'bl'), ('National L…

Button(description='Get screenshot', style=ButtonStyle())

Output()