# Comparing web pages from the Wayback Machine

<p class="alert alert-warning">Work in progress – this notebook isn't finished yet. Check back later for more...<p>

In [136]:
from difflib import HtmlDiff
import requests
from IPython.display import display, HTML
import re
import arrow
from readability import Document
from htmldiff import render_html_diff
from bs4 import BeautifulSoup, Tag
import ipywidgets as widgets
from selenium import webdriver
from PIL import Image
import PIL
import io
import base64
import time
from slugify import slugify
from webdriverdownloader import GeckoDriverDownloader
from pathlib import Path

gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install("v0.26.0")[1]

# Add styles for the diff
HTML('<style>.diff_add {background-color: #d0e9c6;}.diff_sub {background-color: #ebcccc;} table.diff, table.diff thead {border: 1px solid black;} table.diff {table-layout: fixed; width: 100%;} th.diff_next, td.diff_next {width: 4%;} table.diff th.diff_header {text-align: left;} td {word-wrap: break-word;}</style>')

In [67]:
HTML('<style>.diff ins {background-color: #d0e9c6;}.diff del {background-color: #ebcccc;}</style>')

In [149]:
urls = [
    'https://web.archive.org/web/2016id_/http://www.nla.gov.au/collection-development-policy/intro',
    'https://web.archive.org/web/2019id_/http://www.nla.gov.au/collection-development-policy/intro'
]

In [161]:
def get_html(url):
    response = requests.get(url, headers={'User-Agent': ''})
    timestamp = re.search(r'\/web\/(\d{14})id_', response.url).group(1)
    return {'url': response.url, 'html': response.text}

def format_date(url):
    timestamp = re.search(r'\/web\/(\d{14})id_', url).group(1)
    return arrow.get(timestamp, 'YYYYMMDDHHmmss').format('d MMMM YYYY')

html_data = []
for url in urls:
    html_data.append(get_html(url))

## Screenshots

In [165]:
def get_full_page_screenshot(url, save_width=200):
    '''
    Gets a full page screenshot of the supplied url.
    By default resizes the screenshot to a maximum width of 200px.
    Provide a 'save_width' value to change this.
    '''
    # print(url)
    date_str, site = re.search(r'\/web\/(\d+)if_\/https*:\/\/(.+\/)', url).groups()
    ss_file = Path('screenshots', f'{slugify(site)}-{date_str}-{save_width}.png')
    if not ss_file.exists():
        options = webdriver.FirefoxOptions()
        options.headless = True
        driver = webdriver.Firefox(executable_path=geckodriver, options=options)
        driver.get(url)
        # Give some time for everything to load
        time.sleep(10)
        # Can just use maximize_window() instead of the below (only in Geckodriver?)
        # S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
        # driver.set_window_size(capture_width, S('Height') + 50) # May need manual adjustment
        driver.maximize_window()
        current_width = driver.get_window_size()['width']
        try:
            ss = driver.find_element_by_tag_name('body').screenshot_as_base64
        except NoSuchElementException:
            ss = driver.find_element_by_tag_name('frameset').screenshot_as_base64
        driver.quit()
        img = Image.open(io.BytesIO(base64.b64decode(ss)))
        ratio = save_width / current_width
        (width, height) = (round(img.width * ratio), round(img.height * ratio))
        resized_img = img.resize((width, height), PIL.Image.LANCZOS)
        resized_img.save(ss_file)
    return ss_file

In [168]:
def compare_screenshots():
    ss_out = widgets.Output()
    display(ss_out)
    html = ''
    for page in html_data:
        ss_file = get_full_page_screenshot(page['url'].replace('id_', 'if_'), save_width=400)
        date = format_date(page['url'])
        html += f'<div style="float:left; margin-left: 20px;"><p><b>{date}</b></p><p><img src="{ss_file}"></p></div>'
    with ss_out:
        display(HTML(html))
        
compare_screenshots()

Output()

## Changes in the code

In [128]:
differ = HtmlDiff()
# context=False -- shows the whole document
# numlines -- when context=True, the number of lines to show around the diff 
# numline=0 -- just the diffs
html = differ.make_table(html_data[0]['html'].splitlines(), html_data[1]['html'].splitlines(), context=False, numlines=0, fromdesc=format_date(html_data[0]['url']), todesc=format_date(html_data[1]['url']))
# Rewrite the table html to make the column widths work better
html = re.sub(r'<th colspan="2" class="diff_header"', r'<th class="diff_next"></th><th class="diff_header"', html)
display(HTML(html))

Unnamed: 0,Unnamed: 1,4 October 2016,Unnamed: 3,Unnamed: 4,3 April 2020
f,1.0,<!DOCTYPE html>,f,1.0,<!DOCTYPE html>
,2.0,"<!--[if IEMobile 7]><html class=""iem7"" lang=""en"" dir=""ltr""><![endif]-->",,2.0,"<!--[if IEMobile 7]><html class=""iem7"" lang=""en"" dir=""ltr""><![endif]-->"
,3.0,"<!--[if lte IE 6]><html class=""lt-ie9 lt-ie8 lt-ie7"" lang=""en"" dir=""ltr""><![endif]-->",,3.0,"<!--[if lte IE 6]><html class=""lt-ie9 lt-ie8 lt-ie7"" lang=""en"" dir=""ltr""><![endif]-->"
,4.0,"<!--[if (IE 7)&(!IEMobile)]><html class=""lt-ie9 lt-ie8"" lang=""en"" dir=""ltr""><![endif]-->",,4.0,"<!--[if (IE 7)&(!IEMobile)]><html class=""lt-ie9 lt-ie8"" lang=""en"" dir=""ltr""><![endif]-->"
,5.0,"<!--[if IE 8]><html class=""lt-ie9"" lang=""en"" dir=""ltr""><![endif]-->",,5.0,"<!--[if IE 8]><html class=""lt-ie9"" lang=""en"" dir=""ltr""><![endif]-->"
,6.0,"<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html lang=""en"" dir=""ltr"" prefix=""og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book# profile: http://ogp.me/ns/profile# video: http://ogp.me/ns/video# product: http://ogp.me/ns/product#""><!--<![endif]-->",,6.0,"<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html lang=""en"" dir=""ltr"" prefix=""og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book# profile: http://ogp.me/ns/profile# video: http://ogp.me/ns/video# product: http://ogp.me/ns/product#""><!--<![endif]-->"
,7.0,,,7.0,
,8.0,<head>,,8.0,<head>
n,,,n,9.0,<!--[if IE]><![endif]-->
,9.0,"<meta charset=""utf-8"" />",,10.0,"<meta charset=""utf-8"" />"


## Changes in the main text

In [113]:
def find_no_changes(tag):
    return not(tag.find_all('ins') or tag.find_all('del'))

def filter_text_differences(html):
    soup = BeautifulSoup(html)
    no_changes = soup.find('div', class_='diff').find('div').find('div').find_all(find_no_changes, recursive=False)
    for elem in no_changes:
        prev = elem.previous_sibling
        if prev:
            while(prev and not isinstance(prev, Tag)):
                prev = prev.previous_sibling
        if prev and prev.has_attr('class') and prev['class'] == 'placeholder':
            # print(elem.previous_sibling)
            elem.decompose()
        else:
            placeholder = soup.new_tag('p')
            placeholder['class'] = 'placeholder'
            placeholder.string = '...'
            elem.replace_with(placeholder)
    return str(soup)

out = widgets.Output()

def show_changes(ev):
    out.clear_output(wait=True)
    show_text_differences()
    
def show_all(ev):
    out.clear_output(wait=True)
    show_text_differences(show_context=True)

def show_text_differences(show_context=False):
    doc1 = Document(html_data[0]['html'])
    doc2 = Document(html_data[1]['html'])
    html = render_html_diff(doc1.summary(html_partial=True), doc2.summary(html_partial=True))
    button = widgets.Button()
    if show_context == True:
        button.description = 'Show changes only'
        button.on_click(show_changes)
    else:
        html = filter_text_differences(html)
        button.description = 'Show full context'
        button.on_click(show_all)
    with out:
        display(HTML(html))
        display(button)

display(out)
show_changes('e')

Output()