## Wayback Machine API 

This notebook tests using the Wayback Machine API together with Selenium to take screenshots of web pages as they appeared across many years.



### Dependencies:

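`pip install webdriver_manager selenium requests pillow beautifulsoup4 python-dateutil`

- selenium - an open-source tool for automating web browsers
- webdriver_manager - library that automatically downloads and manages the web driver Selenium needs
- requests - HTTP client used to call the Wayback Machine API
- pillow - imaging library used to stitch the screenshot segments together
- beautifulsoup4 - HTML parser used to extract links from a snapshot
- python-dateutil - provides `relativedelta` for date arithmetic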

### Links:

- https://selenium-python.readthedocs.io/getting-started.html
- https://github.com/SeleniumHQ


### Notes

> *As of October 2019, users are limited to 15 archival requests and retrievals per minute.*

In [1]:
import os
import time
import requests
from io import BytesIO
from pathlib import Path
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

from PIL import Image
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def get_snapshot_status(url, timestamp, max_attempts=3, timeout=10):
    """
    Get the closest Wayback Machine snapshot for a URL and classify its status.

    Queries the Wayback availability API and retries when the request fails
    or the API answers with HTTP 429 (rate limited).

    :param url: URL to get the snapshot for
    :param timestamp: Timestamp in the format YYYYMMDDHHMMSS
    :param max_attempts: Maximum number of requests to make; a rate-limited
        response and a failed request each use up one attempt
    :param timeout: Seconds to wait for the API before the attempt counts as failed
    :return: Tuple ``(snapshot_url, status)``. ``status`` is 'green' when the
        closest snapshot reports status "200" and 'red' otherwise. Returns
        ``(None, 'red')`` when no snapshot is listed or every attempt failed.
    """
    api_url = "http://archive.org/wayback/available"
    params = {
        'url': url,
        'timestamp': timestamp
    }
    default_retry_after = 60  # Seconds to wait when Retry-After is missing or unusable

    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            # Without a timeout a stalled connection would block the notebook forever
            response = requests.get(api_url, params=params, timeout=timeout)
            if response.status_code == 429:
                # Retry-After may be absent, or an HTTP-date instead of a number of seconds
                try:
                    wait_seconds = max(int(response.headers.get('Retry-After', default_retry_after)), 0)
                except (TypeError, ValueError):
                    wait_seconds = default_retry_after
                print(f"Rate limited. Retry after {wait_seconds} seconds.")
                if not is_last_attempt:  # No point waiting when there is no retry left
                    time.sleep(wait_seconds)
                continue

            response.raise_for_status()  # Raise an HTTPError for bad responses other than 429
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON
            print(f"Attempt {attempt + 1} failed: {e}")
            if not is_last_attempt:
                time.sleep(2 ** attempt)  # Exponential backoff
            continue

        print(data)

        snapshots = data.get('archived_snapshots') if isinstance(data, dict) else None
        closest_snapshot = snapshots.get('closest') if isinstance(snapshots, dict) else None
        if not closest_snapshot or not closest_snapshot.get('url'):
            return None, 'red'

        snapshot_status = 'green' if closest_snapshot.get('status') == "200" else 'red'
        return closest_snapshot['url'], snapshot_status

    return None, 'red'

def get_embedded_links(snapshot_url, timeout=30):
    """
    Extract the ``href`` of every anchor tag in the snapshot page.

    The values are returned exactly as they appear in the HTML; they are not
    resolved against the snapshot URL or de-duplicated.

    :param snapshot_url: URL of the snapshot to extract links from
    :param timeout: Seconds to wait for the snapshot before giving up
    :return: List of extracted links
    """
    # Without a timeout a stalled download would block the notebook forever
    response = requests.get(snapshot_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

def archive_links(links, timeout=30):
    """
    Archive the extracted links using the Wayback Machine save API.

    Requests the save endpoint once per link and prints whether each request
    came back with HTTP 200.

    :param links: List of links to archive
    :param timeout: Seconds to wait for each save request before giving up
    """
    save_api_url = "http://web.archive.org/save/"
    for link in links:
        response = requests.get(save_api_url + link, timeout=timeout)
        # status_code is an int; comparing it with the string "200" never
        # matched, so every link used to be reported as failed
        if response.status_code == 200:
            print(f"Successfully archived {link}")
        else:
            print(f"Failed to archive {link}")

# def take_screenshot(url, output_path):
#     """
#     Take a screenshot of a web page.
    
#     :param url: URL of the web page to take a screenshot of
#     :param output_path: Path to save the screenshot
#     """
#     os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
#     options = Options()
#     options.headless = True
#     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
#     driver.get(url)
#     total_height = driver.execute_script("return document.body.scrollHeight")
#     driver.set_window_size(1920, total_height)
    
#     # Ensure the page has loaded completely
#     driver.implicitly_wait(10)
    
#     driver.save_screenshot(output_path)
#     driver.quit()


def take_full_page_screenshot(url, output_path):
    """
    Take a full-page screenshot of a web page by scrolling and capturing segments.

    Opens the page in headless Chrome, scrolls through it one viewport at a
    time, captures each viewport and stitches the captures into one image.
    Elements with fixed positioning will appear in every captured segment.

    :param url: URL of the web page to take a screenshot of
    :param output_path: Path to save the final stitched screenshot
    """
    # Create the directory if it does not exist; a bare filename has no
    # directory part, and os.makedirs('') would raise
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    options = Options()
    # The `headless` attribute is deprecated and removed in current Selenium;
    # the command-line flag is the supported way to request headless mode
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Always shut the browser down, even if loading or capturing fails
    try:
        driver.get(url)
        time.sleep(2)  # Allow some time for the page to load completely

        initial_height = driver.execute_script("return window.innerHeight")
        driver.set_window_size(1920, initial_height)

        # Measure after resizing: the new width can reflow the page and the
        # usable viewport can differ from the requested window height
        total_height = driver.execute_script("return document.body.scrollHeight")
        viewport_height = driver.execute_script("return window.innerHeight")

        segments = []  # (actual scroll offset in CSS pixels, captured image)
        # max(..., 1) guarantees at least one capture for an empty page
        for target_offset in range(0, max(total_height, 1), viewport_height):
            driver.execute_script(f"window.scrollTo(0, {target_offset})")
            time.sleep(2)  # Allow some time for the page to scroll and render content
            # The browser clamps the final scroll, so read back where it really stopped
            actual_offset = int(driver.execute_script("return Math.round(window.pageYOffset)"))
            segment = Image.open(BytesIO(driver.get_screenshot_as_png()))
            segments.append((actual_offset, segment))

        # Screenshots are in device pixels; scale CSS-pixel offsets to match
        first_segment = segments[0][1]
        scale = first_segment.height / viewport_height
        stitched_height = int(round(total_height * scale)) or first_segment.height

        # Stitch the screenshots together at the positions they were captured from
        stitched_image = Image.new('RGB', (first_segment.width, stitched_height))
        for actual_offset, segment in segments:
            stitched_image.paste(segment, (0, int(round(actual_offset * scale))))

        stitched_image.save(output_path)
    finally:
        driver.quit()

In [3]:
url = "https://youtube.com"
timestamp = "20140701000000"  # July 1, 2014, 00:00:00 (YYYYMMDDHHMMSS)
print(timestamp)
snapshot_url, snapshot_status = get_snapshot_status(url, timestamp)

if snapshot_url:
    print(f"Snapshot URL: {snapshot_url}")
    print(f"Snapshot Status: {snapshot_status}")

    # Take a screenshot of the snapshot. Strip whatever scheme the URL has
    # (http:// or https://) so the folder is named after the site itself.
    site_dir = url.split('://', 1)[-1]
    output_path = Path(f"_img/{site_dir}/{timestamp}.png")
    print(output_path)
    take_full_page_screenshot(snapshot_url, output_path)
    print(f"Screenshot saved to {output_path}")

    if snapshot_status == 'green':
        # Extract embedded links; archiving them is left disabled
        links = get_embedded_links(snapshot_url)
        # archive_links(links)
    else:
        print("Snapshot is not fully successful (red status).")
else:
    print("No snapshot available for the given URL and timestamp.")


20140701000000
{'url': 'https://youtube.com', 'archived_snapshots': {'closest': {'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20140701000037/http://www.youtube.com/', 'timestamp': '20140701000037'}}, 'timestamp': '20140701000000'}
Snapshot URL: http://web.archive.org/web/20140701000037/http://www.youtube.com/
Snapshot Status: green
_img/youtube.com/20140701000000.png
Screenshot saved to _img/youtube.com/20140701000000.png


In [None]:
# {'url': 'https://yappstore.ai', 'archived_snapshots': {}, 'timestamp': '20240401000000'}
# No snapshot available for the given URL and timestamp.