# Wayback 

> In this example, we will scrape the historic Deliveroo pages archived by the Wayback Machine at the Internet Archive for pages that filter Deliveroo Editions facilities.

In [None]:
#| default_exp wayback

In [None]:
#| hide
import time
import pandas as pd
from nbdev.showdoc import *
import requests
from bs4 import BeautifulSoup
from deliveroo_editions.selenium_utils import *
from deliveroo_editions.deliveroo_utils import *
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

ValueError: There is no such driver by url https://chromedriver.storage.googleapis.com/LATEST_RELEASE_115.0.5790

The following URL searches the Internet Archive for all archived Deliveroo webpages, including search results. With selenium, we can visit this page and filter the results to get all the URLs containing the "deliveroo+editions" filter by adding this text to an input HTML element on the Internet Archive site:

In [None]:
# Wayback Machine query URL. Per the text above it lists every archived page
# under deliveroo.co.uk/restaurants/ (the `*` segments presumably act as wildcards).
base_url = "https://web.archive.org/web/*/https://deliveroo.co.uk/restaurants/*"

Let's go ahead and start driving a Chrome browser. We will set headless to `True` so we won't be able to see the browser. You can set `headless=False` if you'd like to view the browser.

In [None]:
# Start the Chrome session used by the following cells. The second argument is
# the headless flag described above (True = no visible browser window).
# NOTE(review): `service` is not defined in any visible cell — presumably it
# comes from one of the star imports; confirm.
driver = initialise_driver(service, True)

In [None]:
# Load the archive listing and wait (up to 20 s) until the element with
# id "resultsUrl_filter" is present in the DOM.
driver.get(base_url)
wait = WebDriverWait(driver, 20)
filter_input = wait.until(EC.presence_of_element_located((By.ID, 'resultsUrl_filter')))

If the function didn't return an error then we know the element successfully loaded. Now we'd like to interact with a child of the `resultsUrl_filter` element and submit the `deliveroo+editions` filter to get the relevant results: 

In [None]:
# The text box is a child <input> of the filter container located above.
input_element = filter_input.find_element(By.TAG_NAME, 'input')
input_element.clear()
input_element.send_keys('deliveroo+editions')
# Brief pause after typing — presumably to give the results table time to
# re-filter before it is read in the next cell.
time.sleep(1)

Let's now grab all the results from the table:

In [None]:
# Read the link in every `td.url` cell of the results table. The table may not
# have rendered its rows yet, so poll until at least one URL is found — but
# pause between polls and give up after a deadline instead of spinning forever
# when the filter matches nothing. An empty result is caught by the assert in
# the next cell.
poll_timeout_s = 30
poll_interval_s = 0.5
editions_urls = []
deadline = time.monotonic() + poll_timeout_s
while not editions_urls and time.monotonic() < deadline:
    table = driver.find_element(By.ID, 'resultsUrl')
    editions_urls = [
        td.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        for td in table.find_elements(By.CSS_SELECTOR, 'td.url')
    ]
    if not editions_urls:
        time.sleep(poll_interval_s)

In [None]:
# Sanity check: the scrape above must have produced at least one URL.
assert editions_urls

We can print all the archived Deliveroo search results filtering for editions: 

In [None]:
# Preview the first ten scraped URLs.
editions_urls[0:10]

### Get Captures

For each of these historic URLs, we will now get all the captures for each. With these captures we can view versions of these webpages that have been captured over time and extract data from them. Let's try this with an example url:

In [None]:
# The sixth scraped URL serves as the worked example in the cells below.
editions_urls[5]

Information on the number of captures, including the first and last capture, is conveniently included within an element with `class_name=captures-range-info`:

In [None]:
# Open the example URL and wait (up to 20 s) for the capture-summary element.
driver.get(editions_urls[5])
wait = WebDriverWait(driver, 20)
range_info = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "captures-range-info")))

In [None]:
# Sanity check: the summary element was located.
assert range_info

In [None]:
# Show the visible text of the summary element.
range_info.text

We can see that there are captures between 2020 and 2021. Let's extract these years programmatically: 

In [None]:
# Same example URL as above, shown again as the input for get_range_info.
editions_urls[5]

In [None]:
def get_range_info(url:str,  # Wayback calendar view URL
                  ):
    """Return `(start_year, end_year, driver)` for the captures of `url`.

    A new browser session is started on every call and returned still open on
    the loaded page, so the caller owns it and should `quit()` it when done.
    The years are four-character strings cut from the timestamps found in the
    first and second links of the `captures-range-info` element.

    If anything fails before the return (timeout, fewer than two links, ...)
    the session is quit here and the original exception is re-raised, because
    the caller never receives a handle it could use to clean up.
    """
    driver = initialise_driver(service,True)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        range_info = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "captures-range-info")))
        capture_links = range_info.find_elements(By.CSS_SELECTOR, 'a')
        oldest_capture = capture_links[0].get_attribute('href')
        latest_capture = capture_links[1].get_attribute('href')
        # Presumably the links have the form
        # https://web.archive.org/web/<timestamp>/<page>, which puts the
        # timestamp at index 4 after splitting on "/"; its first four
        # characters are taken as the year.
        start_timestamp = oldest_capture.split('/')[4]
        end_timestamp = latest_capture.split('/')[4]
        start_year = start_timestamp[:4]
        end_year = end_timestamp[:4]
    except BaseException:
        # Do not leak a browser process when the lookup fails or is interrupted.
        driver.quit()
        raise
    return start_year, end_year, driver

# Note: this rebinds `driver` to the new session opened inside get_range_info;
# the session previously bound to `driver` is not closed here. The following
# cells drive the new session.
start_year, end_year, driver = get_range_info(editions_urls[5])
print(f"Captures between {start_year} and {end_year}.")

In [None]:
# Check against the range reported for the example URL above (the years are
# strings). NOTE(review): these values come from the live archive and may
# change if new captures are added.
assert start_year == "2020"
assert end_year == "2021"

Now we want to get every capture between these years from the calendar UI. To access the html with captures for a given year, we need to find the clickable element representing that year with `class=sparkline-year-label` and click on this element to load the html. 

In [None]:
# Locate the element whose class contains "sparkline-year-label" and whose
# text is exactly the year we want to open.
year = start_year
year_selector = driver.find_element(By.XPATH, f'//*[contains(@class, "sparkline-year-label") and text()="{year}"]')
assert year_selector

In [None]:
# Clicking the year label loads that year's calendar (see the text above).
year_selector.click()

In [None]:
# Wait (up to 10 s) for the calendar grid of the selected year to be present.
wait = WebDriverWait(driver, 10)
calendar = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "calendar-grid")))

In [None]:
# Every <a> inside the calendar grid is taken to be a capture; keep its href.
captures = calendar.find_elements(By.CSS_SELECTOR, 'a')
capture_urls = [link.get_attribute('href') for link in captures]
assert capture_urls

We've now got all the URLs for a single year: 

In [None]:
# Show the capture URLs collected for the selected year.
capture_urls

Let's now get every capture for every year, for every URL: 

We may encounter a slight hiccup for any of the `editions_urls` with only one capture, as the URL will take us straight to the capture page rather than the calendar view. Therefore we should try to find `captures-range-info` and, if it is not there, just add the URL to our `capture_urls` (the code below treats any URL without the `*` wildcard as such a single capture):

In [None]:
# Collect the capture URLs of every archived search page.
#
# For each calendar-view URL: open it in its own browser session (via
# get_range_info), click through every year in the reported range and keep the
# href of every link in that year's calendar grid. Each URL gets one retry.
capture_urls = []
for url in tqdm(editions_urls):
    if "*" not in url:
        # No wildcard: treated as an address that already points at a single
        # capture, so it is kept as is.
        capture_urls.append(url)
        continue
    for attempt in range(2):
        page_driver = None
        try:
            start_year, end_year, page_driver = get_range_info(url)
            # Buffer this URL's results so a failed attempt leaves nothing
            # behind and the retry cannot add the same captures twice.
            found = []
            for year in range(int(start_year), int(end_year)+1):
                year_selector = page_driver.find_element(By.XPATH, f'//*[contains(@class, "sparkline-year-label") and text()="{year}"]')
                year_selector.click()
                wait = WebDriverWait(page_driver, 10)
                calendar = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "calendar-grid")))
                # Keep every capture of the year, not just the first one.
                found.extend(link.get_attribute('href') for link in calendar.find_elements(By.CSS_SELECTOR, 'a'))
            capture_urls.extend(found)
            break
        except Exception as e:
            print(e)
            time.sleep(1)
            print(url)
        finally:
            # get_range_info opens a new session per call; release it so the
            # loop does not accumulate one browser process per URL.
            if page_driver is not None:
                try:
                    page_driver.quit()
                except Exception as quit_error:
                    print(quit_error)

In [None]:
# Show every capture URL collected above.
capture_urls

We can save these urls to a file for access later:

In [None]:
import json

# One-off export of the scraped URLs. Left commented out, presumably so that
# re-running the notebook does not overwrite the saved file.
# with open("data/capture_urls.json", 'w') as f:
#     json.dump(capture_urls, f) 

# Reload the saved list, replacing whatever `capture_urls` held in memory.
with open("data/capture_urls.json", 'r') as f:
    capture_urls = json.load(f)

# print(capture_urls)

For each of the captures, we would now like to visit the URL and extract all restaurant information. On each URL, there is a grid of restaurants within a `<ul>` element containing the `HomeFeedGrid` substring within the class name. We want to first locate all the `<li>` elements within `<ul>` without any parent `<li>` elements, as this will give us list items corresponding to each restaurant. For this we can use `get_restaurant_tags`:

In [None]:
# driver= initialise_driver(service,True)
# Try the first saved capture; the cell output is the number of tags returned.
# NOTE(review): no driver is passed here, unlike the later
# get_restaurants(capture, driver) call — presumably the helper can create its
# own session; confirm.
url = capture_urls[0]
filtered_tags = get_restaurant_tags(url)
len(filtered_tags)

If we check the number of `<li>` elements within `filtered_tags`, we should find it equates to the number of restaurants listed on the webpage. Next we want to iterate through these items, locating another unordered list element containing 3 list items (restaurant name, description and delivery details). The element may also contain a link to the restaurant's Deliveroo URL. We can use `get_restaurants` to get both the tags from before and the metadata all in one step:

In [None]:
# Display the restaurant metadata extracted from the example capture.
get_restaurants(url)

Bringing it all together, let's iterate through the capture URLs, getting the metadata for each restaurant in each capture. We also need to be careful of rate limits, as Wayback limits us to [15 retrievals per minute](https://en.wikipedia.org/wiki/Wayback_Machine#:~:text=Starting%20in%20October%202019%2C%20users,requests%20and%20retrievals%20per%20minute.) 

In [None]:
# Scrape the restaurant metadata of every capture with one shared browser
# session, retrying each capture once with a fresh session on failure.
editions = []
driver = initialise_driver(service,True)
retrievals = 0  # number of captures scraped successfully
# Wayback allows 15 retrievals per minute, so leave at least 60/15 = 4 s
# between the starts of consecutive retrieval attempts.
min_interval_s = 60 / 15
last_attempt = None
for capture in tqdm(capture_urls):
    for attempt in range(2):
        # Throttle: sleep only for whatever part of the interval is left.
        # NOTE(review): assumes one get_restaurants call is one retrieval.
        if last_attempt is not None:
            pause = min_interval_s - (time.monotonic() - last_attempt)
            if pause > 0:
                time.sleep(pause)
        last_attempt = time.monotonic()
        try:
            metadata = get_restaurants(capture, driver)
            # NOTE(review): `metadata` (not the return value) is what gets
            # stored, so this presumably relies on add_timestamps_to_restaurants
            # annotating the entries in place — the grouping step later reads
            # 'timestamp' from them. Confirm against the helper.
            timestamped_metadata = add_timestamps_to_restaurants(metadata, capture)
            editions += metadata
            retrievals += 1
            break
        except Exception as e:
            print(e)
            # Replace the session after an error. quit() ends the whole browser
            # session rather than just closing the window, and a failure while
            # quitting an already-dead session must not abort the whole loop.
            try:
                driver.quit()
            except Exception as quit_error:
                print(quit_error)
            driver = initialise_driver(service,True)

In [None]:
# Show the raw per-capture restaurant records.
editions

Let's now process the data such that `timestamp` and `timestamp_url` are grouped:

In [None]:
# Group the per-capture records into one entry per (name, location) pair, in
# order of first appearance. Each entry collects the timestamps and timestamp
# URLs of the captures the restaurant was seen in; 'restaurant_url' is taken
# from the first record of the group.
#
# A record whose timestamp is already listed for its restaurant is skipped, so
# a repeated record no longer creates a second row for the same restaurant.
# Assumes 'name' and 'location' are hashable (e.g. strings) — they are used as
# the lookup key to avoid rescanning the whole list for every record.
editions_list = []
entry_by_restaurant = {}
for edition in editions:
    key = (edition['name'], edition['location'])
    entry = entry_by_restaurant.get(key)
    if entry is None:
        entry = {'name': edition['name'], 'location': edition['location'], 'timestamps': [], 'timestamp_urls': [], 'restaurant_url': edition['restaurant_url']}
        entry_by_restaurant[key] = entry
        editions_list.append(entry)
    if edition['timestamp'] not in entry['timestamps']:
        entry['timestamps'].append(edition['timestamp'])
        entry['timestamp_urls'].append(edition['timestamp_url'])

In [None]:
# Show the grouped entries.
editions_list

In [None]:
# Build a table from the grouped entries (each dict in the list becomes a row).
editions_wayback_df = pd.DataFrame.from_dict(editions_list)

In [None]:
# Persist the table as UTF-8 CSV. The DataFrame index is written as well,
# since `index` is left at its default.
editions_wayback_df.to_csv('data/editions_wayback.csv', encoding="utf-8")

In [None]:
#| hide
# Run nbdev's export step for this notebook (target module is named by the
# `default_exp` directive at the top).
import nbdev; nbdev.nbdev_export()