# Web Scraping

In [None]:
from time import time
import pandas as pd
import requests                 # HTTP programming
from bs4 import BeautifulSoup   # HTML parsing
from selenium import webdriver  # Browser automation
from selenium.common.exceptions import NoSuchElementException

In [None]:
# Bounds (comic numbers) of the span of xkcd comics handled in this notebook.
oldest_xkcd_comic, latest_xkcd_comic = 2350, 2436

## HTTP programming

In [None]:
class xkcdComicJson:
    """
    Uses the JSON interface at https://xkcd.com/ for retrieving information about a single xkcd comic.

    NOTE(review): ``__init__`` is still an unimplemented placeholder. Code in
    this file reads the attributes ``json``, ``number``, ``img_url`` and
    ``img_name`` from an instance, so the constructor is presumably meant to
    set them -- TODO confirm.
    """

    def __init__(self, comic_no):
        # Placeholder: nothing is downloaded or stored yet.
        pass

    def save_img_to_disk(self, directory='./'):
        """
        Download the comic image and write it into ``directory``.

        The file is named ``<number>-<img_name>``, built from ``self.number``
        and ``self.img_name``; the bytes are fetched from ``self.img_url``.

        Parameters
        ----------
        directory : str
            Target directory, with or without a trailing ``/``. An empty
            string means the current working directory.

        Raises
        ------
        requests.HTTPError
            If the image request returns an error status.
        """
        response = requests.get(self.img_url)
        response.raise_for_status()
        # Guard on emptiness first: indexing directory[-1] on '' would raise
        # IndexError, and appending '/' to '' would point at the filesystem root.
        if directory and not directory.endswith('/'):
            directory += '/'
        with open(directory + f'{self.number}-{self.img_name}', mode='wb') as f:
            f.write(response.content)

In [None]:
# Fetch every comic in the inclusive range [oldest, latest] through the JSON
# interface, collecting one record per comic, and time the whole download.
comic_numbers = range(oldest_xkcd_comic, latest_xkcd_comic + 1)
df_json_rows = []
t0 = time()
for no in comic_numbers:
    comic = xkcdComicJson(no)
    df_json_rows += [comic.json]
    # comic.save_img_to_disk()
t1 = time()
time_json = t1 - t0
# Build the table once, from the complete list of records.
df_json = pd.DataFrame(df_json_rows)
print("Data download completed in {:.3f} seconds.".format(time_json))

<sup>We never grow a `pandas.DataFrame` iteratively, row by row. An accurate and detailed account of the reason can be found [here](https://stackoverflow.com/a/56746204).</sup>

In [None]:
# Display the last rows of the table built from the JSON records.
df_json.tail()

## HTML parsing

In [None]:
class xkcdComicSoup:
    """
    Uses Beautiful Soup to parse the HTML page for a given comic.

    NOTE(review): ``__init__`` is still an unimplemented placeholder. Code in
    this file reads the attributes ``number``, ``date``, ``title``,
    ``caption``, ``img_name``, ``img_url`` and ``img_response`` from an
    instance, so the constructor is presumably meant to set them -- TODO confirm.
    """

    def __init__(self, comic_no):
        # Placeholder: nothing is fetched, parsed or stored yet.
        pass

    def save_img_to_disk(self, directory='./'):
        """
        Write the comic image into ``directory``.

        The file is named ``<number>-<img_name>``, built from ``self.number``
        and ``self.img_name``. The bytes are taken from
        ``self.img_response.content``; this method issues no request itself.

        Parameters
        ----------
        directory : str
            Target directory, with or without a trailing ``/``. An empty
            string means the current working directory.
        """
        # Guard on emptiness first: indexing directory[-1] on '' would raise
        # IndexError, and appending '/' to '' would point at the filesystem root.
        if directory and not directory.endswith('/'):
            directory += '/'
        with open(directory + f'{self.number}-{self.img_name}', mode='wb') as f:
            f.write(self.img_response.content)

In [None]:
# Table column -> attribute of the parsed comic that supplies its value.
soup_columns = {
    'number':   'number',
    'date':     'date',
    'title':    'title',
    'caption':  'caption',
    'img_name': 'img_name',
    'img':      'img_url',
}

# Parse the HTML page of every comic in the inclusive range [oldest, latest],
# collecting one record per comic, and time the whole download.
df_soup_rows = []
t0 = time()
for no in range(oldest_xkcd_comic, latest_xkcd_comic + 1):
    comic = xkcdComicSoup(no)
    row = {column: getattr(comic, attribute)
           for column, attribute in soup_columns.items()}
    df_soup_rows.append(row)
    # comic.save_img_to_disk()
t1 = time()
time_soup = t1 - t0
# Build the table once, from the complete list of records.
df_soup = pd.DataFrame(df_soup_rows)
print("Data download completed in {:.3f} seconds.".format(time_soup))

In [None]:
# Display the last rows of the table built from the parsed HTML pages.
df_soup.tail()

## Browser automation

In [None]:
# Location of the geckodriver binary handed to Selenium.
# NOTE(review): this is an absolute path on one specific machine -- adjust it locally.
geckodriver_path = 'C:/Users/Andrea/Documents/geckodriver.exe'
browser = webdriver.Firefox(executable_path=geckodriver_path)

In [None]:
# Walk through the comics in an automated browser, starting at the xkcd
# homepage, collecting one record per comic and timing the whole run.
#
# NOTE(review): every `pass` below is an unimplemented placeholder. As written,
# `title`, `caption`, `img_name` and `img_url` are not assigned anywhere in this
# cell, and `number` is never updated, so the steps must be filled in before
# the loop can build a row or terminate.
df_dom_rows = []
t0 = time()
browser.get('https://xkcd.com')  # point the browser to the homepage
# Initial value above oldest_xkcd_comic so that the loop body is entered;
# presumably replaced by the number read from the page -- TODO confirm.
number = 3000

while number > oldest_xkcd_comic:
    # Find the number of the comic
    pass
    
    # Find the title of the comic
    pass
    
    # Find the caption of the comic
    pass
    
    # Find the URL of the comic image
    pass
    
    # Find the name of the PNG file
    pass
    
    # Collect information for dataset
    row = {
        'number': number,
        'title': title,
        'caption': caption,
        'img_name': img_name,
        'img': img_url
    }
    
    # Append information to list
    df_dom_rows.append(row)
    
    # Go to the previous comic
    pass
    
browser.quit()  # close the automated browser window
t1 = time()
# Elapsed time covers page loads, scraping and closing the browser.
time_dom = t1-t0
print("Data download completed in {:.3f} seconds.".format(time_dom))

In [None]:
# Build the table once, from the complete list of collected records,
# then display its first rows.
df_dom = pd.DataFrame(df_dom_rows)
df_dom.head()

In [None]:
# Summary of the three approaches.
# Both bounds are scraped (the download loops cover
# range(oldest_xkcd_comic, latest_xkcd_comic + 1)), so the number of comics
# is latest - oldest + 1, not latest - oldest.
comics_retrieved = latest_xkcd_comic - oldest_xkcd_comic + 1
print('Comics retrieved: {:0d}.'.format(comics_retrieved))
print('HTTP programming took   {:.3f} seconds.'.format(time_json))
print('HTML parsing took       {:.3f} seconds.'.format(time_soup))
print('Browser automation took {:.3f} seconds.'.format(time_dom))