In [1]:
# Import dependencies

import os
from urllib.parse import urljoin

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [2]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable to
# override the default install location instead of editing this cell.
executable_path = {'executable_path': os.environ.get('CHROMEDRIVER_PATH', '/usr/local/bin/chromedriver')}
# Launch a visible (non-headless) Chrome window; every scraping cell below drives this browser
browser = Browser('chrome', **executable_path, headless=False)

## Scrape book data and load into a DataFrame

In [3]:
# Load the book catalogue's landing page and record the URL the browser reports afterwards
url = 'http://books.toscrape.com/'
browser.visit(url)
browser_url = browser.url

In [4]:
# Accumulators for the scraped book data: titles, absolute links, product descriptions and prices.
# Each name is bound to its own independent empty list.
titles, full_links, product_descriptions, prices = [], [], [], []

In [5]:
# Walk the paginated catalogue, collecting every book's title and absolute detail-page URL
MAX_PAGES = 50  # upper bound on the number of listing pages to visit

for x in range(MAX_PAGES):
    # Parse the listing page currently loaded in the browser
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Retrieve all elements that contain book information
    articles = soup.find_all('article', class_='product_pod')
    # Book hrefs are relative to the listing page they appear on
    page_url = browser.url

    for article in articles:
        # The anchor inside the <h3> carries both the relative link and the title attribute
        link = article.find('h3').find('a')

        # urljoin resolves the relative href against the current page address, so the landing
        # page and the later 'page-N' listings need no special-casing
        full_links.append(urljoin(page_url, link['href']))
        titles.append(link['title'])

    # Advance to the next listing page; a missing 'next' link means the last page was reached.
    # Only that specific condition ends the loop - any other error is allowed to surface
    # rather than silently re-scraping the same page.
    try:
        browser.click_link_by_text('next')
    except ElementDoesNotExist:
        print("Scraping Complete")
        break



Scraping Complete


In [None]:
# Aborted attempt to grab the star rating for a particular title

#browser.visit("http://books.toscrape.com/catalogue/frankenstein_20/index.html")
#html = browser.html
#soup = BeautifulSoup(html, 'html.parser')
#star_thing = soup.find_all('div', class_ = "col-sm-6 product_main")

#star_p = soup.find_all('p')

#rating_p = star_p[2]

#rating_p

In [7]:
# Follow each book's link to grab its product description and price.
# Exactly one entry is appended to BOTH lists for every link, so product_descriptions and
# prices stay the same length as full_links/titles even when a page cannot be scraped.
for link in full_links:
    try:
        browser.visit(link)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # The description is read from the fourth <p> element on the page
        product_description = soup.find_all('p')[3].text
        # The price is the first <p class="price_color">, with the pound sign stripped
        price = float(soup.find_all('p', class_='price_color')[0].text.strip('£'))
    except Exception:
        # Best effort: report the failing address and record placeholders for this book.
        # (Exception, not a bare except, so interrupting the kernel still works.)
        print(f"Page not found at address: {link}")
        product_description = "Description not found"
        price = np.nan

    # Append only after both values are known, so a half-scraped page cannot misalign the lists
    product_descriptions.append(product_description)
    prices.append(price)

In [8]:
# Assemble the scraped book data (URLs, titles, descriptions and prices) into a single DataFrame
books_df = pd.DataFrame(
    dict(
        link=full_links,
        title=titles,
        description=product_descriptions,
        price=prices,
    )
)

In [9]:
# Preview the first rows of the books DataFrame to sanity-check the scrape
books_df.head()

Unnamed: 0,link,title,description,price
0,http://books.toscrape.com/catalogue/a-light-in...,A Light in the Attic,It's hard to imagine a world without A Light i...,51.77
1,http://books.toscrape.com/catalogue/tipping-th...,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",53.74
2,http://books.toscrape.com/catalogue/soumission...,Soumission,"Dans une France assez proche de la nôtre, un h...",50.1
3,http://books.toscrape.com/catalogue/sharp-obje...,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",47.82
4,http://books.toscrape.com/catalogue/sapiens-a-...,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,54.23


## Scrape quote data and load into DataFrames

In [14]:
# Point the same browser session at the quotes site's first page
quotes_url = 'http://quotes.toscrape.com/'
browser.visit(quotes_url)

In [15]:
# Accumulator for the speaker name attached to every scraped quote (duplicates included)
speakers = list()

In [16]:
# Iterate through the quote pages, collecting the speaker credited under every quote
MAX_QUOTE_PAGES = 10  # number of quote pages to visit

for x in range(1, MAX_QUOTE_PAGES + 1):
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Each quote's speaker is read from a <small class="author"> element
    quotes = soup.find_all('small', class_='author')
    speakers.extend(quote.text for quote in quotes)

    # Do not navigate past the last page we intend to scrape
    if x == MAX_QUOTE_PAGES:
        break

    # Move on to the next page; if there is no 'Next' link the site has fewer pages than
    # expected, so stop cleanly instead of raising part-way through the scrape
    try:
        browser.click_link_by_partial_text('Next')
    except ElementDoesNotExist:
        break

In [17]:
# Load every scraped speaker (one row per quote, duplicates kept) into a DataFrame
quotes_df = pd.DataFrame(dict(quote_speakers=speakers))

In [18]:
# Reduce the speakers to a sorted NumPy array of distinct names
# (np.unique converts the list to an array itself, then sorts and de-duplicates it)
unique_speakers = np.unique(speakers)

In [19]:
# Convert the unique speakers array back into a plain Python list, which can be loaded into a
# DataFrame and whose positions serve as speaker IDs in the matching step
unique_speakers_list = unique_speakers.tolist()


In [20]:
# Build a one-column DataFrame of the distinct speakers; the row index doubles as the speaker ID
speakers_df = pd.DataFrame(dict(speakers=unique_speakers_list))

## Find speakers in book descriptions and build a DataFrame of "matches"

In [22]:
# Parallel accumulators for the matches: position k of each list describes one
# (book, speaker) pair, stored as the book's row ID and the speaker's ID
matches_book_id_list, matches_speaker_id_list = [], []

In [24]:
# Loop through the speaker list and record each book whose description mentions the speaker.
# A speaker's ID is their position in unique_speakers_list; a book's ID is its books_df index.
for speaker_id, speaker_name in enumerate(unique_speakers_list):
    # regex=False forces a literal substring match: names such as "J.K. Rowling" contain
    # regex metacharacters that would otherwise match unintended text (or fail to compile).
    # na=False treats a missing description as "no mention".
    mentions = books_df['description'].str.contains(speaker_name, regex=False, na=False)
    match_book_id_index = books_df[mentions].index

    # One (book_id, speaker_id) pair per matching book; nothing is added when there is no match
    matches_book_id_list.extend(match_book_id_index)
    matches_speaker_id_list.extend([speaker_id] * len(match_book_id_index))

In [25]:
# Pair up the recorded matches: one row per (book, speaker) mention
matches_df = pd.DataFrame(
    dict(book_id=matches_book_id_list, speaker_id=matches_speaker_id_list)
)