# Imports and Setup

In [1]:
# Import splinter and beautiful soup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager 

In [2]:
# Set up Splinter, i.e. prepare the automated browser (Chrome in this notebook).
# ChromeDriverManager().install() downloads (or reuses a cached) chromedriver and returns its path,
# so the notebook no longer depends on a 'chromedriver.exe' file being present on this machine.
executable_path = {'executable_path': ChromeDriverManager().install()}

In [3]:
# Launch the browser that Splinter will drive.
# The ** operator expands the executable_path dictionary into keyword arguments.
# headless=False keeps a visible Chrome window open so every automated action can be watched.
browser = Browser('chrome', headless=False, **executable_path)

In [4]:
# Assign the URL to scrape to a variable
url = 'http://quotes.toscrape.com/'
# Tell the automated browser to load that URL
browser.visit(url)

In [5]:
# Parse the HTML
# browser.html is the HTML of the page the browser has loaded;
# BeautifulSoup parses it and the parsed result is stored in a variable (html_soup).
# In our code, we're using 'html.parser' to parse the information, but there are other options available as well.
html = browser.html
html_soup = soup(html, 'html.parser')

# html code

In [6]:
# This is the particular section of HTML we are looking at.
# The code above captured the HTML of the whole webpage;
# the snippet below is the part that holds the list of top ten tags, which is what we want to scrape.

# The col-md-4 class is a Bootstrap feature. 
# Bootstrap is an HTML and CSS framework that simplifies adding functional components that look nice by default. 
# In this case, col-md-4 means that this webpage is using a grid layout, and it's a common class that many webpages use. 

# <div class="col-md-4 tags-box">
        
#             <h2>Top Ten tags</h2>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
#             </span>
            
#             <span class="tag-item">
#             <a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
#             </span>
            
        
#     </div>

In [7]:
# Scrape the Title

# Goal: grab the heading of the 'top ten tags' list.
# Inspecting the page shows that the heading sits in an <h2 /> tag, and a search
# (ctrl + f) for 'h2' gives a single hit, so the first match is the one we want.

# find() returns the first <h2 /> tag in html_soup; get_text() keeps only the
# text between the tags and drops the markup.

title = html_soup.find('h2').get_text()
title

'Top Ten tags'

In [8]:
# Scrape Multiple Items

# Scrape the top ten tags
# Below the <h2 /> title, each entry of the list sits in its own <span /> tag

# All of those entries live inside a <div /> tag whose class is 'tags-box',
# so first narrow the search to that <div /> within html_soup and keep it
tag_box = html_soup.find('div', class_='tags-box')

# Each entry's text is held by an <a /> tag with the class 'tag';
# find_all returns every match instead of only the first one
tags = tag_box.find_all('a', class_='tag')

# Walk through the matches and print only the text of each one, without the markup
for anchor in tags:
    print(anchor.get_text())

love
inspirational
life
humor
books
reading
friendship
friends
truth
simile


In [9]:
# Scrape Multiple Pages

# Define the url to look at and visit it
url = 'http://quotes.toscrape.com/'
browser.visit(url)

# Loop through the first 5 pages
for x in range(1, 6):
    # Grab the HTML of the page currently loaded in the browser
    html = browser.html
    # Use BeautifulSoup to parse the data
    quote_soup = soup(html, 'html.parser')
    # Find all the <span /> tags that have a class of "text"
    quotes = quote_soup.find_all('span', class_='text')
    # Print each quote, labelled with the page it came from
    for quote in quotes:
        print('page:', x, '----------')
        print(quote.text)
    # Use Splinter to click the "Next" link to load the next page.
    # find_by_partial_text() only locates the link; without .click() the browser
    # stays on the same page and every iteration scrapes it again.
    browser.links.find_by_partial_text('Next').click()

page: 1 ----------
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
page: 1 ----------
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
page: 1 ----------
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
page: 1 ----------
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
page: 1 ----------
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
page: 1 ----------
“Try not to become a man of success. Rather become a man of value.”
page: 1 ----------
“It is better to be hated for what you are than to be loved for what you are not.”
page: 1 ----------
“I have not failed. I've just found 10,000 ways that won't work.”
page: 1 ----------
“A woman is like a tea bag; you never know how strong it is u

In [27]:
# Skills Challenge

# Get the link/url for the first book on this website

url = 'http://books.toscrape.com/'
browser.visit(url)

# Create the html object and assign it to a variable
html = browser.html
# Use BeautifulSoup to parse the data
first_url_soup = soup(html, 'html.parser')

# find() returns the first matching <li />, i.e. the first book listed on the page
first_url_section = first_url_soup.find('li', class_='col-xs-6 col-sm-4 col-md-3 col-lg-3')

# The book's link lives in the <a /> tag inside the 'image_container' <div />
first_url_location = first_url_section.find('div', class_='image_container')

# A tag's attributes are read like dictionary keys, so ['href'] pulls the link out of the <a /> tag.
# (find('a', 'href') would instead look for an <a /> tag whose CSS class is 'href'.)
# The href is relative (e.g. 'catalogue/.../index.html'), so combine it with url to get a full address.
first_url = first_url_location.find('a')['href']

# Leave the container as the last expression so the notebook displays the HTML the link came from
first_url_location


<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>

In [28]:
from flask_pymongo import PyMongo

ModuleNotFoundError: No module named 'flask_pymongo'