In [4]:
# import the tools Splinter, BeautifulSoup, and ChromeDriverManager.

from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [6]:
# Next, we'll set the executable path and initialize a browser.

# We use the ChromeDriverManager().install() to install the ChromeDriver and set up the executable path:

# Setup splinter
# Let webdriver_manager locate (or download) ChromeDriver, then start a visible Chrome session.
chromedriver_binary = ChromeDriverManager().install()
executable_path = dict(executable_path=chromedriver_binary)
browser = Browser('chrome', headless=False, **executable_path)


[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [/Users/josefanolin/.wdm/drivers/chromedriver/mac64/87.0.4280.88/chromedriver] found in cache


 


In [9]:
# Path to Chromedriver (Not mentioned in reading, but in example. Might be important later.)
# !which chromedriver

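# Running this command ended up printing "chromedriver not found", but the browser window still popped up.
# Presumably that is because webdriver_manager keeps the driver in its own cache (~/.wdm) rather than on the PATH,
# so I guess this step is just an outdated piece of code.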

In [None]:
# With these two lines of code, we are creating an instance of a Splinter browser. 

# This means that we're prepping our automated browser. 

# We're also specifying that we'll be using Chrome as our browser. 

# **executable_path is unpacking the dictionary we've stored the path in – think of it as unpacking a suitcase. 

# headless=False means that all of the browser's actions will be displayed in a Chrome window so we can see them.



In [None]:
# an empty webpage should automatically open, ready for instructions.
# You'll know that it's an automated browser because it'll have a special message stating so, right under the tab

# While the window can be closed at any time, it's generally not a good idea to shut down the browser without ending 
# the session properly – there's an excellent chance your code will fail or an error will be generated. 


In [10]:
# We've been given a practice page to scrape (http://quotes.toscrape.com/). Check it out in the browser first.


In [None]:
# Scrape the Top 10 Tags

# Interacting with webpages is Splinter's specialty, and there are lots of things to interact with on this one, such 
# as the login button and tags. Our goal for this practice is to scrape the "Top Ten tags" text.



In [None]:
# use the DevTools to look at the details of this line. 
# Right-click the webpage and select "Inspect." 
# From the DevTools window, we can actually select an element on the page instead of searching through the tags.

# First, select the inspect icon (the one to the far left).

# Then, click the element you want to select on the page, such as the humor tag. 
# This will direct your DevTools to the line of code the humor tag is nested in.

# Being able to select items directly from the webpage cuts down search time immensely. With this shortcut, we've 
# been able to select the <h2 /> tag holding the text we want.

# But what if there is more than one <h2 /> tag on the page? When scraping one particular item, we will often need 
# to be more specific in choosing the tag we're scraping from. 
# We can narrow this down even further by using the search function in our DevTools.

In [None]:
# So we’re using DevTools to search through the HTML of the webpage. We know we want text that’s inside an <h2 /> tag.
# What else could we do to get more specific?

# While DevTools is active, press Command + F or CTRL + F to activate the search function, then search for the tags or
# text you’re looking for.

# The search function will identify the items you’re looking for within the HTML code of the page. 
# For example, searching for “h2” will return all <h2 /> tags on the page.


In [None]:
# Search for Elements

# Searching within the HTML code is another useful way to quickly find items we're looking for. 
# Earlier, we were able to select a particular component from the page with the select tool. 
# But there are times where we need to know how many of a certain type of tag are in the page. 
# For example, the title we want to scrape is in an <h2 /> tag, but there are several others on the page as well. 
# Knowing this, we can expect to tailor our code to pull only the <h2 /> tag we want. 


# First, let's practice searching in the HTML.

# With the DevTools still active, press Command + F if you use a Mac, or CTRL + F if you use a Windows computer. 
# This activates the search functionality, only instead of searching the webpage, we're searching the HTML of the 
# webpage. So if we search for all of the <h2 /> tags in the document, we'll know if we need to make our search more 
# specific by adding attributes such as a class name or an id.

# In the search bar that we just activated, type h2 and then press Enter on your keyboard.

In [None]:
# The result of our search immediately shows us two things: that the first tag we've searched for is highlighted, and 
# also the number of those tags in the document.

# Because the search reports only "1 of 1" <h2 /> tag in the document, we know that we can scrape for an <h2 /> without 
# being more specific. In most other cases, we'll need to include a class or id, so we'll practice that in a little bit.



In [11]:
# Scrape the Title
# Now let's scrape that title. In the next cell in Jupyter Notebook, type the following:
# This code tells Splinter which site we want to visit by assigning the link to a URL.

# Visit the Quotes to Scrape site: store its address in `url`, then have the Splinter browser load it.
url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [12]:
# After executing the cell above, we will use BeautifulSoup to parse the HTML. 
# In the next cell, we'll add two more lines of code:

# This code will parse all of the HTML on the page. That means that BeautifulSoup will take a look at the different 
# components and can now access them. Specifically, BeautifulSoup parses the HTML text and then stores it as an object.

# In our code, we're using ‘html.parser’ to parse the information, but there are other options available as well.

# Take the page source currently loaded in the browser and parse it with BeautifulSoup.
html = browser.html
html_soup = soup(markup=html, features='html.parser')

In [13]:
# In our next cell, we will find the title and extract it.

# 1. We use our html_soup object we created earlier and chained find() to it to search for the <h2 /> tag.
# 2. We've also extracted only the text within the HTML tags by adding .text to the end of the code.

# Scrape the title: locate the first <h2 /> element, then keep only its text content.
heading = html_soup.find('h2')
title = heading.text
title

'Top Ten tags'

In [None]:
# Scrape All of the Tags

# Using our DevTools again, look at the code for the tags. We want all of the tags instead of just one, so we want to 
# first use our select tool to highlight the <div /> container that holds all of the tags.

# Notice that the <div /> container holding all of the tags has two classes. 

# The col-md-4 class is a Bootstrap feature. Bootstrap is an HTML and CSS framework that simplifies adding functional 
# components that look nice by default. In this case, col-md-4 means that this webpage is using a grid layout, and 
# it's a common class that many webpages use. We'll dive into that more later.

# The other class, tags-box, looks custom, though. Let's make sure first by searching for it using our search box.

In [14]:
# After searching for tags-box, we can see that only one result is returned. This means that it's unique in the HTML
# and can be used to locate specific data. 

# Next, expand the tags-box div to take a look at the contents.

# From here, we can see a list of <span /> elements, each with a class of tag-item.
# Open some of the <span /> elements to see what they contain; if you see <a /> elements with the names in the list 
# that we're targeting, then we're in the right place.

# Since there are 10 items in the list displayed in the browser, let's use the dev tools' search function to verify 
# the list item count. 

# Search for tag-item and note the number of returned results. If there are 10, then we're ready to go.

In [None]:
# Let's scrape each tag-item

# This code will look really similar to our last example, but we've increased the difficulty a bit by incorporating 
# a for loop. Let's start at the beginning.

# 1. The first line, tag_box = html_soup.find('div', class_='tags-box'), creates a new variable tag_box, which will 
# be used to store the results of a search. 
# In this case, we're looking for <div /> elements with a class of tags-box, and we're searching for it in the HTML we
# parsed earlier and stored in the html_soup variable.

# 2. The second line, tags = tag_box.find_all('a', class_='tag'), is similar to the first but with a few tweaks to 
# make the search more specific. 
# The new "tags" variable will hold the results of a find_all, but this time we're searching through the parsed 
# results stored in our tag_box variable to find <a /> elements with a tag class.

# We used find_all this time because we want to capture all results, instead of a single or specific one.

# Next, we've added a for loop. This for loop cycles through each tag in the tags variable, strips the HTML code out 
# of it, and then prints only the text of each tag.



In [15]:
# Scrape the top ten tags.
# Narrow the search to the <div /> with class "tags-box", then collect every <a class="tag"> inside it.
tag_box = html_soup.find('div', class_='tags-box')
tags = tag_box.find_all('a', class_='tag')

# Print the visible text of each tag link, one per line.
for anchor in tags:
    print(anchor.get_text())

love
inspirational
life
humor
books
reading
friendship
friends
truth
simile


In [None]:
# Scrape Across Pages

# Now that we've practiced scraping items from a single page, we're going to up the ante by scraping items that span 
# multiple pages. Our next section of code will scrape the quotes on the first page, click the "Next" button, then 
# scrape more quotes and so on (five pages worth of quotes).



In [17]:
# Our next two lines of code do two things: 
# They assign an actual URL to the variable named "url" and then tell Splinter to visit that webpage. 
# Go ahead and execute this cell. This will cause the automated browser to navigate there.

# Load the site's root URL again so the automated browser is on it before the multi-page loop runs.
url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [18]:
# In the next cell, we'll create a for loop to collect each quote, "click" the next button, then collect the next set 
# of quotes. We'll use range(1, 6) in our for loop to visit the first five pages of the website.

# Scrape the quotes from five consecutive pages, printing each quote under a header
# that shows the page number it came from.
for x in range(1, 6):
    # Re-read and re-parse the page source on every pass: after a click the browser
    # holds a different page, so the previous soup object is stale.
    html = browser.html
    quote_soup = soup(html, 'html.parser')
    quotes = quote_soup.find_all('span', class_='text')
    for quote in quotes:
        print('page:', x, '----------')
        print(quote.text)
    # find_by_partial_text() only locates the "Next" link; .click() is what actually
    # navigates. Without it the loop scraped page 1 five times.
    browser.links.find_by_partial_text('Next').click()

page: 1 ----------
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
page: 1 ----------
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
page: 1 ----------
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
page: 1 ----------
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
page: 1 ----------
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
page: 1 ----------
“Try not to become a man of success. Rather become a man of value.”
page: 1 ----------
“It is better to be hated for what you are than to be loved for what you are not.”
page: 1 ----------
“I have not failed. I've just found 10,000 ways that won't work.”
page: 1 ----------
“A woman is like a tea bag; you never know how strong it is u

In [None]:
# It's important to note that there are many ways that BeautifulSoup can search for text, but the syntax is typically 
# the same: we look for a tag first, then an attribute. 

# We can search for items using only a tag, such as a <span /> or <h1 />, but a class or id attribute makes the search
# that much more specific.

# By including an attribute, we have a far better chance of scraping the data we want.

In [None]:
# To create a new div element that acts as a container and has an id of "box", which of these options would you use?
# <div class="container" id="box"></div>

In [None]:
# What would happen if we ran soup.find_all('div', class_='quote') instead of soup.find_all('span', class_='text')?

# We would scrape the parent element and grab everything instead of just the quotes.


In [20]:
# Finally, end the session properly by quitting the automated browser rather than just closing its window.
browser.quit()

