# Page Scraper

Create an application which connects to a site and pulls out all links, or images, and saves them to a list. 

In [1]:
# Import requests and beautiful soup libraries
import requests
import bs4

In [2]:
# Extract web page HTML
res = requests.get('https://books.toscrape.com/')

In [3]:
# Create soup object for web page contents
soup = bs4.BeautifulSoup(res.text, "lxml")

In [4]:
# Show sample HTML element with CSS class of "product_pod"
# Contains info for each book displayed on website
soup.select('.product_pod')[0]

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [5]:
# Show sample thumbnail class
soup.select('.thumbnail')[0]

<img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/>

In [6]:
image1 = soup.select('.thumbnail')[0]

In [7]:
# Show image source (URL) of sample image
image1['src']

'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'

In [16]:
#Store image source as variable
image1 = soup.select('.thumbnail')[0]['src']

In [17]:
image1

'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'

In [18]:
url_stem = 'https://books.toscrape.com/'

image_url = url_stem + image1

In [19]:
image_url

'https://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'

In [20]:
# Create list of image URLs from first page

# Store book elements as variable
books = soup.select('.product_pod')

# Store image URLs in list
images = []

# URL stem to which image source will be concatenated
url_stem = "https://books.toscrape.com/"

# If book has 5 star rating, add image URL to list
# Concatenate "to scrape" URL stem and image source for image
for book in books:
    if len(book.select('.star-rating.Five')) > 0:
        images.append(url_stem + book.select('.thumbnail')[0]['src'])    

In [21]:
len(images)

4

In [22]:
images

['https://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg',
 'https://books.toscrape.com/media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg',
 'https://books.toscrape.com/media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg',
 'https://books.toscrape.com/media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg']

In [23]:
# Modify code to iterate through all pages and grab image URLs from each page
# This version iterates through fixed number of pages

# Store image URLs in list
images = []

# Create base URL with placeholder for page numbers
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# URL stem to which image source will be concatenated
url_stem = "https://books.toscrape.com/"

for n in range(1,51):

    # Extract contents of each page
    res = requests.get(base_url.format(n))
    
    # Store all HTML contents in soup object
    soup = bs4.BeautifulSoup(res.text,"lxml")

    # Store book elements in a separate object
    books = soup.select('.product_pod')

    # If book has 5 star rating, add image URL to list
    # Concatenate "to scrape" URL stem and image source for image
    for book in books:
        if len(book.select('.star-rating.Five')) > 0:
            images.append(url_stem + book.select('.thumbnail')[0]['src'])      


In [24]:
len(images)

196

In [25]:
images[:10]

['https://books.toscrape.com/../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg',
 'https://books.toscrape.com/../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg',
 'https://books.toscrape.com/../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg',
 'https://books.toscrape.com/../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg',
 'https://books.toscrape.com/../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg',
 'https://books.toscrape.com/../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg',
 'https://books.toscrape.com/../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg',
 'https://books.toscrape.com/../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg',
 'https://books.toscrape.com/../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg',
 'https://books.toscrape.com/../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg']

In [26]:
# Program should ideally be flexible enough to account for any pages that might be added to website

# Create base URL with placeholder for page numbers
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# Test to see what happens when attempting to query nonexistent page
res = requests.get(base_url.format(10000))
res.text

'<html>\r\n<head><title>404 Not Found</title></head>\r\n<body bgcolor="white">\r\n<center><h1>404 Not Found</h1></center>\r\n<hr><center>nginx/1.1.19</center>\r\n</body>\r\n</html>\r\n'

In [27]:
"404 Not Found" in res.text

True

In [28]:
# This version iterates through all pages until it has reached the last pages
# Rather than using fixed page number, this version uses a while loop that breaks when encountering a 404 error

# Store image URLs in list
images = []

# Create base URL with placeholder for page numbers
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# URL stem to which image source will be concatenated
url_stem = "https://books.toscrape.com/"

# Create variable for page numbers
page_num = 1

# Use while loop to keep searching through pages until 404 error is triggered
searching = True

while searching:

    # Extract contents of each page
    res = requests.get(base_url.format(page_num))
    
    # Stop searching upon 404 error 
    if "404 Not Found" in res.text:
        break
    
    else:    
    
        # Store all HTML contents in soup object
        soup = bs4.BeautifulSoup(res.text,"lxml")
    
        # Store book elements in a separate object
        books = soup.select('.product_pod')

        # If book has 5 star rating, add image URL to list
        # Concatenate "to scrape" URL stem and image source for image
        for book in books:
            if len(book.select('.star-rating.Five')) > 0:
                images.append(url_stem + book.select('.thumbnail')[0]['src'])    

        # Proceed to next page
        page_num += 1


In [29]:
len(images)

196

In [30]:
images[:10]

['https://books.toscrape.com/../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg',
 'https://books.toscrape.com/../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg',
 'https://books.toscrape.com/../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg',
 'https://books.toscrape.com/../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg',
 'https://books.toscrape.com/../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg',
 'https://books.toscrape.com/../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg',
 'https://books.toscrape.com/../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg',
 'https://books.toscrape.com/../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg',
 'https://books.toscrape.com/../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg',
 'https://books.toscrape.com/../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg']

<img src= 'https://books.toscrape.com/../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg'>