## BeautifulSoup

In [5]:
import requests as r
from bs4 import BeautifulSoup

In [6]:
# Bound the wait and fail loudly on an HTTP error, so later cells never parse
# an error page or hang on a stalled connection.
res = r.get("https://books.toscrape.com/", timeout=10)
res.raise_for_status()
res.content[:300]  # preview only; avoids dumping the entire page into the output



In [7]:
soup = BeautifulSoup(res.content, "html.parser")

In [9]:
soup.title

<title>
    All products | Books to Scrape - Sandbox
</title>

### Tag

In [13]:
first_div = soup.div

In [14]:
type(first_div)

bs4.element.Tag

In [15]:
first_div.attrs

{'class': ['page_inner']}

In [16]:
first_div.attrs['class'].append('container')

In [17]:
soup.html

<html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" rel="stylesheet"/>
<link href="static/oscar/css/datetimepicker.css" rel="stylesheet" type="text/css"/>
</head>
<body class="default" id="default">
<header class="header container-fluid">
<div class="page_inner container">


In [18]:
first_div.attrs['class'].remove('container')

### Parents, Children, And Descendants

In [23]:
print(soup.ul.prettify())

<ul class="breadcrumb">
 <li>
  <a href="index.html">
   Home
  </a>
 </li>
 <li class="active">
  All products
 </li>
</ul>



In [25]:
list(soup.ul.children)

['\n',
 <li>
 <a href="index.html">Home</a>
 </li>,
 '\n',
 <li class="active">All products</li>,
 '\n']

In [26]:
from bs4.element import NavigableString

In [27]:
# isinstance (rather than an exact type comparison) also drops NavigableString
# subclasses such as Comment, so only Tag children remain.
[child for child in soup.ul.children if not isinstance(child, NavigableString)]

[<li>
 <a href="index.html">Home</a>
 </li>,
 <li class="active">All products</li>]

In [28]:
def no_nav_strings(iterable):
    """Return the items of ``iterable`` that are not text nodes, as a list.

    Args:
        iterable: Any iterable of parse-tree nodes, e.g. ``tag.children`` or
            ``tag.descendants``.

    Returns:
        list: The items that are not ``NavigableString`` instances, in their
        original order.
    """
    # isinstance also excludes NavigableString subclasses (e.g. Comment),
    # which an exact ``type(x) != NavigableString`` comparison lets through.
    return [node for node in iterable if not isinstance(node, NavigableString)]

In [29]:
no_nav_strings(soup.ul.children)

[<li>
 <a href="index.html">Home</a>
 </li>,
 <li class="active">All products</li>]

In [32]:
desc = no_nav_strings(soup.ul.descendants)

In [33]:
desc[0].parent

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

### Siblings

In [36]:
# Two steps forward, then two steps back: the chain ends on the <li> it started
# from. The whitespace strings between the <li> tags count as siblings too.
(
    soup.ul.li
    .next_sibling
    .next_sibling
    .previous_sibling
    .previous_sibling
)

<li>
<a href="index.html">Home</a>
</li>

## Extracting Text

In [37]:
soup.a.get_text()

'Books to Scrape'

In [38]:
soup.a.text

'Books to Scrape'

In [39]:
soup.a.string

'Books to Scrape'

In [40]:
soup.ul.get_text()

'\n\nHome\n\nAll products\n'

In [41]:
soup.ul.string

In [45]:
# strip is a boolean flag; pass True rather than the integer 1.
soup.ul.get_text(separator=", ", strip=True)

'Home, All products'

In [46]:
# Compare the three text accessors side by side: .text and .get_text() give a
# plain str, while .string gives a NavigableString.
for value in (soup.a.text, soup.a.get_text(), soup.a.string):
    print(value, " of type ", type(value), ".")

Books to Scrape  of type  <class 'str'> .
Books to Scrape  of type  <class 'str'> .
Books to Scrape  of type  <class 'bs4.element.NavigableString'> .


### All Strings

In [47]:
soup.stripped_strings

<generator object Tag.stripped_strings at 0x7f7989c2f200>

In [48]:
all_strings = list(soup.stripped_strings)

In [49]:
all_strings

['All products | Books to Scrape - Sandbox',
 'Books to Scrape',
 'We love being scraped!',
 'Home',
 'All products',
 'Books',
 'Travel',
 'Mystery',
 'Historical Fiction',
 'Sequential Art',
 'Classics',
 'Philosophy',
 'Romance',
 'Womens Fiction',
 'Fiction',
 'Childrens',
 'Religion',
 'Nonfiction',
 'Music',
 'Default',
 'Science Fiction',
 'Sports and Games',
 'Add a comment',
 'Fantasy',
 'New Adult',
 'Young Adult',
 'Science',
 'Poetry',
 'Paranormal',
 'Art',
 'Psychology',
 'Autobiography',
 'Parenting',
 'Adult Fiction',
 'Humor',
 'Horror',
 'History',
 'Food and Drink',
 'Christian Fiction',
 'Business',
 'Biography',
 'Thriller',
 'Contemporary',
 'Spirituality',
 'Academic',
 'Self Help',
 'Historical',
 'Christian',
 'Suspense',
 'Short Stories',
 'Novels',
 'Health',
 'Politics',
 'Cultural',
 'Erotica',
 'Crime',
 'All products',
 '1000',
 'results - showing',
 '1',
 'to',
 '20',
 '.',
 'This is a demo website for web scraping purposes. Prices and ratings here were 

In [50]:
len(all_strings)

147

In [51]:
len(list(soup.strings)) # .strings keeps the whitespace-only '\n' strings that stripped_strings drops, hence the larger count

852

### Search

In [53]:
# - find()     -> like find_all(), but returns only the first result
# - find_all() -> returns a list of every matching element

In [57]:
price_tags = soup.find_all("p", attrs={"class":"price_color"})

In [58]:
[price.text for price in price_tags]

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.59',
 '£51.33',
 '£45.17']

In [161]:
def find_page_element(page, element, name_class="", text=""):
    """Return the text of every ``element`` tag on ``page`` matching one attribute.

    Args:
        page: Object exposing ``find_all(name, attrs=...)``, e.g. a
            BeautifulSoup document or tag.
        element: Tag name to search for, e.g. ``"p"``.
        name_class: Name of the attribute to filter on, e.g. ``"class"``.
            When left empty, no attribute filter is applied.
        text: Required value of that attribute, e.g. ``"price_color"``.

    Returns:
        list: The ``.text`` of each match, in the order ``find_all`` returned them.
    """
    # Only build a filter when an attribute name was supplied; the defaults
    # used to produce the meaningless filter {"": ""}.
    attrs = {name_class: text} if name_class else {}
    # A distinct loop name avoids shadowing the ``element`` parameter.
    return [match.text for match in page.find_all(element, attrs=attrs)]

## Challenge

In [185]:
import requests as r
from bs4 import BeautifulSoup

In [186]:
res = r.get("https://books.toscrape.com/", timeout=10)
res.raise_for_status()  # stop on an HTTP error rather than scraping an error page
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so results can differ between environments.
soup = BeautifulSoup(res.content, "html.parser")

In [187]:
soup.find_all()

[<html class="no-js" lang="en-us"> <!--<![endif]-->
 <head>
 <title>
     All products | Books to Scrape - Sandbox
 </title>
 <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
 <meta content="24th Jun 2016 09:29" name="created"/>
 <meta content="" name="description"/>
 <meta content="width=device-width" name="viewport"/>
 <meta content="NOARCHIVE,NOCACHE" name="robots"/>
 <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
 <!--[if lt IE 9]>
         <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
         <![endif]-->
 <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
 <link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
 <link href="static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" rel="stylesheet"/>
 <link href="static/oscar/css/datetimepicker.css" rel="stylesheet" type="text/css"/>
 </head>
 <body class="default" id="default">
 <header class="header container-fluid">
 <div class="p

In [188]:
book_tags = soup.find_all("article", attrs={"class":"product_pod"})

In [189]:
book_tags

[<article class="product_pod">
 <div class="image_container">
 <a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="catalogue/tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="thu

In [194]:
def clean_price(price):
    """Strip everything except digits and dots from ``price``.

    Args:
        price: Price text such as ``"£51.77"``.

    Returns:
        str: The digit and ``.`` characters of ``price``, in their original order.
    """
    kept = []
    for symbol in price:
        if symbol == "." or symbol.isdigit():
            kept.append(symbol)
    return "".join(kept)

In [198]:
import re

def clean_price(price):
    """Convert price text such as ``"£51.77"`` to a float.

    Every character other than ``0-9`` and ``.`` is removed before conversion.

    Args:
        price: Price text containing a number plus arbitrary other characters.

    Returns:
        float: The numeric value of the remaining characters.

    Raises:
        ValueError: If what remains is not a valid float literal (e.g. empty).
    """
    non_numeric = re.compile("[^0-9.]")
    numeric_text = non_numeric.sub("", price)
    return float(numeric_text)

In [201]:
def convert_rating(rating):
    """Translate a rating word (``"One"`` .. ``"Five"``) into its integer value.

    Args:
        rating: One of ``"One"``, ``"Two"``, ``"Three"``, ``"Four"``, ``"Five"``.

    Returns:
        int: The matching number from 1 to 5.

    Raises:
        KeyError: If ``rating`` is not one of the five known words.
    """
    words = ("One", "Two", "Three", "Four", "Five")
    # The position in the tuple (counted from 1) is the numeric rating.
    stars_by_word = {word: stars for stars, word in enumerate(words, start=1)}
    return stars_by_word[rating]

In [202]:
def extract_book_data(book_tag):
    """Extract title, price and rating from one ``<article class="product_pod">`` tag.

    Args:
        book_tag: Tag for a single book listing, containing an ``<h3><a title=...>``
            link, a ``<p class="price_color">`` and a ``<p class="star-rating ...">``.

    Returns:
        dict: ``{"title": ..., "price": ..., "rating": ...}`` where the price has
        been passed through ``clean_price`` and the rating through ``convert_rating``.

    Raises:
        IndexError: If the rating tag carries no class besides ``"star-rating"``.
    """
    # The link text is truncated on the page ("A Light in the ..."); the
    # title attribute holds the full title.
    title = book_tag.find("h3").find("a")["title"]
    price = book_tag.find("p", attrs={"class": "price_color"}).get_text()

    rating_tag = book_tag.find("p", attrs={"class": "star-rating"})
    # The rating word is the class that accompanies "star-rating" (e.g. "Three").
    # Select it by value rather than by position so class order does not matter.
    rating = [name for name in rating_tag["class"] if name != "star-rating"][0]

    return {
        "title": title,
        "price": clean_price(price),
        "rating": convert_rating(rating),
    }

In [207]:
book_data = [extract_book_data(book_tag) for book_tag in book_tags]

In [208]:
book_data

[{'title': 'A Light in the Attic', 'price': 51.77, 'rating': 3},
 {'title': 'Tipping the Velvet', 'price': 53.74, 'rating': 1},
 {'title': 'Soumission', 'price': 50.1, 'rating': 1},
 {'title': 'Sharp Objects', 'price': 47.82, 'rating': 4},
 {'title': 'Sapiens: A Brief History of Humankind',
  'price': 54.23,
  'rating': 5},
 {'title': 'The Requiem Red', 'price': 22.65, 'rating': 1},
 {'title': 'The Dirty Little Secrets of Getting Your Dream Job',
  'price': 33.34,
  'rating': 4},
 {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'price': 17.93,
  'rating': 3},
 {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'price': 22.6,
  'rating': 4},
 {'title': 'The Black Maria', 'price': 52.15, 'rating': 1},
 {'title': 'Starving Hearts (Triangular Trade Trilogy, #1)',
  'price': 13.99,
  'rating': 2},
 {'title': "Shakespeare's Sonnets", 'price': 20.66, 'rating': 4},
 {'title': 'Set 

In [209]:
import pandas as pd

In [210]:
df = pd.DataFrame(book_data)

In [213]:
df[df.price < 20]

Unnamed: 0,title,price,rating
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
10,"Starving Hearts (Triangular Trade Trilogy, #1)",13.99,2
12,Set Me Free,17.46,5


## Functional Search Patterns

In [215]:
soup.find_all(id="messages")

[<div id="messages">
 </div>]

In [217]:
len(soup.find_all(lambda x: x.has_attr('id')))

4

In [220]:
def fiction_category_anchor(tag):
    """Tell whether ``tag`` is a category link whose text contains ``"Fiction"``.

    Intended as a filter function for ``find_all``.

    Args:
        tag: The tag to test.

    Returns:
        bool: True when ``tag`` is an ``<a>`` whose href contains ``"category"``
        and whose text contains ``"Fiction"`` (case-sensitive).
    """
    if tag.name != "a":
        return False
    # An <a> without an href simply does not match; indexing with
    # tag["href"] would raise KeyError and abort the whole search.
    href = tag.get("href") or ""
    return "category" in href and "Fiction" in tag.text

In [222]:
len(soup.find_all(fiction_category_anchor))

6

## Text Search

In [228]:
import re

# `string=` is the current name of this argument; `text=` is its deprecated alias.
soup.find_all(string=re.compile("Fiction", re.I)) # re.compile(r'Fiction', re.IGNORECASE|re.UNICODE)

['\n                            \n                                Historical Fiction\n                            \n                        ',
 '\n                            \n                                Womens Fiction\n                            \n                        ',
 '\n                            \n                                Fiction\n                            \n                        ',
 '\n                            \n                                Nonfiction\n                            \n                        ',
 '\n                            \n                                Science Fiction\n                            \n                        ',
 '\n                            \n                                Adult Fiction\n                            \n                        ',
 '\n                            \n                                Christian Fiction\n                            \n                        ']

In [229]:
# `string=` is the current name of this argument; `text=` is its deprecated alias.
text_matches = soup.find_all(string=re.compile("Fiction", re.I))

In [230]:
[text.strip() for text in text_matches]

['Historical Fiction',
 'Womens Fiction',
 'Fiction',
 'Nonfiction',
 'Science Fiction',
 'Adult Fiction',
 'Christian Fiction']

In [231]:
all_text = list(soup.stripped_strings)
[text for text in all_text if 'fiction' in text.lower()]

['Historical Fiction',
 'Womens Fiction',
 'Fiction',
 'Nonfiction',
 'Science Fiction',
 'Adult Fiction',
 'Christian Fiction']

## Searching By CSS

In [234]:
book_tags = soup.find_all('article', attrs={"class":"product_pod"})

In [235]:
# Collect each book's title from the title attribute of its <h3><a> link.
titles = []
for tag in book_tags:
    title = tag.find("h3").find("a")["title"]
    titles.append(title)
titles  # display the collected list

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [236]:
title_tags = soup.select("article.product_pod > h3 > a")
titles = [tag['title'] for tag in title_tags]

In [237]:
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]