# BeautifulSoup

In [73]:
import requests as r
from bs4 import BeautifulSoup

In [2]:
resp = r.get("https://books.toscrape.com/")

In [3]:
resp.content



In [4]:
soup = BeautifulSoup(resp.content, "html.parser")  # lxml

In [5]:
type(soup)

bs4.BeautifulSoup

In [6]:
soup.title

<title>
    All products | Books to Scrape - Sandbox
</title>

In [None]:
print(soup.prettify())

In [None]:
soup.html

In [9]:
soup.name

'[document]'

# Tags

In [10]:
soup.title

<title>
    All products | Books to Scrape - Sandbox
</title>

In [11]:
soup.h1

<h1>All products</h1>

In [12]:
soup.div

<div class="page_inner">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>

In [13]:
first_div = soup.div

In [14]:
type(first_div)

bs4.element.Tag

In [15]:
first_div.attrs

{'class': ['page_inner']}

In [16]:
first_div.div.div.attrs

{'class': ['col-sm-8', 'h1']}

In [17]:
first_div.attrs['class'].append("some-other-class")

In [18]:
first_div

<div class="page_inner some-other-class">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>

In [19]:
soup.div

<div class="page_inner some-other-class">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>

# Parents, Children, And Descendants

In [20]:
soup.ul

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

In [21]:
print(soup.ul.prettify())

<ul class="breadcrumb">
 <li>
  <a href="index.html">
   Home
  </a>
 </li>
 <li class="active">
  All products
 </li>
</ul>



In [22]:
soup.ul.children

<list_iterator at 0x1b3224ceef0>

In [23]:
list(soup.ul.children)

['\n',
 <li>
 <a href="index.html">Home</a>
 </li>,
 '\n',
 <li class="active">All products</li>,
 '\n']

In [24]:
from bs4.element import NavigableString

In [25]:
list(filter(lambda x: type(x) != NavigableString, soup.ul.children))

[<li>
 <a href="index.html">Home</a>
 </li>,
 <li class="active">All products</li>]

In [26]:
def no_nav_strings(iterable):
    return list(filter(lambda x: type(x) != NavigableString, iterable))

In [27]:
no_nav_strings(soup.ul.children)

[<li>
 <a href="index.html">Home</a>
 </li>,
 <li class="active">All products</li>]

In [28]:
list(soup.ul.descendants)

['\n',
 <li>
 <a href="index.html">Home</a>
 </li>,
 '\n',
 <a href="index.html">Home</a>,
 'Home',
 '\n',
 '\n',
 <li class="active">All products</li>,
 'All products',
 '\n']

In [29]:
desc = no_nav_strings(soup.ul.descendants)

In [30]:
desc[0]

<li>
<a href="index.html">Home</a>
</li>

In [31]:
desc[0].parent

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

# Siblings

In [32]:
soup.ul

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

In [33]:
soup.ul.li

<li>
<a href="index.html">Home</a>
</li>

In [34]:
soup.ul.li.next_sibling.next_sibling

<li class="active">All products</li>

In [35]:
soup.ul.li \
    .next_sibling \
    .next_sibling \
    .previous_sibling \
    .previous_sibling

<li>
<a href="index.html">Home</a>
</li>

# Extracting Text

In [36]:
soup.a

<a href="index.html">Books to Scrape</a>

In [37]:
soup.a.get_text()

'Books to Scrape'

In [38]:
soup.a.text

'Books to Scrape'

In [39]:
soup.a.string

'Books to Scrape'

In [40]:
soup.ul

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

In [41]:
soup.ul.get_text()

'\n\nHome\n\nAll products\n'

In [42]:
soup.ul.text

'\n\nHome\n\nAll products\n'

In [43]:
soup.ul.string  # None

In [44]:
print(soup.a.text, " of type ", type(soup.a.text))
print(soup.a.get_text(), " of type ", type(soup.a.get_text()))
print(soup.a.string, " of type ", type(soup.a.string))

Books to Scrape  of type  <class 'str'>
Books to Scrape  of type  <class 'str'>
Books to Scrape  of type  <class 'bs4.element.NavigableString'>


In [45]:
soup.ul.text

'\n\nHome\n\nAll products\n'

In [46]:
soup.ul.get_text()

'\n\nHome\n\nAll products\n'

In [47]:
soup.ul.get_text(separator=", ", strip=True)

'Home, All products'

# All Strings

In [48]:
soup.stripped_strings

<generator object PageElement.stripped_strings at 0x000001B321D50E10>

In [49]:
all_strings = list(soup.stripped_strings)

In [50]:
len(all_strings)

147

In [51]:
len(list(soup.strings))

852

In [52]:
list(soup.strings)[:10]

['\n',
 '\n',
 '\n',
 '\n',
 ' ',
 ' ',
 '\n',
 '\n',
 '\n    All products | Books to Scrape - Sandbox\n',
 '\n']

# Search

In [None]:
soup

In [54]:
# - find() -> like find_all(), but returns only 1st result 
# - find_all()

In [55]:
len(soup.find_all())

541

In [56]:
len(soup.find_all("a"))

94

In [57]:
len(soup.find_all(["a", "p"]))

154

In [58]:
# tag_name: p
# attr: class=price_color

In [77]:
price_tags = soup.find_all("p", attrs={"class": "price_color"})
price_tags

[<p class="price_color">£51.77</p>,
 <p class="price_color">£53.74</p>,
 <p class="price_color">£50.10</p>,
 <p class="price_color">£47.82</p>,
 <p class="price_color">£54.23</p>,
 <p class="price_color">£22.65</p>,
 <p class="price_color">£33.34</p>,
 <p class="price_color">£17.93</p>,
 <p class="price_color">£22.60</p>,
 <p class="price_color">£52.15</p>,
 <p class="price_color">£13.99</p>,
 <p class="price_color">£20.66</p>,
 <p class="price_color">£17.46</p>,
 <p class="price_color">£52.29</p>,
 <p class="price_color">£35.02</p>,
 <p class="price_color">£57.25</p>,
 <p class="price_color">£23.88</p>,
 <p class="price_color">£37.59</p>,
 <p class="price_color">£51.33</p>,
 <p class="price_color">£45.17</p>]

In [76]:
price_tags = soup.find_all("p", class_="price_color")
price_tags

[<p class="price_color">£51.77</p>,
 <p class="price_color">£53.74</p>,
 <p class="price_color">£50.10</p>,
 <p class="price_color">£47.82</p>,
 <p class="price_color">£54.23</p>,
 <p class="price_color">£22.65</p>,
 <p class="price_color">£33.34</p>,
 <p class="price_color">£17.93</p>,
 <p class="price_color">£22.60</p>,
 <p class="price_color">£52.15</p>,
 <p class="price_color">£13.99</p>,
 <p class="price_color">£20.66</p>,
 <p class="price_color">£17.46</p>,
 <p class="price_color">£52.29</p>,
 <p class="price_color">£35.02</p>,
 <p class="price_color">£57.25</p>,
 <p class="price_color">£23.88</p>,
 <p class="price_color">£37.59</p>,
 <p class="price_color">£51.33</p>,
 <p class="price_color">£45.17</p>]

In [61]:
[price.get_text() for price in price_tags]

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.59',
 '£51.33',
 '£45.17']

In [62]:
add_buttons = soup.find_all("button",
                            attrs={"data-loading-text": "Adding..."})

In [63]:
len(add_buttons)

20

In [64]:
# contains: "add", "remove", case-insensitive

In [65]:
add_buttons = soup.find_all("button",
                            attrs={"data-loading-text": lambda x: "add" in x.lower() or "remove" in x.lower()})

In [66]:
len(add_buttons)

20

# Challenge

> Extract the following elements from the 1st page of books.toscrape.com:

  * full book title 
  * price as float
  * rating as int

> Data should be stored as python list of dictionaries, where each book is 
dictionary

> e.g.
  
  {
    'title': 'Mesaerion: The Best Science Fiction Stories 1800-1849',
    'price': 37.59,
    'rating': 1
  }

In [2]:
import requests as r
from bs4 import BeautifulSoup
resp = r.get("https://books.toscrape.com/")
soup = BeautifulSoup(resp.content, "html.parser")
type(soup)

bs4.BeautifulSoup

In [3]:
len(soup.find_all())

541

In [6]:
book_tags = soup.find_all("article", attrs={"class": "product_pod"})
len(book_tags)

20

In [13]:
extract_book_data(book_tags[5])

('The Requiem Red', '£22.65', 'One')

In [14]:
print(book_tags[0].prettify())

<article class="product_pod">
 <div class="image_container">
  <a href="catalogue/a-light-in-the-attic_1000/index.html">
   <img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/>
  </a>
 </div>
 <p class="star-rating Three">
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
 </p>
 <h3>
  <a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">
   A Light in the ...
  </a>
 </h3>
 <div class="product_price">
  <p class="price_color">
   £51.77
  </p>
  <p class="instock availability">
   <i class="icon-ok">
   </i>
   In stock
  </p>
  <form>
   <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">
    Add to basket
   </button>
  </form>
 </div>
</article>



In [23]:
refinement_test = extract_book_data(book_tags[6])
refinement_test

{'title': 'The Dirty Little Secrets of Getting Your Dream Job',
 'price': 33.34,
 'rating': 4}

In [18]:
import re

def clean_price(price):
    return float(re.sub("[^0-9.]", "", price))

33.34

In [29]:
def map_rating(rating):
  rating_map = {"One": 1,
                "Two": 2,
                "Three": 3,
                "Four": 4,
                "Five": 5
  }
  
  return rating_map[rating]

In [30]:
def extract_book_data(book_tag):
  title = book_tag.find("h3").find("a")["title"]
  price = book_tag.find("p", attrs={"class": "price_color"}).get_text()
  rating = book_tag.find("p", attrs={"class": "star-rating"})["class"][-1]
  
  return {
    "title": title,
    "price": clean_price(price),
    "rating": map_rating(rating)
  }

In [32]:
book_tags = soup.find_all("article", attrs={"class": "product_pod"})

book_data = [extract_book_data(book_tag) for book_tag in book_tags]

In [33]:
len(book_data)

20

In [19]:
from random import choice

In [35]:
extract_book_data(choice(book_tags))

{'title': 'Sapiens: A Brief History of Humankind', 'price': 54.23, 'rating': 5}

In [37]:
book_data

[{'title': 'A Light in the Attic', 'price': 51.77, 'rating': 3},
 {'title': 'Tipping the Velvet', 'price': 53.74, 'rating': 1},
 {'title': 'Soumission', 'price': 50.1, 'rating': 1},
 {'title': 'Sharp Objects', 'price': 47.82, 'rating': 4},
 {'title': 'Sapiens: A Brief History of Humankind',
  'price': 54.23,
  'rating': 5},
 {'title': 'The Requiem Red', 'price': 22.65, 'rating': 1},
 {'title': 'The Dirty Little Secrets of Getting Your Dream Job',
  'price': 33.34,
  'rating': 4},
 {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'price': 17.93,
  'rating': 3},
 {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'price': 22.6,
  'rating': 4},
 {'title': 'The Black Maria', 'price': 52.15, 'rating': 1},
 {'title': 'Starving Hearts (Triangular Trade Trilogy, #1)',
  'price': 13.99,
  'rating': 2},
 {'title': "Shakespeare's Sonnets", 'price': 20.66, 'rating': 4},
 {'title': 'Set 

In [38]:
import pandas as pd

In [39]:
df = pd.DataFrame(book_data)

In [40]:
df

Unnamed: 0,title,price,rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
5,The Requiem Red,22.65,1
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,The Black Maria,52.15,1


In [41]:
df.price.mean()

38.048500000000004

In [42]:
df[df.price < 20].title

7     The Coming Woman: A Novel Based on the Life of...
10       Starving Hearts (Triangular Trade Trilogy, #1)
12                                          Set Me Free
Name: title, dtype: object

In [None]:
df.to_csv("book.csv", index=False)

In [43]:
df.to_json("books.json", orient="records")

# Solution

In [1]:
import requests as r
from bs4 import BeautifulSoup

In [2]:
resp = r.get("https://books.toscrape.com/")
soup = BeautifulSoup(resp.content)

In [4]:
len(soup.find_all())

541

In [5]:
book_tags = soup.find_all("article", attrs={"class": "product_pod"})

In [6]:
len(book_tags)

20

In [17]:
def extract_book_data(book_tag):
    title = book_tag.find("h3").find("a")["title"]
    price = book_tag.find("p", attrs={"class": "price_color"}).get_text()
    rating = book_tag.find("p", attrs={"class": "star-rating"})["class"][-1]

    return title, price, rating

In [21]:
extract_book_data(book_tags[5])

('The Requiem Red', '£22.65', 'One')

In [8]:
print(book_tags[0].prettify())

<article class="product_pod">
 <div class="image_container">
  <a href="catalogue/a-light-in-the-attic_1000/index.html">
   <img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/>
  </a>
 </div>
 <p class="star-rating Three">
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
  <i class="icon-star">
  </i>
 </p>
 <h3>
  <a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">
   A Light in the ...
  </a>
 </h3>
 <div class="product_price">
  <p class="price_color">
   £51.77
  </p>
  <p class="instock availability">
   <i class="icon-ok">
   </i>
   In stock
  </p>
  <form>
   <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">
    Add to basket
   </button>
  </form>
 </div>
</article>



# Solution Refinement

In [24]:
len(book_tags)

20

In [25]:
def extract_book_data(book_tag):
    title = book_tag.find("h3").find("a")["title"]
    price = book_tag.find("p", attrs={"class": "price_color"}).get_text()
    rating = book_tag.find("p", attrs={"class": "star-rating"})["class"][-1]

    return title, price, rating

In [39]:
b = extract_book_data(book_tags[6])
b

('The Dirty Little Secrets of Getting Your Dream Job', '£33.34', 'Four')

In [36]:
def clean_price(price):
    return float("".join([char for char in price if char.isdigit() or char == "."]))

In [37]:
type(clean_price(b[1]))

float

In [38]:
clean_price(b[1])

33.34

In [40]:
import re


def clean_price(price):
    return float(re.sub("[^0-9.]", "", price))

In [41]:
clean_price(b[1])

33.34

In [51]:
def map_rating(rating):
    rating_map = {
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5,
    }

    return rating_map[rating]

In [63]:
def extract_book_data(book_tag):
    title = book_tag.find("h3").find("a")["title"]
    price = book_tag.find("p", attrs={"class": "price_color"}).get_text()
    rating = book_tag.find("p", attrs={"class": "star-rating"})["class"][-1]

    return {
        "title": title,
        "price": clean_price(price),
        "rating": map_rating(rating)
    }

In [64]:
book_tags = soup.find_all("article", attrs={"class": "product_pod"})

book_data = [extract_book_data(book_tag) for book_tag in book_tags]

In [66]:
len(book_data)

20

In [53]:
from random import choice

In [62]:
extract_book_data(choice(book_tags))

{'title': 'Set Me Free', 'price': 17.46, 'rating': 5}

# An Extra: pandas

In [67]:
book_data

[{'title': 'A Light in the Attic', 'price': 51.77, 'rating': 3},
 {'title': 'Tipping the Velvet', 'price': 53.74, 'rating': 1},
 {'title': 'Soumission', 'price': 50.1, 'rating': 1},
 {'title': 'Sharp Objects', 'price': 47.82, 'rating': 4},
 {'title': 'Sapiens: A Brief History of Humankind',
  'price': 54.23,
  'rating': 5},
 {'title': 'The Requiem Red', 'price': 22.65, 'rating': 1},
 {'title': 'The Dirty Little Secrets of Getting Your Dream Job',
  'price': 33.34,
  'rating': 4},
 {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'price': 17.93,
  'rating': 3},
 {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'price': 22.6,
  'rating': 4},
 {'title': 'The Black Maria', 'price': 52.15, 'rating': 1},
 {'title': 'Starving Hearts (Triangular Trade Trilogy, #1)',
  'price': 13.99,
  'rating': 2},
 {'title': "Shakespeare's Sonnets", 'price': 20.66, 'rating': 4},
 {'title': 'Set 

In [70]:
# find average price of all books
sum([book["price"] for book in book_data]) / len(book_data)

38.048500000000004

In [73]:
# find titles with a price less than 20
[book['title'] for book in book_data if book["price"] < 20]

['The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 'Set Me Free']

In [74]:
!pip install pandas==1.5.3

Collecting pandas==1.5.3
  Downloading pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[?25l     |                                | 10 kB 27.7 MB/s eta 0:00:01     |                                | 20 kB 33.1 MB/s eta 0:00:01     |                                | 30 kB 40.7 MB/s eta 0:00:01     |                                | 40 kB 29.0 MB/s eta 0:00:01     |▏                               | 51 kB 31.0 MB/s eta 0:00:01     |▏                               | 61 kB 34.5 MB/s eta 0:00:01     |▏                               | 71 kB 37.2 MB/s eta 0:00:01     |▏                               | 81 kB 40.1 MB/s eta 0:00:01     |▎                               | 92 kB 35.5 MB/s eta 0:00:01     |▎                               | 102 kB 36.5 MB/s eta 0:00:01     |▎                               | 112 kB 36.5 MB/s eta 0:00:01     |▎                               | 122 kB 36.5 MB/s eta 0:00:01     |▍                               | 133 kB 36.5 

In [75]:
import pandas as pd

In [76]:
df = pd.DataFrame(book_data)

In [77]:
df

Unnamed: 0,title,price,rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
5,The Requiem Red,22.65,1
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,The Black Maria,52.15,1


In [78]:
df.price.mean()

38.048500000000004

In [81]:
df[df.price < 20].title

In [84]:
df.to_csv("book.csv", index=False)

In [85]:
df.to_json("books.json", orient="records")

# Functional Search Patterns

In [89]:
soup.find_all(id="messages")

[<div id="messages">
 </div>]

In [90]:
soup.find_all(attrs={"id": "messages"})

[<div id="messages">
 </div>]

In [92]:
len(soup.find_all(attrs={"id": lambda x: x is not None}))

4

In [93]:
len(soup.find_all(id=lambda x: x is not None))

4

In [95]:
len(soup.find_all(lambda x: x.has_attr("id")))

4

In [97]:
def fiction_category_anchor(tag):
    return tag.name == "a" and "category" in tag["href"] and "Fiction" in tag.text

In [99]:
len(soup.find_all(fiction_category_anchor))

6

# Text Search

In [100]:
soup.find_all(text="Fiction")

[]

In [101]:
import re

re.compile("Fiction", re.I)

re.compile(r'Fiction', re.IGNORECASE|re.UNICODE)

In [105]:
soup.find_all(text=re.compile("Fiction", re.I))

['\n                            \n                                Historical Fiction\n                            \n                        ',
 '\n                            \n                                Womens Fiction\n                            \n                        ',
 '\n                            \n                                Fiction\n                            \n                        ',
 '\n                            \n                                Nonfiction\n                            \n                        ',
 '\n                            \n                                Science Fiction\n                            \n                        ',
 '\n                            \n                                Adult Fiction\n                            \n                        ',
 '\n                            \n                                Christian Fiction\n                            \n                        ']

In [44]:
text_matches = soup.find_all(text=re.compile("Fiction", re.I))
text_matches

  text_matches = soup.find_all(text=re.compile("Fiction", re.I))


['\n                            \n                                Historical Fiction\n                            \n                        ',
 '\n                            \n                                Womens Fiction\n                            \n                        ',
 '\n                            \n                                Fiction\n                            \n                        ',
 '\n                            \n                                Nonfiction\n                            \n                        ',
 '\n                            \n                                Science Fiction\n                            \n                        ',
 '\n                            \n                                Adult Fiction\n                            \n                        ',
 '\n                            \n                                Christian Fiction\n                            \n                        ']

In [107]:
[text.strip() for text in text_matches]

['Historical Fiction',
 'Womens Fiction',
 'Fiction',
 'Nonfiction',
 'Science Fiction',
 'Adult Fiction',
 'Christian Fiction']

In [108]:
all_text = list(soup.stripped_strings)
[text for text in all_text if "fiction" in text.lower()]

['Historical Fiction',
 'Womens Fiction',
 'Fiction',
 'Nonfiction',
 'Science Fiction',
 'Adult Fiction',
 'Christian Fiction']

In [115]:
# how about only anchors that meet this text-based criterion?

soup.find_all("a", text=re.compile("Fiction", re.I))

[<a href="catalogue/category/books/historical-fiction_4/index.html">
                             
                                 Historical Fiction
                             
                         </a>,
 <a href="catalogue/category/books/womens-fiction_9/index.html">
                             
                                 Womens Fiction
                             
                         </a>,
 <a href="catalogue/category/books/fiction_10/index.html">
                             
                                 Fiction
                             
                         </a>,
 <a href="catalogue/category/books/nonfiction_13/index.html">
                             
                                 Nonfiction
                             
                         </a>,
 <a href="catalogue/category/books/science-fiction_16/index.html">
                             
                                 Science Fiction
                             
                    

In [112]:
all_text = list(soup.strings) # NavigableStrings

In [114]:
[text.parent for text in all_text if "fiction" in text.lower() and text.parent.name == "a"]

[<a href="catalogue/category/books/historical-fiction_4/index.html">
                             
                                 Historical Fiction
                             
                         </a>,
 <a href="catalogue/category/books/womens-fiction_9/index.html">
                             
                                 Womens Fiction
                             
                         </a>,
 <a href="catalogue/category/books/fiction_10/index.html">
                             
                                 Fiction
                             
                         </a>,
 <a href="catalogue/category/books/nonfiction_13/index.html">
                             
                                 Nonfiction
                             
                         </a>,
 <a href="catalogue/category/books/science-fiction_16/index.html">
                             
                                 Science Fiction
                             
                    

# Searching By CSS

In [116]:
book_tags = soup.find_all("article", attrs={"class": "product_pod"})

In [119]:
titles = []
for tag in book_tags:
    title = tag.find("h3").find("a")["title"]
    titles.append(title)
    
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [118]:
title_tags = soup.select("article.product_pod > h3 > a")
titles = [tag["title"] for tag in title_tags]
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [120]:
soup.select("[title]")

[<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>,
 <a href="catalogue/tipping-the-velvet_999/index.html" title="Tipping the Velvet">Tipping the Velvet</a>,
 <a href="catalogue/soumission_998/index.html" title="Soumission">Soumission</a>,
 <a href="catalogue/sharp-objects_997/index.html" title="Sharp Objects">Sharp Objects</a>,
 <a href="catalogue/sapiens-a-brief-history-of-humankind_996/index.html" title="Sapiens: A Brief History of Humankind">Sapiens: A Brief History ...</a>,
 <a href="catalogue/the-requiem-red_995/index.html" title="The Requiem Red">The Requiem Red</a>,
 <a href="catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html" title="The Dirty Little Secrets of Getting Your Dream Job">The Dirty Little Secrets ...</a>,
 <a href="catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html" title="The Coming Woman: A Novel Based on the Life of the 

In [121]:
soup.select("[title*=Human]")

[<a href="catalogue/sapiens-a-brief-history-of-humankind_996/index.html" title="Sapiens: A Brief History of Humankind">Sapiens: A Brief History ...</a>]

In [123]:
len(soup.select("button.btn-primary[data-loading-text][class*=primary]"))

20

In [124]:
len(soup.select("button"))

20

# Just One Tag

In [125]:
# find_all() vs find()

# select() vs select_one()

In [126]:
soup.find_all("a", limit=1)

[<a href="index.html">Books to Scrape</a>]

In [127]:
soup.find("a")

<a href="index.html">Books to Scrape</a>

In [128]:
soup.find_all("a", limit=1)[0] is soup.find("a")

True

In [129]:
soup.select("a", limit=1)

[<a href="index.html">Books to Scrape</a>]

In [130]:
soup.select_one("a")

<a href="index.html">Books to Scrape</a>

In [131]:
soup.select("a", limit=1)[0] is soup.select_one("a")

True