### SECTION 15 : WEB SCRAPING

In [1]:
import requests

In [2]:
result = requests.get('http://www.example.com')

In [3]:
type(result)

requests.models.Response

In [4]:
result.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [5]:
import bs4

In [6]:
soup = bs4.BeautifulSoup(result.text,'lxml')

In [7]:
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [8]:
soup.select('title')

[<title>Example Domain</title>]

In [9]:
soup.select('p')

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [10]:
soup.select('h1')

[<h1>Example Domain</h1>]

In [11]:
soup.select('title')[0]

<title>Example Domain</title>

In [12]:
soup.select('title')[0].getText()

'Example Domain'

In [13]:
site_paragraphs = soup.select('p')

In [14]:
site_paragraphs[0]

<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>

In [15]:
type(site_paragraphs[0])

bs4.element.Tag

###### Grabbing all elements of a class

In [16]:
res = requests.get("https://en.wikipedia.org/wiki/Grace_Hopper")

In [17]:
soup = bs4.BeautifulSoup(res.text,"lxml")

In [18]:
#soup

In [19]:
soup.select('.toctext')

[<span class="toctext">Early life and education</span>,
 <span class="toctext">Career</span>,
 <span class="toctext">World War II</span>,
 <span class="toctext">UNIVAC</span>,
 <span class="toctext">COBOL</span>,
 <span class="toctext">Standards</span>,
 <span class="toctext">Retirement</span>,
 <span class="toctext">Post-retirement</span>,
 <span class="toctext">Anecdotes</span>,
 <span class="toctext">Death</span>,
 <span class="toctext">Dates of rank</span>,
 <span class="toctext">Awards and honors</span>,
 <span class="toctext">Military awards</span>,
 <span class="toctext">Other awards</span>,
 <span class="toctext">Legacy</span>,
 <span class="toctext">Places</span>,
 <span class="toctext">Programs</span>,
 <span class="toctext">In popular culture</span>,
 <span class="toctext">Grace Hopper Celebration of Women in Computing</span>,
 <span class="toctext">See also</span>,
 <span class="toctext">Notes</span>,
 <span class="toctext">Obituary notices</span>,
 <span class="toctext">Re

In [20]:
type(soup.select('.toctext')[0])

bs4.element.Tag

In [21]:
first_item = soup.select('.toctext')[0]

In [22]:
first_item.text

'Early life and education'

In [23]:
for item in soup.select('.toctext'):
    print(item.text)

Early life and education
Career
World War II
UNIVAC
COBOL
Standards
Retirement
Post-retirement
Anecdotes
Death
Dates of rank
Awards and honors
Military awards
Other awards
Legacy
Places
Programs
In popular culture
Grace Hopper Celebration of Women in Computing
See also
Notes
Obituary notices
References
Further reading
External links


###### Grabbing an Image

In [24]:
res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)")

In [25]:
soup = bs4.BeautifulSoup(res.text,'lxml')

In [26]:
soup.select('img')

[<img alt="Deep Blue.jpg" data-file-height="601" data-file-width="400" decoding="async" height="331" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/330px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="220"/>,
 <img alt="Chess Programming.svg" data-file-height="60" data-file-width="60" decoding="async" height="150" src="//upload.wikimedia.org/wikipedia/commons/thumb/5/52/Chess_Programming.svg/150px-Chess_Programming.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/5/52/Chess_Programming.svg/225px-Chess_Programming.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/5/52/Chess_Programming.svg/300px-Chess_Programming.svg.png 2x" width="150"/>,
 <img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/

In [27]:
soup.select('img')[0]

<img alt="Deep Blue.jpg" data-file-height="601" data-file-width="400" decoding="async" height="331" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/330px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="220"/>

In [28]:
soup.select('.thumbimage')

[<img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>]

In [29]:
computer = soup.select('.thumbimage')[0]

In [30]:
computer

<img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>

In [31]:
type(computer)

bs4.element.Tag

In [32]:
computer['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png'

In [33]:
image_link = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg')

In [34]:
#image_link.content

In [35]:
f = open ('my_computer_image.jpg','wb')

In [36]:
f.write(image_link.content)

16806

In [37]:
f.close()

##### Book Examples_Part_One - Working with Multiple Pages and Items

In [38]:
# Get the title of every book with a 2 star reading
import requests
import bs4

In [39]:
'http://books.toscrape.com/catalogue/page-2.html'

'http://books.toscrape.com/catalogue/page-2.html'

In [40]:
'http://books.toscrape.com/catalogue/page-3.html'

'http://books.toscrape.com/catalogue/page-3.html'

In [41]:
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

In [42]:
base_url.format('20')

'http://books.toscrape.com/catalogue/page-20.html'

In [43]:
page_num = 12
base_url.format(page_num)

'http://books.toscrape.com/catalogue/page-12.html'

In [44]:
res = requests.get(base_url.format(1))

In [45]:
soup = bs4.BeautifulSoup(res.text,'lxml')

In [46]:
len(soup.select(".product_pod"))

20

In [47]:
products = soup.select(".product_pod")

##### Book Examples_Part_Two - Working with Multiple Pages and Items

In [48]:
example = products[0]

In [49]:
example

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [50]:
str(example)

'<article class="product_pod">\n<div class="image_container">\n<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>\n</div>\n<p class="star-rating Three">\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n</p>\n<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>\n<div class="product_price">\n<p class="price_color">Â£51.77</p>\n<p class="instock availability">\n<i class="icon-ok"></i>\n    \n        In stock\n    \n</p>\n<form>\n<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>\n</form>\n</div>\n</article>'

In [51]:
'star-rating Three' in example

False

In [52]:
example.select(".star-rating.Three")

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>]

In [53]:
example.select(".star-rating.Two")

[]

In [54]:
[] == example.select(".star-rating.Two")

True

In [55]:
example.select('a')

[<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>,
 <a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>]

In [56]:
example.select('a')[1]

<a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [57]:
example.select('a')[1]['title']

'A Light in the Attic'

- We can check if something is 2 stars (string call in, example.select(rating))
- example.select('a')[1]['title'] to grab the book title

In [58]:
two_star_titles = []

for n in range(1,51):
    
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    
    soup = bs4.BeautifulSoup(res.text,'lxml')
    books = soup.select(".product_pod")
    
    for book in books:

        if len(book.select('.star-rating-Two'))!=0:
            book_title = book.select('a')[1]['title']
            two_star_titles.append(book_title)

In [59]:
two_star_titles

[]