## GRABBING A TITLE FROM A WEBSITE

In [4]:
import requests

In [5]:
import bs4

In [6]:
result = requests.get("http://www.example.com")

In [7]:
type(result)

requests.models.Response

In [8]:
result.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [9]:
soup = bs4.BeautifulSoup(result.text,"lxml")

In [10]:
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [11]:
soup.select("title")

[<title>Example Domain</title>]

In [12]:
soup.select("title")[0].getText()

'Example Domain'

In [13]:
site_paragraph = soup.select("p")

In [14]:
site_paragraph

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [15]:
site_paragraph[0].getText()

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

In [16]:
site_paragraph[1].getText()

'More information...'

## Grabbing a class

* we previously mentioned a big part of a web scraping with a BeautifulSoup library is figuring out what string syntax to pass into the soup.select() method.

* Let's go through the table with some common examples (these make a lot of sense if we use a CSS syntax)

|Syntax||Match Results|
|------||-------------|
|soup.select("div")||All elements with "div" tag|
|soup.select("#some_id")||Elements containg id="some_id"|
|soup.select(".some_class")||Elements containg class ="some_class"|
|soup.select("div span")|| Any elements named span within a div element.
|soup.select("div>span")|| Any elements named span directly within a div element,with nothing in between.|



In [35]:
res = requests.get("https://en.wikipedia.org/wiki/Sam_Manekshaw")

In [36]:
res.text

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Sam Manekshaw - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"6f5114a8-9ab9-45df-bf21-1da132f7fada","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Sam_Manekshaw","wgTitle":"Sam Manekshaw","wgCurRevisionId":1031743236,"wgRevisionId":1031743236,"wgArticleId":1704980,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using infobox officeholder with unknown parameters","Articles with short description","Short description matches Wikidata","Good articles","Use dmy dates f

In [37]:
soup_1 = bs4.BeautifulSoup(res.text,"lxml")     # here we have small t in .text

In [38]:
soup_1

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Sam Manekshaw - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"6f5114a8-9ab9-45df-bf21-1da132f7fada","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Sam_Manekshaw","wgTitle":"Sam Manekshaw","wgCurRevisionId":1031743236,"wgRevisionId":1031743236,"wgArticleId":1704980,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using infobox officeholder with unknown parameters","Articles with short description","Short description matches Wikidata","Good articles","Use dmy dates from Ap

In [39]:
contents = soup_1.select(".toctext")  # . signifies that it is a class!

In [40]:
contents[0].getText()    # here we have capital T in getText

'Early life and education'

In [47]:
for item in contents:
    print(item.text)
    

Early life and education
Indian Military Academy
Military career
World War II
Post-independence
Chief of Army Staff
Indo-Pakistani War of 1971
Promotion to field marshal
Honours and post-retirement
Personal life and death
Legacy
Awards
Dates of rank
See also
Notes
References
External links


## Grabbing a Image
* Now that we understand how to grab the text information based on tags and element names,lets's explore how to grab images from a website
* Images on a website typically have their own URL link (ending in .jpg or . png)

* Beautifilsoup can scan a page ,locate the <img> tags and grab these URL's
* Then we can download the URL's as images and write them to the computer.
* NOTE: you should always check the copyright permission before downloading and using an image from a website.

In [55]:
Res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)") # deep blue (wikipedia)

In [56]:
Res.text

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Deep Blue (chess computer) - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ee7dd79d-593c-4901-9530-3b5ce49d055e","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Deep_Blue_(chess_computer)","wgTitle":"Deep Blue (chess computer)","wgCurRevisionId":1032218701,"wgRevisionId":1032218701,"wgArticleId":49387,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: postscript","CS1 maint: unfit URL","Webarchive template wayback links","Articles with short description","Short d

In [57]:
Bsoup = bs4.BeautifulSoup(Res.text,"lxml")

In [58]:
Bsoup.select(".thumbimage")

[<img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>]

In [69]:
person = Bsoup.select(".thumbimage")[0]

In [70]:
person

<img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>

In [71]:
person["class"]

['thumbimage']

In [72]:
person['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png'

<img
src ="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png">

In [73]:
image_link = requests.get("https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2 .png/220px-Kasparov_Magath_1985_Hamburg-2.png")

In [74]:
# image_link.content

b'<!DOCTYPE html>\n<html lang="en">\n<meta charset="utf-8">\n<title>Wikimedia Error</title>\n<style>\n* { margin: 0; padding: 0; }\nbody { background: #fff; font: 15px/1.6 sans-serif; color: #333; }\n.content { margin: 7% auto 0; padding: 2em 1em 1em; max-width: 640px; }\n.footer { clear: both; margin-top: 14%; border-top: 1px solid #e5e5e5; background: #f9f9f9; padding: 2em 0; font-size: 0.8em; text-align: center; }\nimg { float: left; margin: 0 2em 2em 0; }\na img { border: 0; }\nh1 { margin-top: 1em; font-size: 1.2em; }\n.content-text { overflow: hidden; overflow-wrap: break-word; word-wrap: break-word; -webkit-hyphens: auto; -moz-hyphens: auto; -ms-hyphens: auto; hyphens: auto; }\np { margin: 0.7em 0 1em 0; }\na { color: #0645ad; text-decoration: none; }\na:hover { text-decoration: underline; }\ncode { font-family: sans-serif; }\n.text-muted { color: #777; }\n</style>\n<div class="content" role="main">\n<a href="https://www.wikimedia.org"><img src="https://www.wikimedia.org/static/

In [79]:
f = open("C:\\Users\\GUPTA\\Desktop my_computer_image.png","wb")

In [80]:
f.write(image_link.content)

1931

In [81]:
f.close()

In [82]:
pwd #  if we want to save our image at this location

'C:\\Users\\GUPTA\\Desktop'

##  WORKING WITH MULTIPLE PAGES AND ITEMS

* we have seen how to grab elements at a time but realistically, we want to be able to grab multiple elements , most likely across
multiiple pages.
* This is where we can use our prior python knowledge with the web scraping libraries to create powerful scripts.

* we will use the site specifically designed to practice web scraping :
    www.toscrape.com
* we will practise grabbing elements across multiple pages:
* So let's get started 

In [None]:
# GOAL: to grab a title of a every book with 2- star rating

In [5]:
import requests 
import bs4

In [None]:
"https://books.toscrape.com/catalogue/page-1.html"  # page 1

In [None]:
"https://books.toscrape.com/catalogue/page-2.html" # page 2

In [None]:
"https://books.toscrape.com/catalogue/page-3.html" # page 3

 so we need to make a loop to extract all the books from all the 50 pages

In [6]:
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

In [37]:
scrape_url = base_url.format(1)

In [38]:
scrape_url

'https://books.toscrape.com/catalogue/page-1.html'

In [7]:
Request_1 = requests.get("https://books.toscrape.com/catalogue/page-1.html")

In [8]:
bsoup = bs4.BeautifulSoup(Request_1.text,"lxml")

In [9]:
bsoup

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--><!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]--><!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:30" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="../static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="../static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="

In [15]:
Products = bsoup.select(".product_pod") #class of the whole product/books

In [11]:
len(bsoup.select(".product_pod")) # all the 20 books of the page 1st.

20

In [17]:
Products

[<article class="product_pod">
 <div class="image_container">
 <a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>, <article class="product_pod">
 <div class="image_container">
 <a href="tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="thumbnail" src="../media/cache

In [20]:
Example = Products[0]

In [21]:
Example   # 1st book 

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

1st method to grab a star rating class from the text(may not work every time)

In [22]:
"star-rating Three" in str(Example)

True

In [23]:
'star-rating Two' in str(Example)

False

2nd method (Always work)

In [28]:
Example.select(".star-rating.Three")   # since there was a space b/wn rating and Three we used .

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>]

In [29]:
Example.select(".star-rating.Two")  # as it is not present for this product

[]

In [30]:
Example.select("a") # since we have 2 title first on the book (image) and other one the actual title

[<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>,
 <a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>]

In [36]:
Example.select("a")[1]["title"] # here it is ! the title of our book 

'A Light in the Attic'

In [56]:
two_star_titles=[]

for pages in range(1,51):
    
    scrape_url = base_url.format(pages)
    Request    = requests.get(scrape_url)
    
    BeaSoup = bs4.BeautifulSoup(Request.text,"lxml")
    Allbooks = BeaSoup.select(".product_pod")
    
    for book in Allbooks:
        if len(book.select(".star-rating.Two")) != 0 :
            book_title = book.select("a")[1]["title"] 
            two_star_titles.append(book_title)
            
    

In [57]:
(two_star_titles)

['Starving Hearts (Triangular Trade Trilogy, #1)',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 'How Music Works',
 'Maude (1883-1993):She Grew Up with the country',
 "You can't bury them all: Poems",
 'Reasons to Stay Alive',
 'Without Borders (Wanderlove #1)',
 'Soul Reader',
 'Security',
 'Saga, Volume 5 (Saga (Collected Editions) #5)',
 'Reskilling America: Learning to Labor in the Twenty-First Century',
 'Political Suicide: Missteps, Peccadilloes, Bad Calls, Backroom Hijinx, Sordid Pasts, Rotten Breaks, and Just Plain Dumb Mistakes in the Annals of American Politics',
 'Obsidian (Lux #1)',
 'My Paris Kitchen: Recipes and Stories',
 'Masks and Shadows',
 'Lumberjanes, Vol. 2: Friendship to the Max (Lumberjanes #5-8)',
 'Lumberjanes Vol. 3: A Terrible Plan (Lumberjanes #9-12)',
 'Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)',
 'I Hate Fairyland, Vol. 1: Madly Ever After (I Hate Fairyland (Compilations) #1-5)',
 'Giant Days, Vol. 2 (Giant Day

In [50]:
"Starving Hearts (Triangular Trade Trilogy, #1)" in str(two_star_titles)

True