# 23. Web Scraping - BeautifulSoup

In [1]:
# Steps to follow:

#1 : load the HTML [requests lib]
#2 : parse the HTML [beautifulsoup lib]
#3 : locate and extract the desired data

In [5]:
from bs4 import BeautifulSoup as bs
import requests

In [57]:
# Sample page used by the cells below. Adjacent string literals are joined
# by Python at compile time, so the value is one single line of markup
# (no newline characters between the pieces).
html = (
    '<!DOCTYPE html>'
    '<html>'
    '<head>'
    '<title> Testing Web Page </title>'
    '</head>'
    '<body>'
    '<h1> Web Scraping </h1>'
    '<p id = "first_para">'
    'Let\'s start learning '
    '<b>'
    'Web Scraping'
    '</b>'
    '</p>'
    '<p class = "abc" id = "second_para">'
    'You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>'
    '</p>'
    '<p class = "abc">'
    '<a href = "https://codingninjas.in/"> Coding Ninjas </a>'
    '</p>'
    '</body>'
    '</html>'
)

In [58]:
# Build the parse tree from the sample markup using the "html.parser"
# backend, then echo the soup as the cell's output.
data = bs(markup=html, features='html.parser')
data

<!DOCTYPE html>
<html><head><title> Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>

In [9]:
# show the type of the object returned by bs(...)
type(data)

bs4.BeautifulSoup

In [10]:
# prettify() renders the parse tree as an indented string, one node per line
pretty_markup = data.prettify()
print(pretty_markup)

<!DOCTYPE html>
<html>
 <head>
  <title>
   Testing Web Page
  </title>
 </head>
 <body>
  <h1>
   Web Scraping
  </h1>
  <p id="first_para">
   Let's start learning
   <b>
    Web Scraping
   </b>
  </p>
  <p class="abc" id="second_para">
   You can read more about BeautifulSoup from
   <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">
    here
   </a>
  </p>
  <p class="abc">
   <a href="https://codingninjas.in/">
    Coding Ninjas
   </a>
  </p>
 </body>
</html>


In [15]:
# Attribute access of the form data.<tag_name> gives the first tag with
# that name; print one example for each of title, head, h1 and p.
for first_match in (data.title, data.head, data.h1, data.p):
    print(first_match)

<title> Testing Web Page </title>
<head><title> Testing Web Page </title></head>
<h1> Web Scraping </h1>
<p id="first_para">Let's start learning <b>Web Scraping</b></p>


In [17]:
# Pull apart the pieces of a single tag
title_tag = data.title
print(title_tag)         # the whole tag, markup included
print(title_tag.name)    # the tag's name
print(title_tag.string)  # only the text inside the tag

<title> Testing Web Page </title>
title
 Testing Web Page 


In [21]:
# .attrs lists every attribute present on a tag as a dict
# (empty for <title>, which has none)
for tag in (data.title, data.p):
    print(tag.attrs)

{}
{'id': 'first_para'}


In [28]:
# Two equivalent ways to read the value of the id attribute
first_paragraph = data.p
print(first_paragraph.get('id'))  # through the get() method
print(first_paragraph['id'])      # through dict-style indexing

first_para
first_para


In [30]:
print(data.get_text()) # text only: extracts all the text in the document, without tags

 Testing Web Page  Web Scraping Let's start learning Web ScrapingYou can read more about BeautifulSoup from  here  Coding Ninjas 


In [33]:
# find() -> returns the first occurrence of the element,
# or None when no such tag exists (as for 'pr' here)
for tag_name in ('p', 'pr'):
    print(data.find(tag_name))

<p id="first_para">Let's start learning <b>Web Scraping</b></p>
None


In [35]:
# find_all() --> every occurrence of the element, collected in a list
paragraphs = data.find_all('p')
print(paragraphs)

# or, one tag per line
for paragraph in paragraphs:
    print(paragraph)

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>, <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>, <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]
<p id="first_para">Let's start learning <b>Web Scraping</b></p>
<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>
<p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>


### Navigate Tree

In [61]:
# searching -> find() and find_all()
# passing a list of names matches every tag whose name is in the list
data.find_all(['p','a'])

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>,
 <a href="https://codingninjas.in/"> Coding Ninjas </a>]

In [63]:
# passing True matches every tag in the document
data.find_all(True)

[<html><head><title> Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>,
 <head><title> Testing Web Page </title></head>,
 <title> Testing Web Page </title>,
 <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>,
 <h1> Web Scraping </h1>,
 <p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <b>Web Scraping</b>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href=

In [65]:
# find tags by the value of their id attribute
data.find_all(id = 'first_para')

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>]

In [68]:
# find tags by CSS class; the keyword is spelled class_ because
# `class` is a reserved word in Python
print(data.find_all(class_ = 'abc'))

[<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>, <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]


#### Going down 

In [72]:
# print the indented tree again as a reference for the navigation cells below
print(data.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Testing Web Page
  </title>
 </head>
 <body>
  <h1>
   Web Scraping
  </h1>
  <p id="first_para">
   Let's start learning
   <b>
    Web Scraping
   </b>
  </p>
  <p class="abc" id="second_para">
   You can read more about BeautifulSoup from
   <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">
    here
   </a>
  </p>
  <p class="abc">
   <a href="https://codingninjas.in/">
    Coding Ninjas
   </a>
  </p>
 </body>
</html>


In [75]:
# A nested tag can be reached through its parent or straight from the soup;
# both routes print the same <title> tag.
head_tag = data.head
print(head_tag)
print(head_tag.title)
print(data.title)

<head><title> Testing Web Page </title></head>
<title> Testing Web Page </title>
<title> Testing Web Page </title>


In [79]:
# .string gives the text held by the <title> tag
print(data.title.string)

 Testing Web Page 


In [83]:
# .string on each paragraph: prints None for the paragraphs that hold
# more than one child node, and the text for the one that holds a single child
for paragraph in data.find_all('p'):
    print(paragraph.string)

None
None
 Coding Ninjas 


In [86]:
# .strings yields every text fragment inside a tag, nested tags included
paragraph_fragments = [list(paragraph.strings) for paragraph in data.find_all('p')]
for fragments in paragraph_fragments:
    print(fragments)

["Let's start learning ", 'Web Scraping']
['You can read more about BeautifulSoup from ', ' here ']
[' Coding Ninjas ']


In [89]:
# .stripped_strings yields the same fragments as .strings, but with the
# surrounding whitespace removed
for paragraph in data.find_all('p'):
    fragments = list(paragraph.stripped_strings)
    print(fragments)

["Let's start learning", 'Web Scraping']
['You can read more about BeautifulSoup from', 'here']
['Coding Ninjas']


In [94]:
# .contents -> the direct children of a tag, as a list
html_children = data.html.contents
print(html_children)
print(len(html_children))  # 2: <head> and <body>


[<head><title> Testing Web Page </title></head>, <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>]
2


In [97]:
# .children -> the direct children again, this time as an iterator
for child in data.html.children:
    print(child)

<head><title> Testing Web Page </title></head>
<body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>


In [106]:
# .descendants visits every nested node under <html>, tags and text alike
for node in data.html.descendants:
    print(node)
    

<head><title> Testing Web Page </title></head>
<title> Testing Web Page </title>
 Testing Web Page 
<body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>
<h1> Web Scraping </h1>
 Web Scraping 
<p id="first_para">Let's start learning <b>Web Scraping</b></p>
Let's start learning 
<b>Web Scraping</b>
Web Scraping
<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>
You can read more about BeautifulSoup from 
<a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>
 here 
<p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>
<a href="https://codingninjas.in/"> Coding Ninjas </a>
 Cod

In [104]:
# Going up: .parent is the node enclosing <html>; .parents is a generator,
# so printing it shows the generator object rather than the ancestors.
html_tag = data.html
print(html_tag.parent)
print(html_tag.parents)

<!DOCTYPE html>
<html><head><title> Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>
<generator object PageElement.parents at 0x000001F81B7B8820>


## ASSIGNMENT SOLUTION 

#### Solution 1

In [38]:
# Assignment page. Adjacent literals are concatenated by Python, so the
# value is a single line of markup with no newline characters.
html = (
    '<!DOCTYPE html><html><head><title>Learning Beautiful Soup</title></head>'
    '<body><h1> About Us </h1><div class = "first_div"><p>Coding Ninjas Website</p>'
    '<a href="https://www.codingninjas.in/">Link to Coding Ninjas Website</a>'
    '<ul><li>This</li><li>is</li><li>an</li><li>unordered</li><li>list.</li></ul>'
    '</div><p id = "template_p">This is a template paragraph tag</p>'
    '<a href = "https://www.facebook.com/codingninjas/">'
    'This is the link of our Facebook Page</a></body></html>'
)
# Solution 1: parse the assignment page and print its <body> element
data = bs(html, features="html.parser")
body_tag = data.body
print(body_tag)

<body><h1> About Us </h1><div class="first_div"><p>Coding Ninjas Website</p><a href="https://www.codingninjas.in/">Link to Coding Ninjas Website</a><ul><li>This</li><li>is</li><li>an</li><li>unordered</li><li>list.</li></ul></div><p id="template_p">This is a template paragraph tag</p><a href="https://www.facebook.com/codingninjas/">This is the link of our Facebook Page</a></body>


#### Solution 2

In [45]:
# Assignment page, written as adjacent literals that Python joins into
# one newline-free string.
html = (
    '<!DOCTYPE html><html><head><title>Learning Beautiful Soup</title></head>'
    '<body><h1> About Us </h1><div class = "first_div"><p>Coding Ninjas Website</p>'
    '<a href="https://www.codingninjas.in/">Link to Coding Ninjas Website</a>'
    '<ul><li>This</li><li>is</li><li>an</li><li>unordered</li><li>list.</li></ul>'
    '</div><p id = "template_p">This is a template paragraph tag</p>'
    '<a href = "https://www.facebook.com/codingninjas/">'
    'This is the link of our Facebook Page</a></body></html>'
)

# Solution 2: print the names of the attributes on the first <div>.
# Iterating over the attrs dict yields its keys, i.e. the attribute names.
data = bs(html, features='html.parser')
for attribute_name in data.div.attrs:
    print(attribute_name)

class


#### Solution 3

In [52]:
# Assignment page; the pieces below are adjacent literals, which Python
# concatenates into a single string without newlines.
html = (
    '<!DOCTYPE html><html><head><title>Learning Beautiful Soup</title></head>'
    '<body><h1> About Us </h1><div class = "first_div"><p>Coding Ninjas Website</p>'
    '<a href="https://www.codingninjas.in/">Link to Coding Ninjas Website</a>'
    '<ul><li>This</li><li>is</li><li>an</li><li>unordered</li><li>list.</li></ul>'
    '</div><p id = "template_p">This is a template paragraph tag</p>'
    '<a href = "https://www.facebook.com/codingninjas/">'
    'This is the link of our Facebook Page</a></body></html>'
)

# Solution 3: print the text of every <li>, separated by spaces on one line
data = bs(html, features="html.parser")
for list_item in data.find_all('li'):
    print(list_item.string, end=" ")

This is an unordered list. 

In [56]:
# Assignment page, expressed as adjacent string literals (joined by Python
# into one string that contains no newline characters).
html = (
    '<!DOCTYPE html><html><head><title>Learning Beautiful Soup</title></head>'
    '<body><h1> About Us </h1><div class = "first_div"><p>Coding Ninjas Website</p>'
    '<a href="https://www.codingninjas.in/">Link to Coding Ninjas Website</a>'
    '<ul><li>This</li><li>is</li><li>an</li><li>unordered</li><li>list.</li></ul>'
    '</div><p id = "template_p">This is a template paragraph tag</p>'
    '<a href = "https://www.facebook.com/codingninjas/">'
    'This is the link of our Facebook Page</a></body></html>'
)

# Print the href target of every <a> tag on the assignment page
data = bs(html, features="html.parser")
for anchor in data.find_all('a'):
    print(anchor.get('href'))

https://www.codingninjas.in/
https://www.facebook.com/codingninjas/
