# Simplilearn
## Web Scraping with Beautiful Soup

In [56]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [57]:
from bs4 import BeautifulSoup

In [58]:
# Small in-memory HTML document used by the cells below: it contains a heading,
# a <b> holding only a comment, a <p> with attributes, and a <div> with an <h2>.
html_doc="""
<html>
<body>
<h1>
My first Heading
</h1>
<b><!--This is a comment--></b>
<p title= "About me" class="test">Mt first Paragraph</p>
<div class="citites">
<h2>Nairobi</h2>
</div>
</body>
</html>"""

In [59]:
# Parse the HTML string with the 'html.parser' backend and keep the resulting
# document tree in `soup`.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [60]:
# Inspect the type of the parsed document (a BeautifulSoup object, per the output).
type(soup)

bs4.BeautifulSoup

In [61]:
# Print the parsed document back out as markup.
print(soup)


<html>
<body>
<h1>
My first Heading
</h1>
<b><!--This is a comment--></b>
<p class="test" title="About me">Mt first Paragraph</p>
<div class="citites">
<h2>Nairobi</h2>
</div>
</body>
</html>


In [62]:
# Grab the first <p> element of the document as a Tag object.
tag = soup.find('p')

In [63]:
# Inspect the type of the object returned for the <p> element.
type(tag)

bs4.element.Tag

In [64]:
# Print the tag: the output is the element's full markup, attributes included.
print(tag)

<p class="test" title="About me">Mt first Paragraph</p>


In [65]:
# The <b> element contains nothing but an HTML comment, so its .string is a
# Comment object rather than ordinary text.
comment=soup.b.string

In [66]:
# Inspect the type of the extracted comment.
type(comment)

bs4.element.Comment

In [67]:
# Printing the comment shows only its text, without the <!-- --> markers.
print(comment)

This is a comment


In [68]:
# View the tag's attributes as a dictionary; note in the output that `class`
# comes back as a list while `title` is a plain string.
tag.attrs

{'title': 'About me', 'class': ['test']}

In [69]:
# View the text held inside the <p> tag.
tag.string

'Mt first Paragraph'

In [70]:
# The tag's text is not a plain str: its type is NavigableString (see output).
type(tag.string)

bs4.element.NavigableString

## Search the tree with filters

In [71]:
# Load the notes page from disk and parse it with the lxml backend.
# The page declares charset utf-8, so decode it explicitly instead of relying
# on the platform's default encoding.
# NOTE: this rebinds `soup`; from here on it refers to notes.html, not html_doc.
HTMLfile="notes.html"
with open(HTMLfile,"r",encoding="utf-8") as notes:
    soup=BeautifulSoup(notes,'lxml')

In [72]:
# .contents lists the direct children of the soup object; in the output these
# are an item displayed as 'html' (presumably the doctype) and the <html> element.
soup.contents

['html',
 <html lang="en">
 <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
 <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
 <title>Udacity Course</title>
 </head>
 <body>
     My HTML notes from Udacity training. 
 Markup is refers to special characters in html that have <em>special meaning</em>. 
 Mary had a little toad <br/>
 Its skin as tough as shell <br/>
 And every file that toad would write <br/>
 It used HTML.<br/>
 <p id="Story come">This is a story.</p>
 <p>Add paragraph tags to make it better.</p>
 <p>Once upon a time there was a baby tiger named Talia. Talia lived in a fearsome jungle in the middle of the fearsome jungle. 
 Talia was the most fearsome of all, though she was only a baby.</p>
 <p>MC<sup>2</sup> CO<sub>2</sub></p>
 <p>When i start <mark>Scorprog </mark>maybe life will be easier</p>
 <h1>My HTML Notes</h1>
 <p>This page is a collection of my awesome notes about HTML!</p>
 <h2>The Web</h2>
 <p>The

In [73]:
# Attribute-style navigation: soup.li is the first <li> element in the document.
tag_li=soup.li

In [74]:
# tag_li is the first <li> found in the document; .string is its text.
tag_li.string

'Learn HTML'

In [75]:
# Check the element's attributes.
# This <li> has no attributes, so an empty dict is returned.
tag_li.attrs

{}

In [76]:
# Same lookup using find(): it also returns the first matching <li>.
tag_li2=soup.find("li")

In [77]:
# The result of find() is a Tag object (see output).
type(tag_li2)

bs4.element.Tag

In [78]:
# Same text as tag_li.string, confirming both lookups found the same element.
tag_li2.string

'Learn HTML'

In [79]:
# Printing the tag shows the element's full markup.
print(tag_li2)

<li>Learn HTML</li>


In [80]:
# Search the document by id with find(). No element carries the id
# "story time", so this lookup returns None (shown in the next cell).
find_id=soup.find(id="story time")

In [81]:
# Prints None because the id searched for above matched nothing.
print(find_id)

None


In [82]:
# Retry with an id that is present in the page: "Story come".
find_id=soup.find(id="Story come")

In [83]:
# This time the matching <p> element is printed.
print(find_id)

<p id="Story come">This is a story.</p>


In [87]:
# Look up the element whose id is 'learn' (an <ol>, as the next cell shows).
find_id2=soup.find(id='learn')

In [88]:
# Print the matched element together with all of its child <li> items.
print(find_id2)

<ol id="learn">
<li>Learn HTML</li>
<li>Lean CSS</li>
<li>Learn Python</li>
<li>Save the World</li>
</ol>


In [89]:
# Navigate inside the element with id 'learn': .li is its first <li> child
# and .string is that item's text.
print(find_id2.li.string)

Learn HTML


In [92]:
#search using string only
# `find_all` / `string=` are the current spellings of the legacy `findAll` /
# `text=` aliases. A plain string filter matches a text node only when the
# WHOLE node equals one of the given strings, so 'pages' and 'Python' match
# nothing here (e.g. the list item is 'Learn Python', not 'Python') and the
# result is an empty list. Pass a compiled regular expression instead to match
# substrings.
string_search=soup.find_all(string=['pages','Python'])

In [93]:
# Prints an empty list: no text node matched the search strings.
print(string_search)

[]


In [98]:
# Search by CSS class name using an attrs dictionary; find() returns only the
# first element that matches.
css_class_search=soup.find(attrs={'class':'block'})

In [99]:
# Only the first element with class "block" is shown, because find() was used.
print(css_class_search)

<p class="block">Call me ishamel</p>


In [100]:
#TODO: create a function that searches the document for the tag name passed as a parameter (not implemented yet)


In [102]:
#find all tags in the document
# Passing True matches every tag. The loop variable is deliberately not named
# `tag`, so the `tag` object created earlier in the notebook is not overwritten
# as a side effect of running this cell.
for element in soup.find_all(True):
    print(element.name)

html
head
meta
meta
meta
title
body
em
br
br
br
br
p
p
p
p
sup
sub
p
mark
h1
p
h2
p
h2
p
br
h1
p
strong
em
p
strong
sub
p
strong
mark
br
style
br
br
br
br
p
p
h3
div
p
p
h3
ol
li
li
li
li
ul
li
ol
li
li
li
ol
li
li
li
ol
li
li
h3
p
p
a
br
h3
img
a
img
p
ol
a
li
a
li
p
ol
li
a
li
a
img
img
p
p


In [105]:
#searching using find_all
# find_all() (the current name of the legacy findAll alias) returns every
# element whose CSS class is "block", not just the first one.
find_class=soup.find_all(class_='block')
print(find_class)

[<p class="block">Call me ishamel</p>, <p class="block">Just don't call me late for dinner</p>, <p class="block">Call me ishamel</p>, <p class="block">Just don't call me late for dinner</p>]


In [106]:
# The collection of matches is a ResultSet (see output).
type(find_class)

bs4.element.ResultSet

In [107]:
print(find_class[0])  # a ResultSet can be indexed; [0] is the first match

<p class="block">Call me ishamel</p>


In [109]:
# Second match in the ResultSet.
print(find_class[1])

<p class="block">Just don't call me late for dinner</p>


In [112]:
# Use a regular expression to search the document.
import re
# Tiny snippet in which the e-mail address is loose text that follows the <p>
# element rather than sitting inside a tag or attribute.
email_example="""
<br>
<p>my email is </p>
abc@example.com
"""
# Parse the snippet with the lxml backend.
soup_email=BeautifulSoup(email_example,'lxml')

In [113]:
# Simple e-mail pattern: word characters, '@', word characters, '.', word characters.
# Written as a raw string so that \w and \. reach the regex engine unchanged;
# in a normal string literal they are invalid escape sequences and trigger a
# warning on current Python versions.
emailID_regex=re.compile(r"\w+@\w+\.\w+")

In [114]:
#find the email ID using the regular expression
# `string=` is the current name of the text filter (`text=` is its legacy alias).
# With a compiled pattern, find() returns the first text node in which the
# pattern is found.
email_id=soup_email.find(string=emailID_regex)

In [115]:
# The whole matching text node is printed, including its surrounding newlines.
print(email_id)


abc@example.com




