In [2]:
# Required Libraries
from bs4 import BeautifulSoup

## HTML Content

In [1]:

# Sample HTML document that the rest of the notebook parses: a <title>, one
# paragraph with class "title", and two paragraphs with class "story", the
# first of which contains three <a class="sister"> links.
# Note: the snippet never closes <body> or <html>; the prettified output
# further down shows the parser closing them.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [4]:
# Build the soup object: parse the raw HTML string into a navigable tree.
# 'html.parser' is the parser chosen here; another parser (e.g. lxml) can be
# named instead.
soup = BeautifulSoup(html_doc, features='html.parser')

# Render the parsed tree as an indented string and print it.
pretty_markup = soup.prettify()
print(pretty_markup)

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


## Getting all the text from the soup content

In [5]:
# Return the document's text as a single string with all markup stripped
# (line breaks from the original HTML are kept, as the output shows).
soup.get_text()

"\nThe Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

## One common task is extracting all the URLs found within a page’s `<a>` tags:

In [20]:
# Collect every <a> element in the document; the result displays as a list
# of tags in document order.
links = soup.find_all("a")
links

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [21]:
# Print the destination URL of each anchor collected in `links`.
# .get() reads an attribute and yields None if the tag does not have it.
for link in links:
    target_url = link.get("href")
    print(target_url)



http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


## Here are some simple ways to navigate that data structure:



In [24]:
# Look the <title> element up once, then show three views of it.
title_tag = soup.title
print(title_tag)         # the whole element, markup included
print(title_tag.name)    # the tag's name, i.e. "title"
print(title_tag.string)  # the text contained inside the tag

<title>The Dormouse's story</title>
title
The Dormouse's story


In [26]:
# .parent steps up to the element that directly contains the tag (here
# <title> sits inside <head>); when there is no parent, .parent is None.
title_parent = soup.title.parent
print(title_parent.name)

head


In [34]:
# Divider printed between the demos below; each use is followed by a blank line.
RULE = "-----------------------------------------------------------------"

# First paragraph only: attribute access and find() return the same element.
print(soup.p)
print(soup.find('p'))
print(RULE, end="\n\n")

# Every paragraph present in the page, returned as a list.
paras = soup.find_all('p')
print(paras)
print(RULE, end="\n\n")

# Text content of each paragraph, with the tags stripped.
for para in paras:
    print(para.text)

print(RULE, end="\n\n")

# Results can be narrowed further by class. Indexing a tag reads one of its
# attributes; the class attribute comes back as a list.
parag = soup.p['class']
print(parag)
print(RULE, end="\n\n")

# Only one paragraph has class="title", so this list holds a single element.
paragap = soup.find_all('p', class_="title")
print(paragap)
    


<p class="title"><b>The Dormouse's story</b></p>
<p class="title"><b>The Dormouse's story</b></p>
-----------------------------------------------------------------

[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]
-----------------------------------------------------------------

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
-----------------------------------------------------------------

['title']
-----------------------------------------------------------------

[<p class="title"><b>The Dormous

In [35]:
# There is much more to explore.
# For more, see the official documentation:
# BeautifulSoup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/