# Introduction to BeautifulSoup

In [2]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [3]:
# Sample HTML document, kept as a multi-line string, that the rest of the
# notebook parses: a head with a title, then a body holding a heading, an
# image, a link, an ordered list and an unordered list.
# NOTE: the "Link with New Tab" comment inside the markup is only a label;
# the anchor itself carries no target attribute.
html = """
<!DOCTYPE html>
<html lang="en-us">

<head>
  <meta charset="UTF-8">
  <title>My First Page</title>
</head>

<body>
  <!-- Header -->
  <h1>Hello World!</h1>

  <!-- Image -->
  <img
    src="https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211"
    alt="Spongebob!" />
  <br />

  <!-- Link with New Tab -->
  <a href="https://www.google.com">Google</a>
  <br />

  <!-- An ordered list -->
  <ol>
    <li>Visit Grand Canyon</li>
    <li>Hike the trails</li>
    <li>Take photos</li>
  </ol>

  <ul>
    <li>Bach</li>
    <li>Mozart</li>
    <li>Beethoven</li>
    <li>Adele</li>
  </ul>
</body>

</html>
"""

In [4]:
# Parse the HTML string into a navigable BeautifulSoup tree, naming the
# parser explicitly through the `features` keyword
soup = BeautifulSoup(html, features="html.parser")

In [6]:
# Print the parsed document; the soup object renders back to HTML markup
print(soup)


<!DOCTYPE html>

<html lang="en-us">
<head>
<meta charset="utf-8"/>
<title>My First Page</title>
</head>
<body>
<!-- Header -->
<h1>Hello World!</h1>
<!-- Image -->
<img alt="Spongebob!" src="https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211"/>
<br/>
<!-- Link with New Tab -->
<a href="https://www.google.com">Google</a>
<br/>
<!-- An ordered list -->
<ol>
<li>Visit Grand Canyon</li>
<li>Hike the trails</li>
<li>Take photos</li>
</ol>
<ul>
<li>Bach</li>
<li>Mozart</li>
<li>Beethoven</li>
<li>Adele</li>
</ul>
</body>
</html>



In [7]:
# Check the type of the parsed document object
type(soup)

bs4.BeautifulSoup

In [8]:
# Extract and print the document's <head> element, child tags included
print(soup.head)

<head>
<meta charset="utf-8"/>
<title>My First Page</title>
</head>


In [9]:
# Navigate to the <title> tag by attribute access; the result is the tag
# itself, markup included
soup.title

<title>My First Page</title>

In [10]:
# Read .text on the <title> tag to get its contents without the markup
soup.title.text

'My First Page'

In [28]:
# Use the find method to locate the first image tag; image_html is reused
# by later cells
image_html = soup.find("img")
# Index the tag by attribute name to read the image's source URL
image_html['src']

'https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211'

In [29]:
# Locate the first anchor tag with the find method
link = soup.find('a')
# Show the tag's markup and then the URL held in its href attribute,
# one per line
print(link, link['href'], sep="\n")

<a href="https://www.google.com">Google</a>
https://www.google.com


In [21]:
# Extract the ordered list (the <ol> element)
soup.ol

<ol>
<li>Visit Grand Canyon</li>
<li>Hike the trails</li>
<li>Take photos</li>
</ol>

In [30]:
# Extract the first list item of the ordered list
soup.ol.li

<li>Visit Grand Canyon</li>

In [31]:
# Extract the text of the ordered list's first item, without the <li> markup
soup.ol.li.text

'Visit Grand Canyon'

In [32]:
# Extract the source URL from the image tag held in image_html
# (assigned earlier via soup.find("img"), so that cell must run first)
image_html['src']

'https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211'

In [33]:
# Extract the alt text from the image tag held in image_html
image_html['alt']

'Spongebob!'