# Web scraping using beautiful soup!

In [4]:
import requests
from bs4 import BeautifulSoup as bs 

In [3]:
# Load the webpage content.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Converting to a beautiful soup object
soup = bs(r.content)
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>


# Start using Beautiful Soup to scrape
* find and find_all

In [8]:
# find() - only finds a particular thing!
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [9]:
# find_all() - finds every elements and store it in a lists!
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [11]:
# We can also pass in a list of elements to look for
# find - returns only one elements
first_header = soup.find(["h1", "h2"])
print(first_header)

# findall - returns the every elements. 
headers = soup.find_all(["h1", "h2"])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [13]:
# We can also pass in the attribute(to find the specific class or id's) to the find/findall

paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [15]:
# We can also nest find/findall
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [19]:
# we can search specific strings in our find/find_all calls
import re

paraSearch = soup.find_all("p", string=re.compile("Some"))
print(paraSearch)

headerSearch = soup.find_all("h2", string=re.compile("(H|h)eader"))
print(headerSearch)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


# select (CSS Selectors) - 
* same as find/find_all but, here we can search for more accurate result without using regex.
* for css reference - visit W3 Schools

In [22]:
soup

<html><head>
<title>HTML Example</title>
</head>
<body>

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

<h2>A Header</h2>
<p><i>Some italicized text</i></p>

<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>



</body></html>

In [20]:
# Selecting the paragraph inside the div!      
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [21]:
# Selecting all the "paragraph" preceded by h2
para = soup.select("h2 ~ p")
para

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [23]:
# Selecting the bold text from the para which has an id of paragraph-id

bold_text = soup.select("p#paragraph-id b")
print(bold_text)

[<b>Some bold text</b>]


In [25]:
# nesting in select!

paragraphs = soup.select("body > p")
print(paragraphs)
print("")

for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]

[<i>Some italicized text</i>]
[]


In [26]:
# Grab by element with specific property
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# Get different properties of the HTML

In [28]:
# To only print the text of the particular HTML elements! but it will not works for the div with multiple HTML elements!
header = soup.find("h2")
print(header.string)

A Header


In [29]:
# To only print the text of the particular div!  
# If multiple child elements, use get_text!

div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [32]:
# Getting the property of the HTML elements(href, src, id, class)
link = soup.find("a")
print(link)

# To get the actual link
print(link["href"])

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>
https://keithgalli.github.io/web-scraping/webpage.html


In [35]:
# Getting the property of the HTML elements(href, src, id, class)

para = soup.select("p#paragraph-id")
para[0]["id"]

'paragraph-id'

## Code Navigation

In [40]:
# Path Syntax

soup.body.div.h1.string 
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>


In [42]:
# terms - Parent, Siblings - sublevels of parent, Child-inside the sub level!

soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]