In [1]:
import re

import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Load the webpage content. The timeout (in seconds) keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() fails loudly on a
# 4xx/5xx response instead of silently parsing an error page.
r = requests.get("http://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly makes the parse
# tree independent of which optional parsers happen to be installed and avoids
# bs4's GuessedAtParserWarning.
soup = bs(r.content, "html.parser")

# Print out our HTML, indented one level per nesting depth
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [3]:
# find() returns the first element matching the given tag name (here the first <h2> header)
first_header = soup.find("h2")
first_header
                         

<h2>A Header</h2>

In [4]:
# find_all() returns every matching element (here all <h2> headers) as a list-like result
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
# Pass a list of tag names to match any of them (here every <h1> and <h2>)
headers = soup.find_all(['h1', 'h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
# find/find_all accept an attrs dict to filter on attribute values
# (here: <p> tags whose id is "paragraph-id")
paragraph = soup.find_all('p', attrs={"id" : "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
# find/find_all calls can be nested: search inside the element returned by an earlier find
body = soup.find("body")
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [15]:
# find/find_all accept a string= filter. Given a compiled regex, only tags whose
# .string matches the pattern are returned (here: <p> tags whose text contains "text").
paragraphs = soup.find_all('p', string=re.compile("text"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [18]:
# Match <h2> tags whose text contains either "Header" or "header"

paragraphs = soup.find_all('h2', string=re.compile("(H|h)eader"))
paragraphs 

[<h2>A Header</h2>, <h2>Another header</h2>]

In [20]:
# select() takes a CSS selector and returns all matching elements

# Let's grab the paragraphs nested inside a div ('div p' = <p> descendants of a <div>)
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [21]:
# 'h2 ~ p' is the CSS general-sibling combinator: every <p> that comes after
# an <h2> and shares the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [31]:
# Get the <b> element(s) nested inside the paragraph whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
print(bold_text)

[<b>Some bold text</b>]


In [30]:
# Get the text inside an element, e.g. the text of a header.
# .string gives the element's text when it holds a single piece of text.
header = soup.find('h2')
print(header.string)


A Header


In [29]:
# .string is not useful on the div: it contains several children, so there is no single
# string to return (bs4 defines .string as None in that case).
# get_text() instead joins the text of everything nested inside the element.

div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [35]:
# Read an attribute of an element with dictionary-style access,
# e.g. the href of the first <a> link
link = soup.find("a")
print(link['href'])

https://keithgalli.github.io/web-scraping/webpage.html


In [38]:
# Attribute access works on select() results too: take the first match
# for "p#paragraph-id" and read its id attribute
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

#### Code navigation

In [48]:
# Path syntax: chain tag names as attributes to walk down the tree
# (body -> first div -> first h1), then read that element's text
soup.body.div.h1.string

'HTML Webpage'

#### Know the terms: parents, siblings, and children

In [56]:
# find_next_siblings() returns the siblings that come after a tag
# (here: everything in <body> that follows the first <div>)
div = soup.body.find('div')
div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]