In [67]:
#%pip install requests

## Load in necessary libraries

In [68]:
import re

import requests
from bs4 import BeautifulSoup as bs

## Load the first page

In [69]:
# Load the webpage content; the timeout stops a stalled connection from
# hanging the kernel, and raise_for_status() surfaces HTTP errors instead of
# silently parsing an error page
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a Beautiful Soup object. The parser is named explicitly so the
# parse does not depend on which optional parsers happen to be installed
soup = bs(r.content, "html.parser")

# Render out the html
print(soup)

# Render in an indented, easier-to-read format
print(soup.prettify())

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful Soup to Scrape

### find and find_all

**find** returns the first element that matches the given tag name

In [70]:
# find() returns only the first matching tag
first_header = soup.find("h2")
first_header

<h2>A Header</h2>

**find_all** returns a list of all elements that match the given tag name

In [71]:
# find_all() collects every matching tag into a list-like result
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

#### Pass in a list of elements to look for

The code below finds the first element whose tag name is in the list provided as input, i.e. whichever occurs first in the webpage: an h1 or an h2

In [72]:
# A list of tag names matches any of them; find() keeps the earliest hit
first_header = soup.find(["h1", "h2"])
first_header

<h1>HTML Webpage</h1>

find_all finds all elements in the list of elements provided

In [73]:
# Every <h1> and <h2> is returned, in document order
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

#### Pass in the attributes to find/find_all functions

Example: retrieve the paragraph whose id is `paragraph-id`

In [74]:
# attrs narrows the tag match to elements carrying the given attribute value
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

#### Nest find/find_all calls

In [75]:
# Each find() searches only inside the tag returned by the previous call,
# narrowing the scope step by step: body -> div -> h1
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>

#### Search for specific strings using find/find_all

In [76]:
# Keep only the <p> tags whose text is exactly "Some bold text"
paragraphs = soup.find_all("p", string="Some bold text")
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

Find all paragraphs whose text contains the word "Some"

In [77]:
# A compiled pattern (re is imported in the imports cell at the top of the
# notebook) matches any <p> whose text contains "Some", not just an exact string
paragraphs = soup.find_all("p", string=re.compile("Some"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

Find h2 headers whose text contains "Header" or "header"

In [78]:
# The alternation in the pattern accepts either capitalisation of "header"
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### select (CSS selector)

In [79]:
# Display the <body> tag so the structure the selectors run against is visible
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [80]:
# A bare tag name used as a CSS selector matches every <p> in the document
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

#### Grab paragraphs inside div

In [81]:
# Descendant combinator (a space): <p> tags nested inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

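#### Grab paragraphs preceded by h2
i.e. paragraphs that are later siblings of an h2 (`~` is the general sibling combinator; use `h2 + p` to match only the paragraph immediately following an h2)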

In [82]:
# `~` is the general sibling combinator: every <p> that comes after an <h2>
# under the same parent is selected
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

#### Bold text inside Paragraph with id paragraph-id

In [83]:
# `p#paragraph-id` picks the <p> with that id; the trailing `b` then matches
# the <b> tags inside it
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

#### Select paragraphs that are direct children of body

In [84]:
# `>` restricts the match to <p> tags whose parent is <body>, so the <p>
# nested inside the <div> is not included
paragraphs = soup.select("body > p")
print(paragraphs)


[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


#### Print the italicized text in a paragraph

In [85]:
# Uses `paragraphs` from the previous cell; select() yields an empty list for
# a paragraph that contains no <i> tag
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<i>Some italicized text</i>]
[]


#### Select element by specific attribute

Select all elements whose `align` attribute is `middle`

##### Method 1

In [86]:
# An attribute selector with no tag name matches any element with align="middle"
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

##### Method 2

Select the div whose `align` attribute is `middle`

In [87]:
# Same lookup through find_all(): tag name plus an attribute filter
soup.find_all("div", attrs={"align": "middle"})

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the HTML

#### Fetch text content

##### If no child elements use .string

Print the text content inside h2

In [88]:
# Each <h2> holds a single text node, so .string returns that text directly
headers = soup.find_all("h2")

for header in headers:
    print(header.string)

A Header
Another header


In [89]:
# Same output as the find_all() version, with the tags gathered via a CSS selector
headers = soup.select("h2")

for header in headers:
    print(header.string)

A Header
Another header


##### If multiple child elements use get_text()

Print the text content inside div

In [90]:
# Grab the first <div> and pretty-print it to show that it has several children
div = soup.find("div")
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



This won't work: `.string` returns `None` when an element has more than one child

In [91]:
# .string is None here: the div contains several child tags rather than a
# single piece of text
print(div.string)

None


In [92]:
# get_text() gathers the text of every nested element into one string
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



#### Get a specific property from an element

##### Get Links from **a** tag

In [93]:
# Tag attributes are read with dictionary-style indexing
link = soup.find("a")
link["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

##### Get the id of a paragraph ... for this example select the paragraph with id paragraph-id

In [94]:
# select() returns a list, so take the first match before reading its id
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]["id"]

'paragraph-id'

In [95]:
# find_all() also returns a list, so the first match is indexed the same way
paragraphs = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraphs[0]["id"]

'paragraph-id'

In [96]:
# find() returns a single tag rather than a list, so the id is read from it
# directly with no [0] index
paragraphs = soup.find("p", attrs={"id": "paragraph-id"})
paragraphs["id"]

'paragraph-id'

### Code navigation

#### Path syntax

In [97]:
# Dotted access walks down the tree (body -> div -> h1); .string reads the text
soup.body.div.h1.string

'HTML Webpage'

#### Parent , Sibling , Child

In [98]:
# The <div> inside <body>: the reference element for the sibling and parent lookups
soup.body.find("div")

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

##### Sibling

In [99]:
# find_next_siblings() lists the elements that follow the div at the same level
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

##### Parent

In [100]:
# find_parent() returns the element that encloses the div, which here is <body>
soup.body.find("div").find_parent()

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>