In [1]:
import requests

In [2]:
# requests.get(...) allows us to download the webpage of interest.
# A timeout is passed explicitly: without one, requests waits indefinitely
# if the server never responds, which would hang the whole notebook.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
# Display the Response object; its repr shows the HTTP status code
page

<Response [200]>

In [3]:
page.status_code
# A status code of 200 means the page downloaded successfully
# Codes starting with 2 generally indicate success
# Codes starting with 4 or 5 generally indicate an error

200

In [4]:
# Show the raw contents of the page
# (.content is a bytes object, hence the b'...' prefix in the output)
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [5]:
# Using BeautifulSoup4 to parse the downloaded HTML
from bs4 import BeautifulSoup
# Build the parse tree from the raw page bytes using the 'html.parser' parser
soup = BeautifulSoup(page.content, 'html.parser')

In [7]:
# Printing the soup alone gives us an unindented form of the page
# Calling the method .prettify() first indents each nested portion of the
# markup, which makes the structure of the document easier to read
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [10]:
# .children alone returns a list iterator rather than the items themselves;
# to see the items, we need to call the list() function on it
soup.children

<list_iterator at 0x105dd59e8>

In [11]:
list(soup.children)
# The top level of the page holds two items of interest:
# the doctype (<!DOCTYPE html>, displayed as 'html') and the <html> tag
# Note that '\n' is just a newline character

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [14]:
# Show the type of each element among the top-level children
[type(child) for child in soup.children]
# First: Doctype element (tells us it's an HTML file)
# Second: NavigableString (text found inside the HTML doc)
# Third: Tag (contains other nested tags, generally the object of interest)

# A Tag allows us to navigate through the HTML doc and its children

# More about BeautifulSoup objects: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#kinds-of-objects

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [16]:
# Select the third top-level item (index 2), which is the <html> Tag
html = list(soup.children)[2]
# Show the children inside the html tag
list(html.children)
# Two tags: <head> and <body>, separated by '\n' newline strings

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [17]:
# Let's work towards extracting the text inside the <p> tag
# <body> is the fourth child of <html> (index 3), after '\n', <head>, '\n'
body = list(html.children)[3]
# Show the children inside the body tag
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [26]:
# Now isolate the <p> tag: it is the second child of <body> (index 1),
# sitting between two '\n' newline strings
p = list(body.children)[1]
# Once isolated, we can extract all the text with the method get_text()
p.get_text()

'Here is some simple content for this page.'

In [27]:
# Now, let's find all instances of a tag at once:
# (the soup is re-created here from the same page content)
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')
# Returns a list of every <p> tag found in the parsed HTML

[<p>Here is some simple content for this page.</p>]

In [35]:
# find_all(...) returns every instance of <p>, so [0] picks the first one
# before extracting its text; .find(...) would return only the first instance
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [38]:
# Using classes and ids
# Download a second example page whose tags carry class and id attributes.
# Note: this rebinds `page` and `soup`, so from here on they refer to this page.
# A timeout is passed explicitly: without one, requests waits indefinitely
# if the server never responds, which would hang the whole notebook.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,
)
soup = BeautifulSoup(page.content, 'html.parser')
# Display the parsed document
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [39]:
# Use find_all() to search for items by class or id
# This searches for all <p> tags that have the class outer-text
# (the keyword is class_ with a trailing underscore because `class` is a
# reserved word in Python)
soup.find_all('p', class_ = 'outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [40]:
# Now, let's just look for any tag with the outer-text class,
# regardless of the tag name
soup.find_all(class_ = 'outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [41]:
# Now, let's search for elements by id
# This returns a list of the tags whose id attribute is 'first'
soup.find_all(id = 'first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

Now, we can use CSS Selectors
* For example: 'p a' finds all 'a' tags inside of a 'p' tag
* For example: 'body p a' finds all 'a' tags inside of a 'p' tag inside of a 'body' tag
* For example: 'html body' finds all 'body' tags inside of an 'html' tag
* For example: 'p.outer-text' finds all 'p' tags with a class of 'outer-text'
* For example: 'p#first' finds all 'p' tags with an id of 'first'
* For example: 'body p.outer-text' finds all 'p' tags with a class of 'outer-text' inside of a 'body' tag

In [43]:
# CSS selector 'div p': find all 'p' tags inside of a 'div' tag
soup.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

# Extracting information from an actual web page - San Francisco Weather
https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168