In [1]:
import requests

In [2]:
# requests.get(...) allows us to download the webpage of interest.
# A timeout is passed so the cell fails fast instead of hanging forever
# if the server never responds (requests applies no timeout by default).
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
page

<Response [200]>

In [3]:
page.status_code
# a status code of 200 means the page downloaded successfully
# Codes starting with 2 generally indicate "success"
# Codes starting with 4 or 5 generally indicate an error

200

In [4]:
# Show the contents of the page
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [5]:
# Using BeautifulSoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
# Printing alone gives us an unindented form of the page
# Including the method .prettify() indents the respective portions
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [7]:
# .children alone returns an iterator (a list_iterator), so to see its
# contents we need to call the list() function on it
soup.children

<list_iterator at 0x115ef09b0>

In [8]:
list(soup.children)
# We have two tags at the top level of the page:
# <!DOCTYPE html> = 'html' and <html>
# Note that \n is just a new line character

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [9]:
# Show the type of each element in this list
[type(i) for i in list(soup.children)]
# First: Doctype element (tells us it's an html file)
# Second: NavigableString (text found inside the html doc)
# Third: Tag (contains other nested tags, generally the object of interest)

# Tag allows us to navigate through the HTML doc and its children

# More about BeautifulSoup Objects: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#kinds-of-objects

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [10]:
# Selecting the third item, Tag
html = list(soup.children)[2]
# Show the children inside the html tag
list(html.children)
# Two tags: <head> and <body>

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [11]:
# Let's extract the text inside the <p> tag
body = list(html.children)[3]
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [12]:
# Now isolate the <p> tag
p = list(body.children)[1]
# Once isolated, we can extract all the text with the method get_text()
p.get_text()

'Here is some simple content for this page.'

In [13]:
# Now, let's find all instances of a tag at once:
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')
# Returns a list of every <p> tag found in the parsed page

[<p>Here is some simple content for this page.</p>]

In [14]:
soup.find_all('p')[0].get_text()
# find_all(...) returns every instance of <p>, so we index into the list;
# .find(...) would return only the first instance

'Here is some simple content for this page.'

In [15]:
# Using classes and ids
# A timeout is passed so the cell fails fast instead of hanging forever
# if the server never responds (requests applies no timeout by default).
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,
)
# Parse the downloaded HTML so it can be searched by tag, class and id
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [16]:
# Use find_all() to search for items by class or id
# This searches for all tags with p that have the class outer-text
soup.find_all('p', class_ = 'outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [17]:
# Now, let's just look for any tag with the outer-text class
soup.find_all(class_ = 'outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [18]:
# Now, let's search for elements by id
soup.find_all(id = 'first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

Now, we can use CSS Selectors
* For example: 'p a' finds all 'a' tags inside of a 'p' tag
* For example: 'body p a' finds all 'a' tags inside of a 'p' tag inside of a body tag
* For example: 'html body' finds all 'body' tags inside of an 'html' tag
* For example: 'p.outer-text' finds all 'p' tags with a class of outer-text
* For example: 'p#first' finds all 'p' tags with an id of 'first'
* For example: 'body p.outer-text' finds any 'p' tags with a class of outer-text inside of a body tag

https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors

In [19]:
soup.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

# Extracting information from an actual web page - San Francisco Weather
https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168

In [20]:
# downloads the webpage
# A timeout is passed so the cell fails fast instead of hanging forever
# if the server never responds (requests applies no timeout by default).
page = requests.get(
    'https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168',
    timeout=10,
)
page
# Response 200, successful

<Response [200]>

In [21]:
soup = BeautifulSoup(page.content, 'html.parser')
# parses the downloaded HTML into a BeautifulSoup object we can search
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <!-- Meta -->
  <meta content="width=device-width" name="viewport"/>
  <link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/>
  <title>
   National Weather Service
  </title>
  <meta content="National Weather Service" name="DC.title">
   <meta content="NOAA National Weather Service National Weather Service" name="DC.description"/>
   <meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/>
   <meta content="" name="DC.date.created" scheme="ISO8601"/>
   <meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/>
   <meta content="weather, National Weather Service" name="DC.keywords"/>
   <meta content="NOAA's National Weather Service" name="DC.publisher"/>
   <meta content="National Weather Service" name="DC.contributor"/>
   <meta content="http://www.weather.gov/disclaimer.php" name="DC.rights"/>
   <meta content="General" name="rating"/>
   <meta content="index,follow" name="robots"/>

In [22]:
sevenDay = soup.find(id = 'seven-day-forecast')
# .find(...) returns only the first element whose id is 'seven-day-forecast'
print(sevenDay.prettify())

<div class="panel panel-default" id="seven-day-forecast">
 <div class="panel-heading">
  <b>
   Extended Forecast for
  </b>
  <h2 class="panel-title">
   San Francisco CA
  </h2>
 </div>
 <div class="panel-body" id="seven-day-forecast-body">
  <div id="seven-day-forecast-container">
   <ul class="list-unstyled" id="seven-day-forecast-list">
    <li class="forecast-tombstone">
     <div class="tombstone-container">
      <p class="period-name">
       Today
       <br/>
       <br/>
      </p>
      <p>
       <img alt="Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. "/>
      </p>
      <p class="short-desc">
       Sunny
      </p>
      <p class="temp temp-high">
       High: 72 °F
      </p>
     </div>
    </li>
    <li class="forecast-tombstone">
     <div class="

In [23]:
forecastItems = sevenDay.find_all(class_ = 'tombstone-container')
# returns a list of every element with the 'tombstone-container' class
print(forecastItems)

[<div class="tombstone-container">
<p class="period-name">Today<br/><br/></p>
<p><img alt="Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. "/></p><p class="short-desc">Sunny</p><p class="temp temp-high">High: 72 °F</p></div>, <div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Mostly clear, with a low around 57. West wind 6 to 11 mph becoming light southwest  after midnight. " class="forecast-icon" src="newimages/medium/nfew.png" title="Tonight: Mostly clear, with a low around 57. West wind 6 to 11 mph becoming light southwest  after midnight. "/></p><p class="short-desc">Mostly Clear</p><p class="temp temp-low">Low: 57 °F</p></div>, <div class="tombstone-container">
<p class="period-name">Thursday<br/><br/></p>
<p><img a

In [24]:
tonight = forecastItems[0]
# Take the first item in the forecastItems list.
# NOTE: despite the variable name, in this run the first item is the 'Today'
# period (see the output), not 'Tonight'.
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. "/>
 </p>
 <p class="short-desc">
  Sunny
 </p>
 <p class="temp temp-high">
  High: 72 °F
 </p>
</div>


### Now, let's extract some information from the page
Get:
* Name of the forecast item
* Short description
* Temperature

In [25]:
period = tonight.find(class_ = 'period-name')
# Grabs the first tag with the 'period-name' class
# Note: without .get_text(), the command returns the whole tag from start to
# finish, in this case <p class ='period-name'>...</p>, not just its text
period

<p class="period-name">Today<br/><br/></p>

In [26]:
shortDesc = tonight.find(class_ = 'short-desc').get_text()
# Grabs only the text of the first tag with the 'short-desc' class
shortDesc

'Sunny'

In [27]:
temp = tonight.find(class_ = 'temp').get_text()
# Grabs only the text of the first tag with the 'temp' class
# Note: this item's tag is <p class="temp temp-high">, so
# tonight.find(class_ = 'temp-high') and tonight.find(class_ = 'temp temp-high')
# return the same result
temp

'High: 72 °F'

In [28]:
print(period.get_text())
print(shortDesc)
print(temp)

Today
Sunny
High: 72 °F


In [29]:
# Return only the 'title' attribute of the first 'img' tag
img = tonight.find('img')
# Tag attributes are read with dictionary-style indexing
desc = img['title']
desc

'Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. '

### Extract all of the relevant info from the page

In [30]:
periodTags = sevenDay.select('.tombstone-container .period-name')
# In a CSS selector a leading '.' matches on class: this selects every element
# with the 'period-name' class nested inside a 'tombstone-container' element
periodTags

[<p class="period-name">Today<br/><br/></p>,
 <p class="period-name">Tonight<br/><br/></p>,
 <p class="period-name">Thursday<br/><br/></p>,
 <p class="period-name">Thursday<br/>Night</p>,
 <p class="period-name">Friday<br/><br/></p>,
 <p class="period-name">Friday<br/>Night</p>,
 <p class="period-name">Saturday<br/><br/></p>,
 <p class="period-name">Saturday<br/>Night</p>,
 <p class="period-name">Sunday<br/><br/></p>]

In [31]:
periods = [i.get_text() for i in periodTags]
# list comprehension to return a list of only the text of each tag
# Note: the <br/> tags contribute no text, so 'Thursday<br/>Night' comes out
# as 'ThursdayNight' (no space)
periods

['Today',
 'Tonight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday']

In [32]:
shortDescTags = sevenDay.select('.tombstone-container .short-desc')
shortDescs = [i.get_text() for i in shortDescTags]
shortDescs

['Sunny',
 'Mostly Clear',
 'Mostly Sunny',
 'Partly Cloudy',
 'Sunny',
 'Mostly Clear',
 'Mostly Sunny',
 'Partly Cloudy',
 'Mostly Sunny']

In [33]:
# Combine it all into one line
temps = [i.get_text() for i in sevenDay.select('.tombstone-container .temp')]
temps

['High: 72 °F',
 'Low: 57 °F',
 'High: 74 °F',
 'Low: 56 °F',
 'High: 71 °F',
 'Low: 56 °F',
 'High: 71 °F',
 'Low: 56 °F',
 'High: 69 °F']

In [34]:
descs = [i['title'] for i in sevenDay.select('.tombstone-container img')]
descs

['Today: Sunny, with a high near 72. Light and variable wind becoming west 6 to 11 mph in the afternoon. ',
 'Tonight: Mostly clear, with a low around 57. West wind 6 to 11 mph becoming light southwest  after midnight. ',
 'Thursday: Mostly sunny, with a high near 74. Light and variable wind becoming west 9 to 14 mph in the afternoon. ',
 'Thursday Night: Partly cloudy, with a low around 56. West southwest wind 5 to 13 mph. ',
 'Friday: Sunny, with a high near 71. Light and variable wind becoming west 8 to 13 mph in the afternoon. ',
 'Friday Night: Mostly clear, with a low around 56.',
 'Saturday: Mostly sunny, with a high near 71.',
 'Saturday Night: Partly cloudy, with a low around 56.',
 'Sunday: Mostly sunny, with a high near 69.']

# Now, let's put it all in a dataframe using Pandas

In [35]:
import pandas as pd

In [36]:
weather = pd.DataFrame({'period': periods, 
                        'shortDesc': shortDescs, 
                        'temp': temps, 
                        'desc': descs})
weather

Unnamed: 0,period,shortDesc,temp,desc
0,Today,Sunny,High: 72 °F,"Today: Sunny, with a high near 72. Light and v..."
1,Tonight,Mostly Clear,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. W..."
2,Thursday,Mostly Sunny,High: 74 °F,"Thursday: Mostly sunny, with a high near 74. L..."
3,ThursdayNight,Partly Cloudy,Low: 56 °F,"Thursday Night: Partly cloudy, with a low arou..."
4,Friday,Sunny,High: 71 °F,"Friday: Sunny, with a high near 71. Light and ..."
5,FridayNight,Mostly Clear,Low: 56 °F,"Friday Night: Mostly clear, with a low around 56."
6,Saturday,Mostly Sunny,High: 71 °F,"Saturday: Mostly sunny, with a high near 71."
7,SaturdayNight,Partly Cloudy,Low: 56 °F,"Saturday Night: Partly cloudy, with a low arou..."
8,Sunday,Mostly Sunny,High: 69 °F,"Sunday: Mostly sunny, with a high near 69."


In [37]:
# Pull the first run of digits out of each 'temp' string
# (e.g. 'High: 72 °F' -> '72').
# The pattern is a raw string so that \d reaches the regex engine untouched;
# in a normal string literal '\d' is an invalid escape sequence that newer
# Python versions warn about.
tempNums = weather['temp'].str.extract(r'(?P<temp_num>\d+)', expand = False)
# Convert the extracted digit strings to integers so they can be averaged
weather['tempNum'] = tempNums.astype('int')
weather

Unnamed: 0,period,shortDesc,temp,desc,tempNum
0,Today,Sunny,High: 72 °F,"Today: Sunny, with a high near 72. Light and v...",72
1,Tonight,Mostly Clear,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. W...",57
2,Thursday,Mostly Sunny,High: 74 °F,"Thursday: Mostly sunny, with a high near 74. L...",74
3,ThursdayNight,Partly Cloudy,Low: 56 °F,"Thursday Night: Partly cloudy, with a low arou...",56
4,Friday,Sunny,High: 71 °F,"Friday: Sunny, with a high near 71. Light and ...",71
5,FridayNight,Mostly Clear,Low: 56 °F,"Friday Night: Mostly clear, with a low around 56.",56
6,Saturday,Mostly Sunny,High: 71 °F,"Saturday: Mostly sunny, with a high near 71.",71
7,SaturdayNight,Partly Cloudy,Low: 56 °F,"Saturday Night: Partly cloudy, with a low arou...",56
8,Sunday,Mostly Sunny,High: 69 °F,"Sunday: Mostly sunny, with a high near 69.",69


In [38]:
# Show the average temperature for the week and round it to two decimal places
round(weather['tempNum'].mean(),2)

64.67

In [39]:
# Conditionally selecting rows - flagging the night time observations
# NOTE: the match is case-sensitive, so 'Tonight' (lowercase 'night') is not
# flagged as a night observation - only periods containing 'Night' are
night = weather['period'].str.contains('Night')
# Multiplying the boolean Series by 1 turns True/False into 1/0
weather['Night'] = night * 1
weather

Unnamed: 0,period,shortDesc,temp,desc,tempNum,Night
0,Today,Sunny,High: 72 °F,"Today: Sunny, with a high near 72. Light and v...",72,0
1,Tonight,Mostly Clear,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. W...",57,0
2,Thursday,Mostly Sunny,High: 74 °F,"Thursday: Mostly sunny, with a high near 74. L...",74,0
3,ThursdayNight,Partly Cloudy,Low: 56 °F,"Thursday Night: Partly cloudy, with a low arou...",56,1
4,Friday,Sunny,High: 71 °F,"Friday: Sunny, with a high near 71. Light and ...",71,0
5,FridayNight,Mostly Clear,Low: 56 °F,"Friday Night: Mostly clear, with a low around 56.",56,1
6,Saturday,Mostly Sunny,High: 71 °F,"Saturday: Mostly sunny, with a high near 71.",71,0
7,SaturdayNight,Partly Cloudy,Low: 56 °F,"Saturday Night: Partly cloudy, with a low arou...",56,1
8,Sunday,Mostly Sunny,High: 69 °F,"Sunday: Mostly sunny, with a high near 69.",69,0
