<a href='https://ai.meng.duke.edu'> = <img align="left" style="padding-top:10px;" src=https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png>

# Web Scraping Example: National Weather Service Forecast
In this notebook we will demonstrate how to scrape and parse the data from a website using the Requests and BeautifulSoup libraries.  Our objective is to extract the forecast for the Raleigh-Durham Airport from the National Weather Service website.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt

In [2]:
# Let's try to scrape the National Weather Service forecast
url = 'https://forecast.weather.gov/MapClick.php?lat=35.82547090000003&lon=-78.90540939999994#.X1u__NYpCi4'

# Request the page and use BeautifulSoup to extract the contents
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Print out the page contents
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <!-- Meta -->
  <meta content="width=device-width" name="viewport"/>
  <link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/>
  <title>
   National Weather Service
  </title>
  <meta content="National Weather Service" name="DC.title">
   <meta content="NOAA National Weather Service National Weather Service" name="DC.description"/>
   <meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/>
   <meta content="" name="DC.date.created" scheme="ISO8601"/>
   <meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/>
   <meta content="weather, National Weather Service" name="DC.keywords"/>
   <meta content="NOAA's National Weather Service" name="DC.publisher"/>
   <meta content="National Weather Service" name="DC.contributor"/>
   <meta content="//www.weather.gov/disclaimer.php" name="DC.rights"/>
   <meta content="General" name="rating"/>
   <meta content="index,follow" name="robots"/>
   <

In [3]:
# We can find elements by tag
# Let's look for all H2 tags
soup.find_all('h2')

[<h2 class="panel-title">Raleigh / Durham, Raleigh-Durham International Airport (KRDU)</h2>,
 <h2 class="panel-title">
                 4 Miles W Morrisville NC    </h2>,
 <h2 class="panel-title">Detailed Forecast</h2>,
 <h2 class="panel-title">Additional Forecasts and Information</h2>,
 <h2 class="panel-title">Additional Resources</h2>]

In [4]:
# We can also find elements by id
# Let's extract the daily forecast object (id 'seven-day-forecast-body')
daily_forecast = soup.find(attrs={'id':'seven-day-forecast-body'})
print(daily_forecast.prettify())

<div class="panel-body" id="seven-day-forecast-body">
 <div id="seven-day-forecast-container">
  <ul class="list-unstyled" id="seven-day-forecast-list">
   <li class="forecast-tombstone">
    <div class="tombstone-container">
     <p class="period-name">
      Tonight
      <br/>
      <br/>
     </p>
     <p>
      <img alt="Tonight: Mostly clear, with a low around 58. Calm wind. " class="forecast-icon" src="newimages/medium/nfew.png" title="Tonight: Mostly clear, with a low around 58. Calm wind. "/>
     </p>
     <p class="short-desc">
      Mostly Clear
     </p>
     <p class="temp temp-low">
      Low: 58 °F
     </p>
    </div>
   </li>
   <li class="forecast-tombstone">
    <div class="tombstone-container">
     <p class="period-name">
      Saturday
      <br/>
      <br/>
     </p>
     <p>
      <img alt="Saturday: Sunny, with a high near 84. Calm wind becoming east around 6 mph in the afternoon. " class="forecast-icon" src="newimages/medium/skc.png" title="Saturday: Sunny, 

In [11]:
# We can also find elements by class
# Extract the individual forecasts (class 'forecast-tombstone')
tombstones = daily_forecast.find_all(attrs={'class':'forecast-tombstone'})

forecast_dict = {}
# Extract the period, forecast description, and temperature text for each forecast period
for tombstone in tombstones:
    period = tombstone.find(attrs={'class':'period-name'}).text
    desc = tombstone.find(attrs={'class':'short-desc'}).text
    temp = tombstone.find(attrs={'class':'temp'}).text
    
    # Add to forecast_dict
    forecast_dict[period] = [desc,temp]
    
for day in forecast_dict:
    print(day)
    print(forecast_dict[day])

Tonight
['Mostly Clear', 'Low: 58 °F']
Saturday
['Sunny', 'High: 84 °F']
SaturdayNight
['Clear', 'Low: 59 °F']
Sunday
['Sunny', 'High: 86 °F']
SundayNight
['Clear', 'Low: 62 °F']
Monday
['Sunny', 'High: 89 °F']
MondayNight
['Clear', 'Low: 66 °F']
Tuesday
['Sunny', 'High: 91 °F']
TuesdayNight
['Mostly Clear', 'Low: 66 °F']


In [12]:
# Organize our forecasts in a dataframe
forecast_df = pd.DataFrame.from_dict(forecast_dict,orient='index',columns=['Conditions','Temperature'])
# Clean up Temperature column to remove text
forecast_df['Temperature'] = forecast_df['Temperature'].apply(lambda x: x.split(':')[1].strip())
forecast_df

Unnamed: 0,Conditions,Temperature
Tonight,Mostly Clear,58 °F
Saturday,Sunny,84 °F
SaturdayNight,Clear,59 °F
Sunday,Sunny,86 °F
SundayNight,Clear,62 °F
Monday,Sunny,89 °F
MondayNight,Clear,66 °F
Tuesday,Sunny,91 °F
TuesdayNight,Mostly Clear,66 °F
