# Webscraper Example

The goal of this notebook is to test out the webscraper implementation of libraries *request* and *beautifulsoup4*.  We will go through a simple example of creating a dataframe from information provided in weather.com.

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Request page
%time page = requests.get("https://weather.com/weather/tenday/l/USCT0257:1:US")

page
# a response of 200 indicates page loaded

CPU times: user 123 ms, sys: 21.4 ms, total: 145 ms
Wall time: 3.53 s


<Response [200]>

In [3]:
# Note that the webpage links to the weather.com site for weston.  Let's graph the days in the 10 day forecast.

# Initialize soup parser
soup = BeautifulSoup(page.content, 'html.parser')

# Find all tags with class "day-detail clearfix"
soup.find_all(class_="day-detail clearfix")

date = []
for i in soup.find_all(class_="day-detail clearfix"):
    date.append(i.get_text())
    
date


[u'MAY 23',
 u'MAY 24',
 u'MAY 25',
 u'MAY 26',
 u'MAY 27',
 u'MAY 28',
 u'MAY 29',
 u'MAY 30',
 u'MAY 31',
 u'JUN 1',
 u'JUN 2',
 u'JUN 3',
 u'JUN 4',
 u'JUN 5',
 u'JUN 6']

In [4]:
# Now, let's grab the temperature for each day
# Note that select identifies css labels - behaves just as find_all
temps = soup.select("td.temp span")

max_temp = [] 
min_temp = []
sep = u'\xb0'


for i in range(1, len(temps)-1):
    if i % 3 == 0:
        max_temp.append(temps[i].get_text().split(sep, 1)[0])
    elif i % 3 == 2:
        min_temp.append(temps[i].get_text().split(sep, 1)[0])

[min_temp, max_temp]



[[u'54',
  u'53',
  u'54',
  u'53',
  u'54',
  u'55',
  u'55',
  u'59',
  u'55',
  u'56',
  u'57',
  u'57',
  u'58',
  u'60'],
 [u'68',
  u'58',
  u'64',
  u'70',
  u'63',
  u'63',
  u'72',
  u'74',
  u'71',
  u'72',
  u'74',
  u'74',
  u'73',
  u'76']]

In [7]:
# Create dataframe of weather
# Get rid of first day in forecast (today)
date.pop(0)

weather_10_day_forcast = pd.DataFrame({"Date": date, "Min": min_temp, "Max": max_temp})
weather_10_day_forcast

Unnamed: 0,Date,Max,Min
0,MAY 24,68,54
1,MAY 25,58,53
2,MAY 26,64,54
3,MAY 27,70,53
4,MAY 28,63,54
5,MAY 29,63,55
6,MAY 30,72,55
7,MAY 31,74,59
8,JUN 1,71,55
9,JUN 2,72,56


In [16]:
# Access short description
short_des = soup.select("td.description span")
short_des.pop(0)
sd = []
for i in short_des:
    sd.append(i.get_text())
    
sd = pd.Series(sd)
weather_10_day_forcast["Short Description"] = sd
weather_10_day_forcast


Unnamed: 0,Date,Max,Min,Short Description
0,MAY 24,68,54,AM Showers
1,MAY 25,58,53,Rain
2,MAY 26,64,54,Scattered Thunderstorms
3,MAY 27,70,53,Partly Cloudy
4,MAY 28,63,54,PM Showers
5,MAY 29,63,55,Cloudy
6,MAY 30,72,55,PM Thunderstorms
7,MAY 31,74,59,Mostly Sunny
8,JUN 1,71,55,Partly Cloudy
9,JUN 2,72,56,Mostly Sunny
