# Chapter 2: Handling Data
## 2.1 Gather Data
* Handling Data  
* Gather Data  
  * Provided by Others  
  * Finding Sources  
    * Search Engines  
    * Direct from the Source  
    * Universities  
    * General Data Applications  
    * Topical Data  
      * Geography  
      * Sports  
      * World  
      * Government and Politics  
  * Data Scraping  
    * Example: Scrape a Website
    * Generalizing the Example

## 2.2 Formatting Data
* Data Formats
  * Delimited Text
  * JavaScript Object Notation (JSON)
  * Extensible Markup Language (XML)
* Formatting Tools
  * Google Refine
  * Mr. Data Converter
  * Mr. People
  * Spreadsheet Software
* Formatting with Code
  * Example: Switch Between Data Formats
    * Put Logic in the Loop

## 2.3 Wrapping Up

## 2.4 Exercise

In [None]:
# %load ch02/add-csv-flag.py
import csv

# Readings at or below this value are flagged as freezing
# (presumably degrees Fahrenheit -- confirm against the data source).
FREEZING_THRESHOLD = 32


def add_freezing_flag(rows):
    """Yield each (date, temperature) row as a CSV line with a freezing flag.

    Args:
        rows: Iterable of sequences whose first item is the date text and
            whose second item is the temperature as integer text.

    Yields:
        ``"date,temperature,flag"`` strings, where flag is ``'1'`` when the
        temperature is at or below ``FREEZING_THRESHOLD`` and ``'0'`` otherwise.

    Raises:
        ValueError: If a temperature is not integer text.
        IndexError: If a row has fewer than two fields.
    """
    for row in rows:
        is_freezing = '1' if int(row[1]) <= FREEZING_THRESHOLD else '0'
        yield row[0] + "," + row[1] + "," + is_freezing


if __name__ == "__main__":
    # `with` closes the file even if a malformed row raises; newline='' lets
    # the csv module do its own line-ending handling.
    with open('ch02/wunder-data.txt', 'r', newline='') as data_file:
        for line in add_freezing_flag(csv.reader(data_file, delimiter=",")):
            print(line)

In [None]:
# %load ch02/csv-to-json.py
import csv
import json


def parse_temperature(text):
    """Return ``text`` as an int or float when it is numeric, else unchanged.

    The CSV holds temperatures as text. Converting them keeps numeric
    readings as JSON numbers, while anything non-numeric is emitted as a
    quoted JSON string instead of breaking the document.
    """
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return text


def rows_to_json(rows):
    """Serialize (date, temperature) rows as a JSON document.

    Args:
        rows: Iterable of sequences whose first item is the date text and
            whose second item is the temperature text.

    Returns:
        A JSON string of the form
        ``{"observations": [{"date": ..., "temperature": ...}, ...]}``.
        Building it with ``json.dumps`` guarantees quoted keys and correct
        separators for any number of rows.

    Raises:
        IndexError: If a row has fewer than two fields.
    """
    observations = [
        {"date": row[0], "temperature": parse_temperature(row[1])}
        for row in rows
    ]
    return json.dumps({"observations": observations}, indent=2)


if __name__ == "__main__":
    # `with` closes the file even on error; newline='' lets the csv module
    # do its own line-ending handling.
    with open('ch02/wunder-data.txt', 'r', newline='') as data_file:
        print(rows_to_json(csv.reader(data_file, delimiter=",")))

In [None]:
# %load ch02/csv-to-xml.py
import csv
from xml.sax.saxutils import escape


def rows_to_xml_lines(rows):
    """Yield the lines of a ``<weather_data>`` XML document for the rows.

    Args:
        rows: Iterable of sequences whose first item is the date text and
            whose second item is the temperature text.

    Yields:
        One output line at a time: the opening root tag, an
        ``<observation>`` element (with ``<date>`` and ``<max_temperature>``
        children) per row, then the closing root tag.

    Raises:
        IndexError: If a row has fewer than two fields.
    """
    yield '<weather_data>'
    for row in rows:
        yield '<observation>'
        # escape() keeps the document well-formed if a field contains
        # '<', '>' or '&'.
        yield '<date>' + escape(row[0]) + '</date>'
        yield '<max_temperature>' + escape(row[1]) + '</max_temperature>'
        yield '</observation>'
    yield '</weather_data>'


if __name__ == "__main__":
    # `with` closes the file even on error; newline='' lets the csv module
    # do its own line-ending handling.
    with open('ch02/wunder-data.txt', 'r', newline='') as data_file:
        for line in rows_to_xml_lines(csv.reader(data_file, delimiter=",")):
            print(line)

In [None]:
# %load ch02/get-weather-data.py
import calendar
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Scrape one temperature reading per day of YEAR for airport code KBUF from
# wunderground.com and store them as "YYYYMMDD,temperature" lines.
YEAR = 2009

# Create/open wunder-data.txt (a comma-delimited file). The `with` block
# closes it -- flushing every row already written -- even if a request or a
# parse fails part-way through the year.
with open('ch02/wunder-data.txt', 'w') as f:

    # Iterate through every month and day of the year
    for m in range(1, 13):
        # monthrange() returns (weekday of the 1st, number of days in month)
        days_in_month = calendar.monthrange(YEAR, m)[1]

        for d in range(1, days_in_month + 1):

            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/KBUF/" + str(YEAR) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html"

            # Parse inside the `with` so the HTTP response is closed once read
            with urlopen(url) as page:
                soup = BeautifulSoup(page)

            # Get temperature from page: the <span> text of the sixth element
            # with class "nobr". NOTE: this index depends on the page layout.
            dayTemp = soup.find_all(attrs={"class": "nobr"})[5].span.string

            # Build zero-padded YYYYMMDD timestamp
            timestamp = '{}{:02d}{:02d}'.format(YEAR, m, d)

            # Write timestamp and temperature to file
            f.write(timestamp + ',' + dayTemp + '\n')


In [None]:
# %load ch02/get-weather-data-full.py
import calendar
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Scrape one temperature reading per day, for every year in the range below,
# for airport code KBUF from wunderground.com and store them as
# "YYYYMMDD,temperature" lines.

# Create/open wunder-data.txt (a comma-delimited file). The `with` block
# closes it -- flushing every row already written -- even if a request or a
# parse fails part-way through.
with open('wunder-data.txt', 'w') as f:

    # Iterate through year, month, and day
    for y in range(2009, 2010):
        for m in range(1, 13):
            # monthrange() returns (weekday of the 1st, number of days in
            # month); it accounts for leap years and for which months have
            # 30 days, so every real date is visited exactly once.
            days_in_month = calendar.monthrange(y, m)[1]

            for d in range(1, days_in_month + 1):

                # Open wunderground.com url
                url = "http://www.wunderground.com/history/airport/KBUF/"+str(y)+ "/" + str(m) + "/" + str(d) + "/DailyHistory.html"

                # Parse inside the `with` so the HTTP response is closed once read
                with urlopen(url) as page:
                    soup = BeautifulSoup(page)

                # Get temperature from page: the <span> text of the sixth
                # element with class "nobr". NOTE: this index depends on the
                # page layout.
                dayTemp = soup.find_all(attrs={"class": "nobr"})[5].span.string

                # Build zero-padded YYYYMMDD timestamp
                timestamp = '{}{:02d}{:02d}'.format(y, m, d)

                # Write timestamp and temperature to file
                f.write(timestamp + ',' + dayTemp + '\n')


In [None]:
# %load ch02/xml-to-csv.py
from bs4 import BeautifulSoup

# Convert the <observation> elements of wunder-data.xml back into
# "date,max_temperature" CSV lines on standard output.

# Read the whole document up front; the `with` block closes the file as soon
# as its contents are in memory.
with open('ch02/wunder-data.xml', 'r') as f:
    xml = f.read()

# features='xml' selects the XML parser directly; this is what the deprecated
# BeautifulStoneSoup class did internally.
soup = BeautifulSoup(xml, features='xml')
observations = soup.find_all('observation')
for o in observations:
    print(o.date.string + "," + o.max_temperature.string)
    