# Dowload the weather data from wunderground.com API

We use [wunderground.com API](https://www.wunderground.com/weather/api) to download weather data in XML format. One API call can be used to download data for a chosen airport and chosen date. This code can be used to generate a data frame and hence a csv file for weather data for a given airport, given number of days, and a given starting date.


There are 4 input parameters:

1. state: State of the airport

2. airport: Name of the airport code

3. ndays: Number of days we wish to download the data for (note that this number has some limitation per day)

4. first_day: First date of the data we wish to start downloading 

## Loading modules

In [173]:
from urllib.request import urlopen
from xml.etree import ElementTree as ET
from lxml import etree
import pandas as pd
import numpy as np
import time
import pickle
from collections import OrderedDict
from lxml.etree import tostring as ts
from lxml.etree import fromstring as fs

## Setting data frame display option

In [174]:
pd.set_option('display.max_columns', None)

## Input parameters

In [4]:
#Note we start collecting data from Jan 01, 2015
state = "NY"
airport = "JFK"
ndays = 500
first_day = '2016-07-14'
API_key = # provide your wunderground API here

## Function to download tree for a given day (and given location)

In [1]:
def get_observations_oneday(date,state, airport):
    year = str(date.year)
    month = str(date.month).zfill(2)
    day = str(date.day).zfill(2)
    url = 'http://api.wunderground.com/api/'+API_key+'/history_'+year+month+day+'/q/'+state+'/'+airport+'.xml'
    tree = ET.parse(urlopen(url))
    root = tree.getroot()
    obs = root[3].getchildren()[2]
    return(obs)
#http://api.wunderground.com/api/key/history_20160918/q/GA/ATL.xml

## Function to parse a tree and convert that to a data frame

In [2]:
def get_df_oneday(obs, date, state, airport):
    lists = [[] for _ in range(24)]
    for ob in obs.getchildren():
        ob_date_element = ob.getchildren()[0]
        lists[0].append(ob_date_element.getchildren()[1].text) # Year
        lists[1].append(ob_date_element.getchildren()[2].text) # Month
        lists[2].append(ob_date_element.getchildren()[3].text) # DayOfMonth
        lists[3].append(ob_date_element.getchildren()[4].text) # Hour
        lists[4].append(ob_date_element.getchildren()[5].text) # Minute
        lists[5].append(ob_date_element.getchildren()[6].text) # TimeZone
        lists[6].append(ob.getchildren()[3].text) # Temperature in F
        lists[7].append(ob.getchildren()[5].text) # DewPoint in F
        lists[8].append(ob.getchildren()[6].text) # Humidity in %
        lists[9].append(ob.getchildren()[8].text) # WindSpeed in mph
        lists[10].append(ob.getchildren()[10].text) # WindGust in mph
        lists[11].append(ob.getchildren()[11].text) # WindDirection in degrees (0 or 360 - N, 270 - W, 90 - E, 180 - S)
        lists[12].append(ob.getchildren()[14].text) # Visibility in miles
        lists[13].append(ob.getchildren()[16].text) # Pressure in inHg
        lists[14].append(ob.getchildren()[18].text) # WindChill in F
        lists[15].append(ob.getchildren()[20].text) # HeatIndex in F
        lists[16].append(ob.getchildren()[22].text) # Precipitation in inches
        lists[17].append(ob.getchildren()[23].text) # Condition
        lists[18].append(ob.getchildren()[25].text) # Fog
        lists[19].append(ob.getchildren()[26].text) # Rain
        lists[20].append(ob.getchildren()[27].text) # Snow
        lists[21].append(ob.getchildren()[28].text) # Hail
        lists[22].append(ob.getchildren()[29].text) # Thunder
        lists[23].append(ob.getchildren()[30].text) # Tornado
    
    df = pd.DataFrame(OrderedDict({"Year": lists[0], "Month": lists[1], "DayOfMonth": lists[2], "Hour": lists[3],
                  "Minute": lists[4], "TimeZone": lists[5], "Temperature": lists[6], "DewPoint": lists[7],
                  "Humidity": lists[8], "WindSpeed": lists[9], "WindGust": lists[10], "WindDirection": lists[11],
                  "Visibility": lists[12], "Pressure": lists[13], "WindChill": lists[14], "HeatIndex": lists[15],
                  "Precipitation": lists[16], "Condition": lists[17], "Fog": lists[18], "Rain": lists[19],
                  "Snow": lists[20], "Hail": lists[21], "Thunder": lists[22], "Tornado": lists[23]}))
    
    # For some hours, sometimes there are multiple entries corresponding to multiple minutes.
    # In such cases, we just pick the measurement for the latest minute in a given hour.
    # This is done to be consistent with the flight data where we do not have records of multiple flights 
    # (i.e. same uniue flights) in a single hour. 
    df = df.iloc[df.groupby('Hour')['Minute'].agg(pd.Series.idxmax)]
    df.Hour = df.Hour.astype(int)
    df["Date"] = date
    df["State"] = state
    df["Airport"] = airport
    return(df)

## Loop over all days and concatenate data frames for all days in to a single data frame 

In [479]:
# Dates array
dates = pd.date_range(first_day, periods=ndays, freq='D')
# Initializing a list to contain all observations trees
obs_store = [] # this is optional (just to store something in case we loose data frames!)
# Initializing with first date, i.e. Jan 01, 2015
date = dates[0]
# Getting observations for the first date
observations = get_observations_oneday(date, state, airport)
obs_store.append(observations)
# Converting the observations into data frame
df = get_df_oneday(observations, date, state, airport)
# Looping through all dates and concatinating data frames for each day into one big data frame
i = 0
for date in dates:
    if (i != 0):
        observations = get_observations_oneday(date, state, airport)
        obs_store.append(observations) # optional
        df = pd.concat([df, get_df_oneday(observations, date, state, airport)])
        if (i % 10 == 0):
            print("Data processed for date: ", date)
    i = i + 1
    time.sleep(6) # the loop is paused to satisfy wunderground.com's API rate limiting criteria

Data processed for date:  2016-07-24 00:00:00
Data processed for date:  2016-08-03 00:00:00
Data processed for date:  2016-08-13 00:00:00
Data processed for date:  2016-08-23 00:00:00
Data processed for date:  2016-09-02 00:00:00
Data processed for date:  2016-09-12 00:00:00
Data processed for date:  2016-09-22 00:00:00
Data processed for date:  2016-10-02 00:00:00
Data processed for date:  2016-10-12 00:00:00
Data processed for date:  2016-10-22 00:00:00
Data processed for date:  2016-11-01 00:00:00
Data processed for date:  2016-11-11 00:00:00
Data processed for date:  2016-11-21 00:00:00
Data processed for date:  2016-12-01 00:00:00
Data processed for date:  2016-12-11 00:00:00
Data processed for date:  2016-12-21 00:00:00
Data processed for date:  2016-12-31 00:00:00


In [480]:
df.head()

Unnamed: 0,Year,Month,DayOfMonth,Hour,Minute,TimeZone,Temperature,DewPoint,Humidity,WindSpeed,WindGust,WindDirection,Visibility,Pressure,WindChill,HeatIndex,Precipitation,Condition,Fog,Rain,Snow,Hail,Thunder,Tornado,Date,State,Airport
0,2016,07,14,0,51,America/New_York,75.0,70.0,84,10.4,-9999.0,160,10.0,30.00,-999,-9999,-9999.00,Overcast,0,0,0,0,0,0,2016-07-14,NY,JFK
1,2016,07,14,1,51,America/New_York,75.0,71.1,87,10.4,-9999.0,160,10.0,30.01,-999,-9999,-9999.00,Overcast,0,0,0,0,0,0,2016-07-14,NY,JFK
2,2016,07,14,2,51,America/New_York,75.0,73.0,94,10.4,-9999.0,190,10.0,29.99,-999,-9999,-9999.00,Overcast,0,0,0,0,0,0,2016-07-14,NY,JFK
3,2016,07,14,3,51,America/New_York,75.0,73.0,94,10.4,-9999.0,180,10.0,29.98,-999,-9999,0.00,Overcast,0,0,0,0,0,0,2016-07-14,NY,JFK
4,2016,07,14,4,51,America/New_York,75.0,73.0,94,11.5,-9999.0,180,10.0,29.98,-999,-9999,-9999.00,Overcast,0,0,0,0,0,0,2016-07-14,NY,JFK
5,2016,07,14,5,51,America/New_York,75.0,73.0,94,9.2,-9999.0,150,10.0,29.95,-999,-9999,-9999.00,Scattered Clouds,0,0,0,0,0,0,2016-07-14,NY,JFK
7,2016,07,14,6,51,America/New_York,75.0,73.0,94,11.5,-9999.0,150,7.0,29.95,-999,-9999,-9999.00,Mostly Cloudy,0,0,0,0,0,0,2016-07-14,NY,JFK
8,2016,07,14,7,51,America/New_York,75.9,73.0,91,12.7,-9999.0,180,7.0,29.94,-999,-9999,-9999.00,Overcast,0,0,0,0,0,0,2016-07-14,NY,JFK
10,2016,07,14,8,51,America/New_York,77.0,73.9,90,11.5,-9999.0,170,3.0,29.95,-999,-9999,-9999.00,Mostly Cloudy,0,0,0,0,0,0,2016-07-14,NY,JFK
11,2016,07,14,9,51,America/New_York,80.1,75.0,85,10.4,-9999.0,210,7.0,29.94,-999,85.1,-9999.00,Scattered Clouds,0,0,0,0,0,0,2016-07-14,NY,JFK


## Resetting index to make the data frame consistent in terms of indexing

In [481]:
df = df.reset_index()
df = df.drop("index", axis=1)

In [482]:
df.shape

(4100, 27)

## Saving the resulting data frame to a csv file

In [483]:
first_date = str(dates[0].year)+str(dates[0].month).zfill(2)+str(dates[0].day).zfill(2)
last_date = str(dates[-1].year)+str(dates[-1].month).zfill(2)+str(dates[-1].day).zfill(2)
output_file = "WeatherData/"+airport+"/"+"Weather_"+state+"_"+airport+"_"+first_date+"-"+last_date+".csv"
df.to_csv(output_file)

## Optional: Saving a pickle file (just in case we loose data frames!)

In [484]:
output_pickle = "WeatherData/"+airport+"/"+"Weather_"+state+"_"+airport+"_"+first_date+"-"+last_date+".pkl"
filehandler = open(output_pickle,"wb")
pickle.dump(obs_store[0:-1],filehandler)
filehandler.close()

In [172]:
len(obs_store)

116

Just in case something happens to data frame result above, we can rerun without calling Wunderground API using following code and regenerate our data frame.

In [32]:
file = open(output_pickle,'rb')
obs_store = pickle.load(file)
file.close()

# Dates array
dates = pd.date_range('2015-01-01', periods=ndays, freq='D')
# Initializing with first date, i.e. Jan 01, 2015
date = dates[0]
# Getting observations for the first date
observations = obs_store[0]
# Converting the observations into data frame
df1 = get_df_oneday(observations, date, state, airport)
# Looping through all dates and concatinating data frames for each day into one big data frame
i = 0
for date in dates:
    if (i != 0):
        observations = obs_store[i]
        df1 = pd.concat([df1, get_df_oneday(observations, date, state, airport)])
    i = i + 1

In [474]:
pd.date_range('2016-07-14', periods=171, freq='D')

DatetimeIndex(['2016-07-14', '2016-07-15', '2016-07-16', '2016-07-17',
               '2016-07-18', '2016-07-19', '2016-07-20', '2016-07-21',
               '2016-07-22', '2016-07-23',
               ...
               '2016-12-22', '2016-12-23', '2016-12-24', '2016-12-25',
               '2016-12-26', '2016-12-27', '2016-12-28', '2016-12-29',
               '2016-12-30', '2016-12-31'],
              dtype='datetime64[ns]', length=171, freq='D')