# EXTRACTING WEATHER FEATURES

This notebook extracts weather features from the two weather datasets provided. For each trip it builds a dict of weather features, from which the desired features can be selected, or all of them can be combined with the existing dataset.

In [462]:
import pandas as pd
import datetime as dt

Read the training set 

In [463]:
# Load just the first 10 rows of the training set to keep execution time low.
dftrain = pd.read_csv(filepath_or_buffer='train.csv', nrows=10)
dftrain.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# USING CENTRAL PARK WEATHER DATASET

Read the weather features

In [464]:
# Load the Central Park weather table; the preview shows its 'date' column
# holds strings such as '01-01-16'.
dfweather = pd.read_csv(filepath_or_buffer='weather_2016.csv')
dfweather.head(n=5)

Unnamed: 0,date,maximum temerature,minimum temperature,average temperature,precipitation,snow fall,snow depth
0,01-01-16,44,34,39.0,0,0,0
1,02-01-16,40,30,35.0,0,0,0
2,03-01-16,46,33,39.5,0,0,0
3,04-01-16,35,13,24.0,0,0,0
4,05-01-16,29,10,19.5,0,0,0


Index each row by the date

In [465]:
# Use the date as the row label so a day's weather can be looked up with .loc.
# Guarded and non-in-place so that re-running this cell is harmless: an
# unconditional set_index("date") raises KeyError once "date" has already been
# moved from the columns into the index.
if dfweather.index.name != "date":
    dfweather = dfweather.set_index("date")
dfweather.head()

Unnamed: 0_level_0,maximum temerature,minimum temperature,average temperature,precipitation,snow fall,snow depth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01-01-16,44,34,39.0,0,0,0
02-01-16,40,30,35.0,0,0,0
03-01-16,46,33,39.5,0,0,0
04-01-16,35,13,24.0,0,0,0
05-01-16,29,10,19.5,0,0,0


Define a function that looks up the weather features for a given input date (formatted `YYYY-MM-DD`, as in the training and testing sets) and returns them as a dict

In [466]:
def weather_extract(date, weather=None):
    """Return the daily weather features for one calendar date as a dict.

    Parameters
    ----------
    date : str
        Date in 'YYYY-MM-DD' form, as used by the training and testing sets.
        Only the first 10 characters are parsed, so a full pickup timestamp
        such as '2016-03-14 17:24:55' can be passed directly.
    weather : pandas.DataFrame, optional
        Frame whose index holds date strings in 'DD-MM-YY' form. Defaults to
        the notebook-level ``dfweather``; passing it explicitly lets the
        function be used without relying on notebook state.

    Returns
    -------
    dict
        Maps each column of ``weather`` to its value for that date. Values are
        returned exactly as stored in the frame (no cleaning or type
        conversion is done here).

    Raises
    ------
    ValueError
        If ``date`` does not start with a valid 'YYYY-MM-DD' date.
    KeyError
        If the date is not present in the index of ``weather``.
    """
    if weather is None:
        # Resolved at call time so the function picks up the indexed frame.
        weather = dfweather

    #Need to modify the date format from training and testing sets
    date_format = dt.datetime.strptime(str(date)[:10], '%Y-%m-%d').strftime('%d-%m-%y')

    try:
        row = weather.loc[date_format]
    except KeyError:
        raise KeyError("No weather record for date %s (index key %r)" % (str(date)[:10], date_format))

    # A duplicated index label makes .loc return a DataFrame; dict() of that
    # would map columns to whole Series, so keep only the first matching row.
    if isinstance(row, pd.DataFrame):
        row = row.iloc[0]

    #Output a dict containing features
    out = dict(row)
    return out

We can now extract desired features given a particular date

In [467]:
# Work on an id-indexed deep copy so the original training frame is untouched.
dftrain_weather = dftrain.copy(deep=True).set_index("id")

def _daily_weather_for(row):
    """Return the daily weather features for this trip's pickup date as a Series."""
    pickup_date = row['pickup_datetime'][:10]  # keep only the 'YYYY-MM-DD' part
    return pd.Series(weather_extract(pickup_date))

dftrain_weather.apply(_daily_weather_for, axis=1)

Unnamed: 0_level_0,average temperature,maximum temerature,minimum temperature,precipitation,snow depth,snow fall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id2875421,45.0,48,42,0.89,0,0
id2377394,72.0,84,60,0,0,0
id3858529,24.5,30,19,0,T,0
id3504673,36.0,46,26,0,0,0
id2181028,42.5,49,36,0,0,0
id0801584,33.0,40,26,0,1,0
id1813257,69.5,76,63,0,0,0
id1324603,57.0,63,51,T,0,0
id1301050,70.0,78,62,0,0,0
id0012891,59.5,68,51,T,0,0


# USING THE KNYCMETARS DATASET

In [468]:
# Load the KNYC METAR observations; the preview shows one row per hourly 'Time' stamp.
dfweather_knyc = pd.read_csv(filepath_or_buffer='knycmetars.csv')
dfweather_knyc.head(n=5)

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions
0,2015-12-31 02:00:00,7.8,7.1,,0.89,1017.0,6.1,8.0,NNE,5.6,0.0,0.8,,Overcast
1,2015-12-31 03:00:00,7.2,5.9,,0.9,1016.5,5.6,12.9,Variable,7.4,0.0,0.3,,Overcast
2,2015-12-31 04:00:00,7.2,,,0.9,1016.7,5.6,12.9,Calm,0.0,0.0,0.0,,Overcast
3,2015-12-31 05:00:00,7.2,5.9,,0.86,1015.9,5.0,14.5,NW,7.4,0.0,0.0,,Overcast
4,2015-12-31 06:00:00,7.2,6.4,,0.9,1016.2,5.6,11.3,West,5.6,0.0,0.0,,Overcast


Index the dataset by timestamp

In [469]:
# Use the observation timestamp as the row label so it can be looked up with .loc.
# Guarded and non-in-place so that re-running this cell is harmless: an
# unconditional set_index("Time") raises KeyError once "Time" has already been
# moved from the columns into the index.
if dfweather_knyc.index.name != "Time":
    dfweather_knyc = dfweather_knyc.set_index("Time")
dfweather_knyc.head()

Unnamed: 0_level_0,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-12-31 02:00:00,7.8,7.1,,0.89,1017.0,6.1,8.0,NNE,5.6,0.0,0.8,,Overcast
2015-12-31 03:00:00,7.2,5.9,,0.9,1016.5,5.6,12.9,Variable,7.4,0.0,0.3,,Overcast
2015-12-31 04:00:00,7.2,,,0.9,1016.7,5.6,12.9,Calm,0.0,0.0,0.0,,Overcast
2015-12-31 05:00:00,7.2,5.9,,0.86,1015.9,5.0,14.5,NW,7.4,0.0,0.0,,Overcast
2015-12-31 06:00:00,7.2,6.4,,0.9,1016.2,5.6,11.3,West,5.6,0.0,0.0,,Overcast


Extract weather features from the KNYC dataset. The pickup time is truncated to the start of its hour (`HH:00:00`) before the lookup.

In [470]:
def weather_extract_knyc(pickup_datetime, weather=None):
    """Return the KNYC weather observation for a pickup timestamp as a dict.

    The time of day is truncated to the start of its hour (minutes and
    seconds set to zero) and the resulting 'date HH:00:00' string is looked
    up in the index of the weather frame.

    Parameters
    ----------
    pickup_datetime : str
        Timestamp in 'YYYY-MM-DD HH:MM:SS' form. Other objects are converted
        with ``str()`` first.
    weather : pandas.DataFrame, optional
        Frame whose index holds 'YYYY-MM-DD HH:00:00' strings. Defaults to the
        notebook-level ``dfweather_knyc``; passing it explicitly lets the
        function be used without relying on notebook state.

    Returns
    -------
    dict
        Maps each column of ``weather`` to its value for that hour. Values are
        returned exactly as stored in the frame.

    Raises
    ------
    IndexError
        If ``pickup_datetime`` has no whitespace-separated time part.
    ValueError
        If the time part is not in 'HH:MM:SS' form.
    KeyError
        If there is no observation for that hour in ``weather``.
    """
    if weather is None:
        # Resolved at call time so the function picks up the indexed frame.
        weather = dfweather_knyc

    parts = str(pickup_datetime).split()
    date = parts[0]
    time = parts[1]

    # Drop minutes and seconds so the key lines up with the hourly index.
    time_format = dt.datetime.strptime(time, '%H:%M:%S').strftime('%H:00:00')
    timestamp = date + ' ' + time_format

    try:
        row = weather.loc[timestamp]
    except KeyError:
        raise KeyError("No KNYC weather observation for hour %r" % timestamp)

    # A duplicated index label makes .loc return a DataFrame; dict() of that
    # would map columns to whole Series, so keep only the first matching row.
    if isinstance(row, pd.DataFrame):
        row = row.iloc[0]

    #Output a dict containing features
    out = dict(row)
    return out
        

The weather data can then be combined with the existing dataset using the trip `id`

In [471]:
# Work on an id-indexed deep copy so the original training frame is untouched.
dftrain_knyc = dftrain.copy(deep=True).set_index("id")

def _hourly_weather_for(row):
    """Return the KNYC weather features for this trip's pickup timestamp as a Series."""
    pickup_stamp = str(row['pickup_datetime'])
    return pd.Series(weather_extract_knyc(pickup_stamp))

dftrain_knyc.apply(_hourly_weather_for, axis=1)

Unnamed: 0_level_0,Conditions,Dew Point,Events,Gust Speed,Heat Index,Humidity,Precip,Pressure,Temp.,Visibility,Wind Dir,Wind Speed,Windchill
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
id2875421,Overcast,2.2,,57.4,,0.86,0.3,1017.5,4.4,8.0,ENE,27.8,-0.5
id2377394,Unknown,18.3,,0.0,29.9,0.53,0.0,1006.6,28.9,16.1,West,7.4,
id3858529,Clear,-16.7,,46.3,,0.46,0.0,1016.3,-6.7,16.1,West,24.1,-14.3
id3504673,Clear,-6.1,,35.2,,0.39,0.0,1019.1,7.2,16.1,South,25.9,3.3
id2181028,Clear,-1.7,,0.0,,0.46,0.0,1026.9,9.4,16.1,Variable,9.3,
id0801584,Clear,-8.3,,29.6,,0.41,0.0,1015.8,3.9,16.1,SW,16.7,0.2
id1813257,Clear,9.4,,0.0,,0.54,0.0,1018.1,18.9,16.1,Variable,7.4,
id1324603,Clear,5.0,,0.0,,0.55,0.0,1021.3,13.9,16.1,Calm,0.0,
id1301050,Clear,17.2,,0.0,,0.6,0.0,1017.3,25.6,12.9,Variable,9.3,
id0012891,Overcast,8.9,,0.0,,0.38,0.0,1011.8,23.9,16.1,West,7.4,
