# Taxi Data

In [39]:
import os

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt


import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [76]:
#import data (Yellow Taxi June 2015) from http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml

name = 'yellow_tripdata_2015-06.csv'
fullframe = pd.read_csv(name)

# set sample size, original frame is too large to use practically (time constraints)
ssize = .001

In [41]:
fullframe.shape

(12324935, 19)

In [80]:
# clean up column names
fullframe.rename(columns={
        'tpep_pickup_datetime' : 'pickup_datetime',
        'tpep_dropoff_datetime' : 'dropoff_datetime'
    }, inplace=True)

#convert pickup and dropoff to from strings to date objects
fullframe['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
fullframe['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
fullframe.head()

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0,0.5,0.0,0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0,0.5,1.0,0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0,0.5,2.2,0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0,0.5,2.86,0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0,0.5,0.0,0,0.3,10.3


In [None]:
sframe = fullframe.sample(frac = ssize)
# sframe.reset_index(drop=True, inplace=True)

# components returns a 7-tuple --> (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds)
# temp.components[2] # minutes of a journey
# temp.components[3] # seconds of a journey

sframe['pickup_seconds'] = sframe.pickup_datetime.apply(lambda x: x.hour*3600 + x.minute*60 + x.second)
sframe['dropoff_seconds'] = sframe.dropoff_datetime.apply(lambda x: x.hour*3600 + x.minute*60 + x.second)

sframe['pickup_minutes'] = sframe.pickup_datetime.apply(lambda x: x.hour*60 + x.minute + float(x.second)/60)
sframe['dropoff_minutes'] = sframe.dropoff_datetime.apply(lambda x: x.hour*60 + x.minute + float(x.second)/60)

sframe['pickup_hours'] = sframe.pickup_datetime.apply(lambda x: x.hour + float(x.minute)/60 + float(x.second)/3600)
sframe['dropoff_seconds'] = sframe.dropoff_datetime.apply(lambda x: x.hour + float(x.minute)/60 + float(x.second)/3600)

sframe['trip_seconds'] = (sframe.dropoff_datetime - sframe.pickup_datetime).apply(lambda x: float(x)/1000000000)

sframe['trip_minutes'] = (sframe.dropoff_datetime - sframe.pickup_datetime).apply(lambda x: float(x)/60000000000)

sframe['trip_hours'] = (sframe.dropoff_datetime - sframe.pickup_datetime).apply(lambda x: float(x)/3.6e12)

In [84]:
sframe.head()

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_seconds,dropoff_seconds,pickup_minutes,dropoff_minutes,pickup_hours
588619,1,2015-06-02 18:30:21,2015-06-02 19:09:55,1,7.8,-73.99501,40.734112,1,N,-73.949799,40.801804,2,30.0,1.0,0.5,0.0,0,0.3,31.8,66621,19.165278,1110.35,1149.916667,18.505833
257698,2,2015-06-01 08:11:46,2015-06-01 08:17:40,4,1.06,-73.97364,40.7551,1,N,-73.985619,40.747036,1,6.0,0.0,0.5,1.7,0,0.3,8.5,29506,8.294444,491.766667,497.666667,8.196111
5729594,2,2015-06-17 19:02:15,2015-06-17 19:20:38,2,3.16,-73.99012,40.757809,1,N,-73.999741,40.722099,1,14.5,1.0,0.5,3.26,0,0.3,19.56,68535,19.343889,1142.25,1160.633333,19.0375
436926,1,2015-06-06 21:19:08,2015-06-06 21:54:02,2,10.9,-73.970291,40.789108,1,N,-73.976204,40.683479,1,36.0,0.5,0.5,7.45,0,0.3,44.75,76748,21.900556,1279.133333,1314.033333,21.318889
564548,1,2015-06-01 11:57:04,2015-06-01 12:04:45,1,0.7,-73.96814,40.758984,1,N,-73.961761,40.768414,1,7.0,0.0,0.5,1.5,0,0.3,9.3,43024,12.079167,717.066667,724.75,11.951111


In [90]:
for x in (sframe.dropoff_datetime - sframe.pickup_datetime).values[:4]:
    print float(x)/60000000000

39.5666666667
5.9
18.3833333333
34.9
