In [1]:
import pandas as pd
from datetime import datetime
from pandas.tseries.holiday import USFederalHolidayCalendar

In [2]:
# Load the cab trip records from the CSV file into a DataFrame.
# NOTE(review): the first-row printout below shows an "Unnamed: 0" column, presumably
# an index saved by an earlier to_csv call -- confirm whether it should be dropped.
cab_data = pd.read_csv("cab_data_p2.csv")

In [3]:
# Show the first record to inspect the available columns and sample values.
print(cab_data.iloc[0])

Unnamed: 0                                500324
dropoff_latitude                          40.774
dropoff_longitude                       -73.8709
extra                                          0
fare_amount                                   31
improvement_surcharge                        0.3
mta_tax                                      0.5
passenger_count                                2
payment_type                                   1
pickup_latitude                          40.7569
pickup_longitude                        -73.9731
ratecodeid                                     1
store_and_fwd_flag                             N
tip_amount                                     5
tolls_amount                                5.54
total_amount                               42.34
tpep_dropoff_datetime    2016-04-12T10:28:02.000
tpep_pickup_datetime     2016-04-12T10:05:28.000
trip_distance                               10.7
vendorid                                       1
Name: 0, dtype: object

In [4]:
# Sanity check for the US federal holiday lookup
cal = USFederalHolidayCalendar()
us_holidays = cal.holidays(start='2016-01-01', end='2016-12-31').to_pydatetime()

# 2016-01-18 must be reported as one of the 2016 holidays ...
dt = datetime(2016, 1, 18)
assert any(holiday == dt for holiday in us_holidays)

# ... while the day right after it must not be.
dt = datetime(2016, 1, 19)
assert all(holiday != dt for holiday in us_holidays)

In [7]:
def _federal_holidays_spanning(timestamps):
    """Return the US federal holidays of every calendar year covered by `timestamps`."""
    years = timestamps.dt.year
    if years.empty:
        # Nothing to look up; min()/max() of an empty Series would be NaN.
        return pd.DatetimeIndex([])
    return USFederalHolidayCalendar().holidays(
        start="{}-01-01".format(int(years.min())),
        end="{}-12-31".format(int(years.max())),
    )


def create_date_features(df):
    """Add calendar features for the pickup/dropoff timestamps and the trip duration.

    For each of ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime`` the
    following integer columns are added, named ``<column>_<suffix>``:
    ``years``, ``months``, ``days``, ``hours``, ``minutes``, ``seconds``,
    ``day_of_week`` (Monday == 0) and ``is_holiday`` (1 when the calendar date
    is a US federal holiday, else 0). A ``duration`` column holds the trip
    length in minutes (dropoff minus pickup), rounded to the nearest integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the two timestamp columns as strings formatted like
        ``2016-04-12T10:28:02.000``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the new columns are added in place.

    Raises
    ------
    ValueError
        If a timestamp does not match the expected format.
    """
    # "%f" accepts any fractional-second digits, including the ".000" suffix in the data.
    dt_format = '%Y-%m-%dT%H:%M:%S.%f'
    date_feature_names = ["tpep_dropoff_datetime", "tpep_pickup_datetime"]

    parsed = {}
    for feature_name in date_feature_names:
        # Parse the whole column at once instead of calling strptime row by row.
        timestamps = pd.to_datetime(df[feature_name], format=dt_format)
        parsed[feature_name] = timestamps

        # Get the US federal holidays for the years actually present in the
        # data, so dates outside 2016 are not silently marked as non-holidays.
        holidays = _federal_holidays_spanning(timestamps)

        parts = timestamps.dt
        new_columns = [
            ("_years", parts.year),
            ("_months", parts.month),
            ("_days", parts.day),
            ("_hours", parts.hour),
            ("_minutes", parts.minute),
            ("_seconds", parts.second),
            ("_day_of_week", parts.dayofweek),
            # normalize() drops the time of day so only the calendar date is compared.
            ("_is_holiday", parts.normalize().isin(holidays)),
        ]
        for suffix, values in new_columns:
            # .values assigns positionally; int64 is the dtype a list of Python ints gives.
            df[feature_name + suffix] = values.astype("int64").values

    elapsed = parsed["tpep_dropoff_datetime"] - parsed["tpep_pickup_datetime"]
    minutes = elapsed.dt.total_seconds() / 60.0
    # Series.round() rounds halves to the nearest even integer, like built-in round().
    df["duration"] = minutes.round().astype("int64").values
    return df

# Add the date-derived feature columns and the trip duration to the loaded data.
cab_data = create_date_features(cab_data)

In [8]:
# Inspect how the dropoff months and the holiday flag are distributed
print("Distribution of Months",
      cab_data["tpep_dropoff_datetime_months"].value_counts(),
      sep="\n")
print("Distribution of Holidays",
      cab_data["tpep_dropoff_datetime_is_holiday"].value_counts(),
      sep="\n")

0         23
1         15
2          6
3         29
4          4
5          9
6          7
7          9
8         20
9         37
10         7
11         9
12         3
13         3
14         5
15        13
16        21
17        17
18        13
19        17
20         6
21        18
22        92
23        18
24         8
25         7
26        16
27        24
28         5
29        13
          ..
492164    16
492165     9
492166     5
492167    18
492168    28
492169    16
492170     6
492171     4
492172    13
492173    14
492174    17
492175    10
492176    17
492177     4
492178     5
492179    11
492180    15
492181    15
492182     2
492183     4
492184    29
492185     9
492186    13
492187     8
492188    12
492189     5
492190    15
492191    12
492192     5
492193    11
Name: duration, Length: 492194, dtype: int64
Distribution of Months
5    246662
4    245486
6        46
Name: tpep_dropoff_datetime_months, dtype: int64
Distribution of Holidays
0    486831
1      5363
Name: tpep_dropoff_datetime_is_holiday, dtype: int64

In [10]:
# Count how often each distinct dropoff longitude value occurs.
print(cab_data["dropoff_longitude"].value_counts())

-73.982368    158
-73.982384    151
-73.982353    144
-73.982224    144
-73.982201    140
-73.982246    139
-73.981979    138
-73.982292    137
-73.991379    136
-73.982063    136
-73.982048    135
-73.982101    135
-73.991280    134
-73.991318    134
-73.982307    134
-73.982300    133
-73.982338    133
-73.981773    133
-73.982330    133
-73.982269    133
-73.991341    133
-73.981003    132
-73.991493    131
-73.982239    131
-73.978371    131
-73.991081    130
-73.978851    130
-73.978394    130
-73.982162    129
-73.982407    127
             ... 
-73.781548      1
-74.183006      1
-73.918190      1
-73.918114      1
-73.831375      1
-73.920120      1
-73.781982      1
-73.920555      1
-73.826683      1
-73.828148      1
-73.920288      1
-73.925446      1
-73.788574      1
-73.920692      1
-73.921776      1
-73.812416      1
-73.829430      1
-73.822777      1
-73.927406      1
-73.787201      1
-73.922966      1
-73.927528      1
-74.181480      1
-73.920097      1
-73.829231

In [11]:
# Rebuild the 2016 US federal holiday list as an array of datetime objects
# (same two statements as the holiday test cell earlier in the notebook).
cal = USFederalHolidayCalendar()
us_holidays = cal.holidays(start='2016-01-01', end='2016-12-31').to_pydatetime()

In [12]:
# Display the holiday dates that the is_holiday feature is checked against.
print(us_holidays)

[datetime.datetime(2016, 1, 1, 0, 0) datetime.datetime(2016, 1, 18, 0, 0)
 datetime.datetime(2016, 2, 15, 0, 0) datetime.datetime(2016, 5, 30, 0, 0)
 datetime.datetime(2016, 7, 4, 0, 0) datetime.datetime(2016, 9, 5, 0, 0)
 datetime.datetime(2016, 10, 10, 0, 0)
 datetime.datetime(2016, 11, 11, 0, 0)
 datetime.datetime(2016, 11, 24, 0, 0)
 datetime.datetime(2016, 12, 26, 0, 0)]
