In [1]:
pip install featuretools==0.27.1

Collecting featuretools==0.27.1
  Using cached featuretools-0.27.1-py3-none-any.whl (327 kB)
Installing collected packages: featuretools
Successfully installed featuretools-0.27.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import featuretools as ft

In [3]:
import utils

In [4]:
from utils import load_nyc_taxi_data, compute_features, preview, feature_importances
from sklearn.ensemble import GradientBoostingRegressor
from featuretools.primitives import (Minute, Hour, Day, Week, Month,
                                     Weekday, IsWeekend, Count, Sum, Mean, Median, Std, Min, Max)
import numpy as np
ft.__version__
%load_ext autoreload
%autoreload 2

In [5]:
trips, pickup_neighborhoods, dropoff_neighborhoods = load_nyc_taxi_data()
preview(trips, 10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,trip_duration,pickup_neighborhood,dropoff_neighborhood
0,0,2,2016-01-01 00:00:19,2016-01-01 00:06:31,3,1.32,-73.961258,40.7962,-73.95005,40.787312,2,372.0,AH,C
672146,672146,1,2016-04-29 07:01:31,2016-04-29 07:15:46,1,3.3,-73.949951,40.784653,-73.982536,40.75547,1,855.0,C,AA
672147,672147,2,2016-04-29 07:01:43,2016-04-29 07:09:15,1,1.14,-73.967331,40.75737,-73.954277,40.765282,1,452.0,N,K
672148,672148,1,2016-04-29 07:01:46,2016-04-29 07:07:54,1,1.1,-74.003082,40.727509,-73.984703,40.724377,1,368.0,AB,AC
672149,672149,2,2016-04-29 07:01:46,2016-04-29 07:06:48,2,1.4,-73.990158,40.77235,-73.982147,40.7598,1,302.0,AR,AA
672150,672150,1,2016-04-29 07:01:59,2016-04-29 07:07:33,1,1.2,-73.983681,40.746677,-73.971703,40.762463,2,334.0,AO,A
672151,672151,2,2016-04-29 07:02:11,2016-04-29 07:15:24,2,2.13,-73.994209,40.750999,-73.969391,40.761539,1,793.0,D,AK
672152,672152,1,2016-04-29 07:02:11,2016-04-29 07:06:44,1,1.0,-73.983276,40.770985,-73.98011,40.760666,1,273.0,AR,A
672153,672153,2,2016-04-29 07:02:13,2016-04-29 07:08:36,1,1.17,-73.980141,40.743168,-73.983391,40.754665,1,383.0,Y,AA
672154,672154,1,2016-04-29 07:02:16,2016-04-29 07:04:07,1,0.5,-73.965973,40.765381,-73.970558,40.758724,1,111.0,AK,N


In [6]:
entities = {
        "trips": (trips, "id", 'pickup_datetime' ),
        "pickup_neighborhoods": (pickup_neighborhoods, "neighborhood_id"),
        "dropoff_neighborhoods": (dropoff_neighborhoods, "neighborhood_id"),
        }

relationships = [("pickup_neighborhoods", "neighborhood_id", "trips", "pickup_neighborhood"),
                 ("dropoff_neighborhoods", "neighborhood_id", "trips", "dropoff_neighborhood")]

In [7]:
cutoff_time = trips[['id', 'pickup_datetime']]
cutoff_time = cutoff_time[cutoff_time['pickup_datetime'] > "2016-01-12"]
preview(cutoff_time, 10)

Unnamed: 0,id,pickup_datetime
56311,56311,2016-01-12 00:00:25
698765,698765,2016-05-03 18:54:53
698766,698766,2016-05-03 18:55:37
698767,698767,2016-05-03 18:55:38
698768,698768,2016-05-03 18:55:49
698769,698769,2016-05-03 18:55:58
698770,698770,2016-05-03 18:56:22
698771,698771,2016-05-03 18:56:24
698772,698772,2016-05-03 18:56:51
698773,698773,2016-05-03 18:56:56


In [33]:
cutoff_time.head()

Unnamed: 0,id,pickup_datetime
56311,56311,2016-01-12 00:00:25
56312,56312,2016-01-12 00:02:09
56313,56313,2016-01-12 00:02:25
56314,56314,2016-01-12 00:02:41
56315,56315,2016-01-12 00:03:44


In [8]:
trans_primitives = [IsWeekend]

features = ft.dfs(entities=entities,
                  relationships=relationships,
                  target_entity="trips",
                  trans_primitives=trans_primitives,
                  agg_primitives=[],
                  ignore_variables={"trips": ["pickup_latitude", "pickup_longitude",
                                              "dropoff_latitude", "dropoff_longitude"]},
                  features_only=True)

In [9]:
print ("Number of features: %d" % len(features))
features

Number of features: 13


[<Feature: vendor_id>,
 <Feature: passenger_count>,
 <Feature: trip_distance>,
 <Feature: payment_type>,
 <Feature: trip_duration>,
 <Feature: pickup_neighborhood>,
 <Feature: dropoff_neighborhood>,
 <Feature: IS_WEEKEND(dropoff_datetime)>,
 <Feature: IS_WEEKEND(pickup_datetime)>,
 <Feature: pickup_neighborhoods.latitude>,
 <Feature: pickup_neighborhoods.longitude>,
 <Feature: dropoff_neighborhoods.latitude>,
 <Feature: dropoff_neighborhoods.longitude>]

In [10]:
def compute_features(features, cutoff_time):
    # shuffle so we don't see encoded features in the front or backs

    np.random.shuffle(features)
    feature_matrix = ft.calculate_feature_matrix(features,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d',
                                                 verbose=True,entities=entities, relationships=relationships)
    print("Finishing computing...")
    feature_matrix, features = ft.encode_features(feature_matrix, features,
                                                  to_encode=["pickup_neighborhood", "dropoff_neighborhood"],
                                                  include_unknown=False)
    return feature_matrix

In [11]:
feature_matrix = compute_features(features, cutoff_time)

Elapsed: 00:17 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Finishing computing...


In [12]:
preview(feature_matrix, 5)

Unnamed: 0_level_0,dropoff_neighborhood = AD,dropoff_neighborhood = A,dropoff_neighborhood = AA,dropoff_neighborhood = D,dropoff_neighborhood = AR,dropoff_neighborhood = C,dropoff_neighborhood = O,dropoff_neighborhood = N,dropoff_neighborhood = AO,dropoff_neighborhood = AK,...,pickup_neighborhood = AA,pickup_neighborhood = D,pickup_neighborhood = A,pickup_neighborhood = AR,pickup_neighborhood = AK,pickup_neighborhood = AO,pickup_neighborhood = N,pickup_neighborhood = R,pickup_neighborhood = O,IS_WEEKEND(dropoff_datetime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56311,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
691284,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
691285,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
691286,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
691288,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [13]:
# separates the whole feature matrix into train data feature matrix, 
# train data labels, and test data feature matrix 
X_train, y_train, X_test, y_test = utils.get_train_test_fm(feature_matrix,.75)
y_train = np.log(y_train+1)
y_test = np.log(y_test+1)

In [14]:
model = GradientBoostingRegressor(verbose=True)
model.fit(X_train, y_train)
model.score(X_test, y_test)

      Iter       Train Loss   Remaining Time 
         1           0.4925            2.94m
         2           0.4333            2.82m
         3           0.3843            2.79m
         4           0.3446            2.79m
         5           0.3119            2.70m
         6           0.2852            2.62m
         7           0.2634            2.56m
         8           0.2454            2.51m
         9           0.2305            2.47m
        10           0.2183            2.43m
        20           0.1666            2.15m
        30           0.1558            1.82m
        40           0.1514            1.53m
        50           0.1488            1.25m
        60           0.1472           59.73s
        70           0.1458           44.42s
        80           0.1448           29.52s
        90           0.1440           14.72s
       100           0.1433            0.00s


0.7220107526801756

In [18]:
trans_primitives = [Minute, Hour, Day, Week, Month, Weekday, IsWeekend]

features = ft.dfs(entities=entities,
                  relationships=relationships,
                  target_entity="trips",
                  trans_primitives=trans_primitives,
                  agg_primitives=[],
                  ignore_variables={"trips": ["pickup_latitude", "pickup_longitude",
                                              "dropoff_latitude", "dropoff_longitude"]},
                  features_only=True)

In [19]:
print ("Number of features: %d" % len(features))
features

Number of features: 25


[<Feature: vendor_id>,
 <Feature: passenger_count>,
 <Feature: trip_distance>,
 <Feature: payment_type>,
 <Feature: trip_duration>,
 <Feature: pickup_neighborhood>,
 <Feature: dropoff_neighborhood>,
 <Feature: DAY(dropoff_datetime)>,
 <Feature: DAY(pickup_datetime)>,
 <Feature: HOUR(dropoff_datetime)>,
 <Feature: HOUR(pickup_datetime)>,
 <Feature: IS_WEEKEND(dropoff_datetime)>,
 <Feature: IS_WEEKEND(pickup_datetime)>,
 <Feature: MINUTE(dropoff_datetime)>,
 <Feature: MINUTE(pickup_datetime)>,
 <Feature: MONTH(dropoff_datetime)>,
 <Feature: MONTH(pickup_datetime)>,
 <Feature: WEEK(dropoff_datetime)>,
 <Feature: WEEK(pickup_datetime)>,
 <Feature: WEEKDAY(dropoff_datetime)>,
 <Feature: WEEKDAY(pickup_datetime)>,
 <Feature: pickup_neighborhoods.latitude>,
 <Feature: pickup_neighborhoods.longitude>,
 <Feature: dropoff_neighborhoods.latitude>,
 <Feature: dropoff_neighborhoods.longitude>]

In [20]:
feature_matrix = compute_features(features, cutoff_time)

Elapsed: 00:11 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Finishing computing...


In [21]:
preview(feature_matrix, 10)

Unnamed: 0_level_0,WEEKDAY(pickup_datetime),trip_distance,pickup_neighborhood = AD,pickup_neighborhood = AA,pickup_neighborhood = D,pickup_neighborhood = A,pickup_neighborhood = AR,pickup_neighborhood = AK,pickup_neighborhood = AO,pickup_neighborhood = N,...,WEEK(dropoff_datetime),WEEK(pickup_datetime),pickup_neighborhoods.latitude,MINUTE(pickup_datetime),payment_type,MONTH(dropoff_datetime),HOUR(pickup_datetime),HOUR(dropoff_datetime),MINUTE(dropoff_datetime),DAY(pickup_datetime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56311,1,1.61,False,False,False,False,False,False,False,False,...,2,2,40.720245,0,1,1,0,0,11,12
691284,0,0.61,False,False,False,False,False,False,False,False,...,18,18,40.729652,21,1,5,12,12,24,2
691285,0,0.88,False,False,False,False,False,False,False,False,...,18,18,40.77627,22,1,5,12,12,27,2
691286,0,1.9,False,False,False,False,False,False,False,False,...,18,18,40.742531,22,1,5,12,12,48,2
691288,0,1.0,False,False,False,False,False,False,True,False,...,18,18,40.747126,23,1,5,12,12,30,2
691289,0,3.24,False,False,False,False,False,False,False,False,...,18,18,40.721435,23,1,5,12,12,55,2
691290,0,0.1,False,False,False,False,False,True,False,False,...,18,18,40.764723,23,2,5,12,12,26,2
691291,0,1.6,False,False,False,False,False,False,False,False,...,18,18,40.766809,24,1,5,12,12,37,2
691292,0,1.5,True,False,False,False,False,False,False,False,...,18,18,40.752186,24,1,5,12,12,39,2
691293,0,1.89,False,False,False,False,False,False,False,False,...,18,18,40.775299,24,1,5,12,12,34,2


In [22]:
##4.1 and 4.2

In [23]:
# separates the whole feature matrix into train data feature matrix,
# train data labels, and test data feature matrix 
X_train, y_train, X_test, y_test = utils.get_train_test_fm(feature_matrix,.75)
y_train = np.log(y_train+1)
y_test = np.log(y_test+1)

In [24]:
model = GradientBoostingRegressor(verbose=True)
model.fit(X_train,y_train)
model.score(X_test,y_test)

      Iter       Train Loss   Remaining Time 
         1           0.4925            3.95m
         2           0.4333            3.97m
         3           0.3843            3.90m
         4           0.3444            3.86m
         5           0.3117            3.81m
         6           0.2848            3.77m
         7           0.2620            3.72m
         8           0.2435            3.67m
         9           0.2282            3.62m
        10           0.2152            3.58m
        20           0.1588            3.15m
        30           0.1415            2.73m
        40           0.1332            2.32m
        50           0.1283            1.92m
        60           0.1252            1.52m
        70           0.1227            1.14m
        80           0.1207           45.15s
        90           0.1191           22.48s
       100           0.1177            0.00s


0.7755557670208889

In [25]:
trans_primitives = [Minute, Hour, Day, Week, Month, Weekday, IsWeekend]
aggregation_primitives = [Count, Sum, Mean, Median, Std, Max, Min]

features = ft.dfs(entities=entities,
                  relationships=relationships,
                  target_entity="trips",
                  trans_primitives=trans_primitives,
                  agg_primitives=aggregation_primitives,
                  ignore_variables={"trips": ["pickup_latitude", "pickup_longitude",
                                              "dropoff_latitude", "dropoff_longitude"]},
                  features_only=True)

In [26]:
print ("Number of features: %d" % len(features))
features

Number of features: 63


[<Feature: vendor_id>,
 <Feature: passenger_count>,
 <Feature: trip_distance>,
 <Feature: payment_type>,
 <Feature: trip_duration>,
 <Feature: pickup_neighborhood>,
 <Feature: dropoff_neighborhood>,
 <Feature: DAY(dropoff_datetime)>,
 <Feature: DAY(pickup_datetime)>,
 <Feature: HOUR(dropoff_datetime)>,
 <Feature: HOUR(pickup_datetime)>,
 <Feature: IS_WEEKEND(dropoff_datetime)>,
 <Feature: IS_WEEKEND(pickup_datetime)>,
 <Feature: MINUTE(dropoff_datetime)>,
 <Feature: MINUTE(pickup_datetime)>,
 <Feature: MONTH(dropoff_datetime)>,
 <Feature: MONTH(pickup_datetime)>,
 <Feature: WEEK(dropoff_datetime)>,
 <Feature: WEEK(pickup_datetime)>,
 <Feature: WEEKDAY(dropoff_datetime)>,
 <Feature: WEEKDAY(pickup_datetime)>,
 <Feature: pickup_neighborhoods.latitude>,
 <Feature: pickup_neighborhoods.longitude>,
 <Feature: dropoff_neighborhoods.latitude>,
 <Feature: dropoff_neighborhoods.longitude>,
 <Feature: pickup_neighborhoods.COUNT(trips)>,
 <Feature: pickup_neighborhoods.MAX(trips.passenger_count)>

In [27]:
feature_matrix = compute_features(features, cutoff_time)

Elapsed: 00:23 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Finishing computing...


In [28]:
preview(feature_matrix, 10)

Unnamed: 0_level_0,dropoff_neighborhoods.latitude,pickup_neighborhoods.MEAN(trips.trip_duration),WEEK(pickup_datetime),pickup_neighborhoods.SUM(trips.passenger_count),pickup_neighborhoods.MEAN(trips.trip_distance),pickup_neighborhoods.STD(trips.trip_duration),pickup_neighborhoods.MEDIAN(trips.passenger_count),HOUR(pickup_datetime),trip_duration,dropoff_neighborhoods.MAX(trips.trip_duration),...,dropoff_neighborhood = AD,dropoff_neighborhood = A,dropoff_neighborhood = AA,dropoff_neighborhood = D,dropoff_neighborhood = AR,dropoff_neighborhood = C,dropoff_neighborhood = O,dropoff_neighborhood = N,dropoff_neighborhood = AO,dropoff_neighborhood = AK
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56311,40.721435,740.870871,2,2283.0,2.978551,424.975281,1.0,0,645.0,3572.0,...,False,False,False,False,False,False,False,False,False,False
691284,40.721435,753.81368,18,34521.0,2.232347,475.463994,1.0,12,160.0,3603.0,...,False,False,False,False,False,False,False,False,False,False
691285,40.785005,681.405688,18,36299.0,2.062772,478.673953,1.0,12,295.0,3602.0,...,False,False,False,False,False,False,False,False,False,False
691286,40.757707,682.62444,18,31158.0,2.125305,444.617706,1.0,12,1573.0,3606.0,...,False,False,True,False,False,False,False,False,False,False
691288,40.761087,714.648716,18,43543.0,2.171776,475.990878,1.0,12,404.0,3580.0,...,False,False,False,False,False,False,False,False,False,False
691289,40.761492,818.141251,18,30913.0,2.509054,499.064139,1.0,12,1906.0,3606.0,...,False,True,False,False,False,False,False,False,False,False
691290,40.764723,637.726834,18,43212.0,1.830726,445.631332,1.0,12,156.0,3580.0,...,False,False,False,False,False,False,False,False,False,True
691291,40.77627,707.024093,18,32656.0,2.266902,472.566001,1.0,12,827.0,3604.0,...,False,False,False,False,False,False,False,False,False,False
691292,40.764723,749.696305,18,57862.0,2.274509,477.721891,1.0,12,883.0,3580.0,...,False,False,False,False,False,False,False,False,False,True
691293,40.766488,670.677993,18,39612.0,1.872252,478.578875,1.0,12,592.0,3587.0,...,False,False,False,False,True,False,False,False,False,False


In [None]:
##5.1 and 5.2

In [29]:
# separates the whole feature matrix into train data feature matrix,
# train data labels, and test data feature matrix 
X_train, y_train, X_test, y_test = utils.get_train_test_fm(feature_matrix,.75)
y_train = np.log(y_train+1)
y_test = np.log(y_test+1)

In [30]:
# note: this may take up to 30 minutes to run
model = GradientBoostingRegressor(verbose=True)
model.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.4925           10.01m
         2           0.4333            9.85m
         3           0.3843            9.68m
         4           0.3444            9.55m
         5           0.3117            9.48m
         6           0.2848            9.39m
         7           0.2620            9.27m
         8           0.2435            9.19m
         9           0.2282            9.09m
        10           0.2152            8.99m
        20           0.1585            7.98m
        30           0.1420            6.96m
        40           0.1332            5.93m
        50           0.1271            4.96m
        60           0.1238            3.96m
        70           0.1211            2.95m
        80           0.1191            1.97m
        90           0.1176           58.87s
       100           0.1163            0.00s


GradientBoostingRegressor(verbose=True)

In [31]:
model.score(X_test,y_test)

0.7781928490478007

In [32]:
y_pred = model.predict(X_test)
y_pred = np.exp(y_pred) - 1 # undo the log we took earlier
y_pred[5:]

array([ 555.94822628,  590.51563503, 1494.62671006, ..., 1073.50652609,
       1792.28002517,  737.46367124])