# Importing packages 

In [48]:
# Data wrangling 
import pandas as pd 

# Deep learning 
import tensorflow as tf
import keras 

# Array math 
import numpy as np

# Loading the memory profile extension
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


# Reading the data 

In [49]:
# Load the raw training set into memory
d = pd.read_csv('data/train.csv')

# Report the dimensions followed by rounded summary statistics
print(
    f"Shape of the data: {d.shape}",
    d.describe().round(2),
    sep="\n",
)

Shape of the data: (1458644, 11)
        vendor_id  passenger_count  pickup_longitude  pickup_latitude  \
count  1458644.00       1458644.00        1458644.00       1458644.00   
mean         1.53             1.66            -73.97            40.75   
std          0.50             1.31              0.07             0.03   
min          1.00             0.00           -121.93            34.36   
25%          1.00             1.00            -73.99            40.74   
50%          2.00             1.00            -73.98            40.75   
75%          2.00             2.00            -73.97            40.77   
max          2.00             9.00            -61.34            51.88   

       dropoff_longitude  dropoff_latitude  trip_duration  
count         1458644.00        1458644.00     1458644.00  
mean              -73.97             40.75         959.49  
std                 0.07              0.04        5237.43  
min              -121.93             32.18           1.00  
25%      

In [50]:
# Preview the loaded frame; the notebook renders only the head and tail rows
d

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.964630,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.782520,N,435
...,...,...,...,...,...,...,...,...,...,...,...
1458639,id2376096,2,2016-04-08 13:31:04,2016-04-08 13:44:02,4,-73.982201,40.745522,-73.994911,40.740170,N,778
1458640,id1049543,1,2016-01-10 07:35:15,2016-01-10 07:46:10,1,-74.000946,40.747379,-73.970184,40.796547,N,655
1458641,id2304944,2,2016-04-22 06:57:41,2016-04-22 07:10:25,1,-73.959129,40.768799,-74.004433,40.707371,N,764
1458642,id2714485,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,N,373


In [51]:
# Checking the memory usage
# NOTE: %memit reports the peak memory of the Python process while the
# statement runs and the increment that statement caused; it does not
# report the size of `d` itself.
%memit d

peak memory: 2818.54 MiB, increment: 0.00 MiB


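The data in memory uses ~873MB of RAM. Note that the `%memit` output above reports the peak memory of the whole Python process (and the increment caused by the measured statement), not the size of `d` itself; `d.memory_usage(deep=True).sum()` gives the DataFrame's own footprint.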

# Feature engineering 

## Date variables 

In [52]:
def create_date_vars(d):
    """
    Creates the datetime variables

    Parses ``pickup_datetime`` and derives calendar features from it. The
    following columns are written to ``d``:

    * ``pickup_datetime``  - converted to a datetime dtype
    * ``pickup_dayofweek`` - day of the week (Monday=0 ... Sunday=6)
    * ``pickup_hour``      - hour of the day (0-23)
    * ``pickup_dayofyear`` - ordinal day of the year (1-366)
    * ``pickup_hour_sin`` / ``pickup_hour_cos`` - cyclical encoding of the hour
    * ``pickup_dayofyear_sin`` / ``pickup_dayofyear_cos`` - cyclical encoding
      of the day of the year

    Parameters
    ----------
    d : pandas.DataFrame
        Frame holding a ``pickup_datetime`` column that ``pd.to_datetime``
        can parse.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; it is modified in place.
    """
    # Parsing the pickup timestamp once and reusing it for every feature
    pickup = pd.to_datetime(d['pickup_datetime'])
    d['pickup_datetime'] = pickup

    # Inferring the day of the week from pickup_datetime
    d['pickup_dayofweek'] = pickup.dt.dayofweek

    # Inferring the hour of the day from pickup_datetime
    d['pickup_hour'] = pickup.dt.hour

    # Creating a new variable for the day of the year
    d['pickup_dayofyear'] = pickup.dt.dayofyear

    # Cyclical encoding of the hour. The period is 24 (not 23): with 23,
    # hour 0 and hour 23 would land on exactly the same point of the circle.
    hour_angle = 2 * np.pi * d['pickup_hour'] / 24.0
    d['pickup_hour_sin'] = np.sin(hour_angle)
    d['pickup_hour_cos'] = np.cos(hour_angle)

    # Cyclical encoding of the day of the year. The period follows the actual
    # length of the year so that day 366 of a leap year does not collide
    # with day 1.
    days_in_year = np.where(pickup.dt.is_leap_year, 366.0, 365.0)
    day_angle = 2 * np.pi * d['pickup_dayofyear'] / days_in_year
    d['pickup_dayofyear_sin'] = np.sin(day_angle)
    d['pickup_dayofyear_cos'] = np.cos(day_angle)

    return d

## Dummy variables

The features that will be one-hot encoded: 

* store_and_fwd_flag
* vendor_id 
* pickup_dayofweek

In [53]:
# Defining the dummy var list 
# Categorical columns that get replaced by one-hot (dummy) columns
dummy_features = ['vendor_id', 'store_and_fwd_flag', 'pickup_dayofweek']

# Defining the function for dummy creation 
def create_dummy(df, dummy_var_list):
    """
    One-hot encodes the given columns and replaces them in the dataframe.

    Each column in ``dummy_var_list`` is expanded with ``pd.get_dummies``
    (first level dropped, column name used as prefix). The source columns
    are removed and the dummy columns are appended at the end, in the order
    of ``dummy_var_list``. The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``dummy_var_list``.
    dummy_var_list : list of str
        Names of the columns to encode.

    Returns
    -------
    df : pandas.DataFrame
        Frame without the source columns and with the dummy columns appended.
    added_features : list
        Names of the dummy columns that were created.
    """
    # Placeholders for the dummy blocks and their column names
    dummy_frames = []
    added_features = []
    for var in dummy_var_list:
        # The dtype is pinned: recent pandas versions default to bool, and
        # bool columns mixed with float columns turn `.values` into an
        # object array.
        dummy = pd.get_dummies(df[var], prefix=var, drop_first=True, dtype=np.uint8)

        # Adding the new features to list
        added_features.extend(dummy.columns)
        dummy_frames.append(dummy)

    # Nothing to encode: hand the frame back untouched
    if not dummy_frames:
        return df, added_features

    # Dropping the source columns and appending all dummy blocks in a single
    # concat, instead of re-copying the whole frame on every iteration
    df = pd.concat(
        [df.drop(columns=list(dummy_var_list))] + dummy_frames,
        axis=1,
    )

    # Returning the dataframe
    return df, added_features

## Distance of travel 

In [54]:
# Defining the function for distance calculation
def distance_calculation(df, radius_km=6373.0):
    """
    Calculates the distance between two points on the earth's surface.

    Uses the haversine formula on the pickup and dropoff coordinates and
    stores the result in a new ``distance`` column.

    The distance is in meters

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude`` columns, in degrees.
    radius_km : float, optional
        Sphere radius in kilometers used to scale the central angle.
        Defaults to 6373.0.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with ``distance`` added
        in place.
    """
    # Converting the coordinates from degrees to radians
    lat1 = np.radians(df['pickup_latitude'])
    lon1 = np.radians(df['pickup_longitude'])
    lat2 = np.radians(df['dropoff_latitude'])
    lon2 = np.radians(df['dropoff_longitude'])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine term; mathematically within [0, 1], but floating point
    # rounding can push it marginally outside, which would make
    # sqrt(1 - a) NaN, so it is clipped.
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    a = a.clip(lower=0.0, upper=1.0)

    # Central angle between the two points
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Saving the distance to the dataframe
    df['distance'] = radius_km * c * 1000 # Converting to meters
    return df

## Final feature list and the feature engineering pipeline


In [59]:
# Column the model is trained to predict
target = 'trip_duration'

# Numeric model inputs: trip distance, passenger count and the cyclical
# encodings of the pickup hour and day of the year
numeric_features = [
    'distance', 'passenger_count',
    'pickup_hour_sin', 'pickup_hour_cos',
    'pickup_dayofyear_sin', 'pickup_dayofyear_cos',
]

# Defining the ft engineering pipeline 
def ft_engineering_pipeline(
    df, 
    numeric_features, 
    dummy_features,
    target):
    """
    Applies the feature engineering pipeline to the data

    Runs, in order: ``create_date_vars``, ``create_dummy`` and
    ``distance_calculation``, then extracts the model inputs.

    Note that ``create_date_vars`` writes its columns into the frame it
    receives, so the ``df`` passed by the caller gains the date columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip data.
    numeric_features : list of str
        Numeric columns to use as model inputs (available after the
        feature engineering steps have run).
    dummy_features : list of str
        Columns to one-hot encode.
    target : str
        Name of the target column.

    Returns
    -------
    x : numpy.ndarray
        Float64 feature matrix, columns ordered as ``final_features``.
    y : numpy.ndarray
        Target vector.
    final_features : list
        ``numeric_features`` followed by the created dummy column names.
    """
    # Creating the date variables
    df = create_date_vars(df)

    # Creating the dummy variables
    df, new_features = create_dummy(df, dummy_features)

    # Appending the distance
    df = distance_calculation(df) 

    # Appending the new features to the numeric features
    final_features = list(numeric_features) + list(new_features)

    # Creating the x matrix; the dtype is forced because dummy columns may be
    # boolean, and bool mixed with float columns would otherwise produce an
    # object array
    x = df[final_features].to_numpy(dtype=np.float64)

    # Creating the y vector
    y = df[target].values

    # Returning the x and y matrices
    return x, y, final_features

# Creating the input for model 

In [60]:
# Run the full feature engineering pipeline on the loaded frame
x, y, features = ft_engineering_pipeline(
    df=d,
    numeric_features=numeric_features,
    dummy_features=dummy_features,
    target=target,
)

In [61]:
# Report the dimensions of the model inputs, then the ordered feature names
print(
    f"Shape of x: {x.shape} | Shape of y: {y.shape}",
    features,
    sep="\n",
)

Shape of x: (1458644, 14) | Shape of y: (1458644,)
['distance', 'passenger_count', 'pickup_hour_sin', 'pickup_hour_cos', 'pickup_dayofyear_sin', 'pickup_dayofyear_cos', 'vendor_id_2', 'store_and_fwd_flag_Y', 'pickup_dayofweek_1', 'pickup_dayofweek_2', 'pickup_dayofweek_3', 'pickup_dayofweek_4', 'pickup_dayofweek_5', 'pickup_dayofweek_6']
