In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import sklearn
import tensorflow as tf

from data_utils import filter_outliers


In [2]:
os.chdir(r'D:\Projects\MobilityForecast\Repo\mobilityforecast\data')
df = pd.read_parquet('yellow_tripdata_2022-01.parquet',engine='pyarrow')

In [3]:
df = pd.read_parquet('yellow_tripdata_2022-01.parquet',engine='pyarrow')
print(df.columns) # gives the names of the features in the dataset
df.head(5) # shows the first 5 data samples

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [4]:
df = df.drop(columns=['VendorID' , 'RatecodeID', 'store_and_fwd_flag', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge','total_amount', 'congestion_surcharge', 'airport_fee' ]) # dropping the store and forward flag, VendorID is the company that provided the record(this does not add any value to our study)
df = df.dropna(axis=0) # RatecodeID and payment_type are categorical features affecting the price of the trip, we discard this for the moment as this does not add value to our study. 
df.head(5)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID
0,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,142,236
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,236,42
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,166,166
3,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,114,68
4,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,68,163


In [5]:
df['pickup_day'] = df['tpep_pickup_datetime'].dt.day
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_min'] = df['tpep_pickup_datetime'].dt.minute

df['dropoff_day'] = df['tpep_dropoff_datetime'].dt.day
df['dropoff_hour'] = df['tpep_dropoff_datetime'].dt.hour
df['dropoff_min'] = df['tpep_dropoff_datetime'].dt.minute

df = df.drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime'])
df.head(5)

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,pickup_day,pickup_hour,pickup_min,dropoff_day,dropoff_hour,dropoff_min
0,2.0,3.8,142,236,1,0,35,1,0,53
1,1.0,2.1,236,42,1,0,33,1,0,42
2,1.0,0.97,166,166,1,0,53,1,1,2
3,1.0,1.09,114,68,1,0,25,1,0,35
4,1.0,4.3,68,163,1,0,36,1,1,14


In [6]:
# converting cyclic features to float values between 0s and 1s

df['pickup_hour_sin'] = np.sin(2 * np.pi * df['pickup_hour']/23.0)
df['pickup_hour_cos'] = np.cos(2 * np.pi * df['pickup_hour']/23.0)

df['dropoff_hour_sin'] = np.sin(2 * np.pi * df['dropoff_hour']/23.0)
df['dropoff_hour_cos'] = np.cos(2 * np.pi * df['dropoff_hour']/23.0)

df['pickup_min_sin'] = np.sin(2 * np.pi * df['pickup_min']/59.0)
df['pickup_min_cos'] = np.cos(2 * np.pi * df['pickup_min']/59.0)

df['dropoff_min_sin'] = np.sin(2 * np.pi * df['dropoff_hour']/59.0)
df['dropoff_min_cos'] = np.cos(2 * np.pi * df['dropoff_hour']/59.0)

df = df.drop(columns=['pickup_hour', 'pickup_min', 'dropoff_hour', 'dropoff_min'])
# Normalising the trip_distance feature
df['trip_distance'] = (df['trip_distance'] - df['trip_distance'].min()) / (df['trip_distance'].max() - df['trip_distance'].min()) 
df.head(5)

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,pickup_day,dropoff_day,pickup_hour_sin,pickup_hour_cos,dropoff_hour_sin,dropoff_hour_cos,pickup_min_sin,pickup_min_cos,dropoff_min_sin,dropoff_min_cos
0,2.0,0.005837,142,236,1,1,0.0,1.0,0.0,1.0,-0.5528,-0.833314,0.0,1.0
1,1.0,0.003226,236,42,1,1,0.0,1.0,0.0,1.0,-0.364161,-0.931336,0.0,1.0
2,1.0,0.00149,166,166,1,1,0.0,1.0,0.269797,0.962917,-0.596367,0.802712,0.106293,0.994335
3,1.0,0.001674,114,68,1,1,0.0,1.0,0.0,1.0,0.461093,-0.887352,0.0,1.0
4,1.0,0.006605,68,163,1,1,0.0,1.0,0.269797,0.962917,-0.638244,-0.769834,0.106293,0.994335


In [7]:
# extract the target variables and training data
target = df.pop('passenger_count')
labels = tf.one_hot(target, depth=target.max())

In [8]:
# passenger_count_cat = pd.get_dummies(df['passenger_count'], prefix='cat')
PULocationID_cat = pd.get_dummies(df['PULocationID'], prefix='cat')
DOLocationID_cat = pd.get_dummies(df['DOLocationID'], prefix='cat')
pickup_day_cat = pd.get_dummies(df['pickup_day'], prefix='cat')
dropoff_day_cat = pd.get_dummies(df['dropoff_day'], prefix='cat')

df = pd.concat([df, PULocationID_cat, DOLocationID_cat, pickup_day_cat,dropoff_day_cat], axis=1)

df = df.drop(columns=['PULocationID', 'DOLocationID', 'pickup_day', 'dropoff_day'])
 
df.head(5)

Unnamed: 0,trip_distance,pickup_hour_sin,pickup_hour_cos,dropoff_hour_sin,dropoff_hour_cos,pickup_min_sin,pickup_min_cos,dropoff_min_sin,dropoff_min_cos,cat_1,...,cat_22,cat_23,cat_24,cat_25,cat_26,cat_27,cat_28,cat_29,cat_30,cat_31
0,0.005837,0.0,1.0,0.0,1.0,-0.5528,-0.833314,0.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.003226,0.0,1.0,0.0,1.0,-0.364161,-0.931336,0.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.00149,0.0,1.0,0.269797,0.962917,-0.596367,0.802712,0.106293,0.994335,0,...,0,0,0,0,0,0,0,0,0,0
3,0.001674,0.0,1.0,0.0,1.0,0.461093,-0.887352,0.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.006605,0.0,1.0,0.269797,0.962917,-0.638244,-0.769834,0.106293,0.994335,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
data = df.to_numpy()
data[0:500000][:].shape

(500000, 588)

In [10]:
labels[0:500000][:].shape

TensorShape([500000, 9])

In [11]:
BATCH_SIZE = 16
dataset = tf.data.Dataset.from_tensor_slices((data, labels)).batch(BATCH_SIZE)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [None]:
## Do the one-hot encoding in Tensorflow
## split it to train-val-test and train
## Read about imbalanced datasets in tensorflow

In [None]:
## next steps are to create a tensorflow dataset
## split it to train-val-test and train

In [None]:
# https://towardsdatascience.com/ml-approaches-for-time-series-4d44722e48fe
# https://towardsdatascience.com/multivariate-time-series-forecasting-with-deep-learning-3e7b3e2d2bcf
# https://stats.stackexchange.com/questions/311494/best-practice-for-encoding-datetime-in-machine-learning
# https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
# https://stats.stackexchange.com/questions/126230/optimal-construction-of-day-feature-in-neural-networks
# https://datascience.stackexchange.com/questions/5990/what-is-a-good-way-to-transform-cyclic-ordinal-attributes
# https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96


# https://stackoverflow.com/questions/46428870/how-to-handle-date-variable-in-machine-learning-data-pre-processing
# http://appliedpredictivemodeling.com/blog/2015/7/28/feature-engineering-versus-feature-extraction
# 
