In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import statsmodels.api as smg
import seaborn as sns
import tensorboard

In [2]:
### Load stratified data
strat_splits = []
for i in range(10):
    split = []
    for j in range(2):
        split.append(pd.read_pickle(f'pickled-data/df_{i}-{j}.pkl'))
    strat_splits.append(split)

In [3]:
strat_train_set, strat_test_set = strat_splits[0] # train with 100000 for reasonable amount of training
strat_train_set = strat_train_set[:100000]
strat_test_set = strat_test_set[:20000]

In [4]:
# trainining data
trips_train = strat_train_set.drop(columns=['trip_duration'])          # predictors
trips_train_label = strat_train_set["trip_duration"]            # targets
trips_train_label = trips_train_label/pd.Timedelta(minutes=1)

In [5]:
# validating data
trips_test = strat_test_set.drop(columns=['trip_duration'])          # predictors
trips_test_label = strat_test_set["trip_duration"]           # targets
trips_test_label = trips_test_label/pd.Timedelta(minutes=1)

#### Preprocess the features

In [6]:
def extract_features(trips):
    trips['pickup_weekday'] = trips['tpep_pickup_datetime'].dt.weekday
    trips['pickup_hour'] = trips['tpep_pickup_datetime'].dt.hour
    trips['pickup_minute'] = trips['tpep_pickup_datetime'].dt.minute
    return trips

In [7]:
# a utility function to drop features
def feature_selection(dataframe, attributes=[]): 
    return dataframe.drop(columns=attributes)

In [8]:
def type_casting(dataframe, attribute, type):
    dataframe[f"{attribute}"] = dataframe[[f"{attribute}"]].astype(f"{type}")
    return dataframe

In [9]:
# Extract features from datetime columns of pickup
trips_train = extract_features(trips_train)
trips_test = extract_features(trips_test)

In [10]:
# drop the tpep_pickup_datetime columns and date columns (used for joining)
drop_dates = ["tpep_pickup_datetime", "date"]
trips_train = feature_selection(trips_train, drop_dates)
trips_test = feature_selection(trips_test, drop_dates)

# drop irrelevant data columns
irrelevant_attr = ["payment_type", "VendorID", "RatecodeID"]
trips_train = feature_selection(trips_train, irrelevant_attr)
trips_test = feature_selection(trips_test, irrelevant_attr)

# drop columns with significant missing values i.e., almost equal to the dataset size
significant_nulls = ["wpgt", "snow", "prcp", "tsun", "wdir", "airport_fee"]
trips_train = feature_selection(trips_train, significant_nulls)
trips_test = feature_selection(trips_test, significant_nulls)


In [11]:
# cast dates to a numeral
trips_train = type_casting(trips_train, "tpep_dropoff_datetime", "int64")
trips_test = type_casting(trips_test, "tpep_dropoff_datetime", "int64")

In [12]:
trips_train.head(2)

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,store_and_fwd_flag,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,...,total_amount,congestion_surcharge,tavg,tmin,tmax,wspd,pres,pickup_weekday,pickup_hour,pickup_minute
5629127,1580305394000000,1.0,1.23,N,238,166,7.0,0.0,0.5,0.0,...,7.8,0.0,3.5,1.1,7.2,7.8,1018.2,2,13,36
3950490,1579435722000000,2.0,1.01,N,141,263,5.5,0.0,0.5,1.32,...,10.12,2.5,4.0,0.6,7.2,10.2,1008.9,6,12,4


In [13]:
trips_test.head(2)

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,store_and_fwd_flag,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,...,total_amount,congestion_surcharge,tavg,tmin,tmax,wspd,pres,pickup_weekday,pickup_hour,pickup_minute
4827251,1579809623000000,1.0,5.5,N,234,24,22.0,3.5,0.5,5.25,...,31.55,2.5,3.5,0.0,7.2,7.5,1029.4,3,19,32
3890488,1579392861000000,1.0,0.7,N,230,164,4.0,0.5,0.5,1.95,...,9.75,2.5,4.0,0.6,7.2,10.2,1008.9,6,0,12


In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [15]:
# numerical transformer
num_attributes = list(trips_train.select_dtypes(np.number).columns)
num_pipeline = make_pipeline(SimpleImputer(strategy="mean"),
                              StandardScaler())

# categorical transformer
cat_attributes = ['store_and_fwd_flag']
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), 
                                       OneHotEncoder(handle_unknown="ignore"))

In [16]:
# combined Transformation pipelines
preprocessing = ColumnTransformer([
        ("num", num_pipeline, num_attributes),
        ("cat", cat_pipeline, cat_attributes),
    ])

### Regression MLP No hidden layers

In [17]:
import tensorflow as tf

##### Standardize the inputs

In [18]:
trips_train_prepared = preprocessing.fit_transform(trips_train)
df_trips_train_prepared = pd.DataFrame(trips_train_prepared,
                                       columns=preprocessing.get_feature_names_out(),
                                       index=trips_train.index)
df_trips_train_prepared.head(2)


Unnamed: 0,num__tpep_dropoff_datetime,num__passenger_count,num__trip_distance,num__PULocationID,num__DOLocationID,num__fare_amount,num__extra,num__mta_tax,num__tip_amount,num__tolls_amount,...,num__tmin,num__tmax,num__wspd,num__pres,num__pickup_weekday,num__pickup_hour,num__pickup_minute,cat__store_and_fwd_flag_N,cat__store_and_fwd_flag_Y,cat__store_and_fwd_flag_None
5629127,1.403842,-0.447559,-0.438733,1.12107,0.056583,-0.484147,-0.885659,0.105365,-0.851985,-0.21839,...,-0.089092,-0.077886,-0.816167,-0.333511,-0.534378,-0.153186,0.366407,1.0,0.0,0.0
3950490,0.265157,0.425807,-0.496661,-0.355428,1.44184,-0.612103,-0.885659,0.105365,-0.339326,-0.21839,...,-0.233086,-0.077886,-0.117681,-1.388189,1.62466,-0.323082,-1.477519,1.0,0.0,0.0


In [19]:
trips_test_prepared = preprocessing.fit_transform(trips_test)
df_trips_test_prepared = pd.DataFrame(trips_test_prepared,
                                       columns=preprocessing.get_feature_names_out(),
                                       index=trips_test.index)
df_trips_test_prepared.head(2)

Unnamed: 0,num__tpep_dropoff_datetime,num__passenger_count,num__trip_distance,num__PULocationID,num__DOLocationID,num__fare_amount,num__extra,num__mta_tax,num__tip_amount,num__tolls_amount,...,num__tmin,num__tmax,num__wspd,num__pres,num__pickup_weekday,num__pickup_hour,num__pickup_minute,cat__store_and_fwd_flag_N,cat__store_and_fwd_flag_Y,cat__store_and_fwd_flag_None
4827251,0.761239,-0.448046,0.685706,1.052892,-1.977166,0.778787,1.899921,0.099834,1.150466,-0.218816,...,-0.41694,-0.088592,-0.916521,0.947635,0.012142,0.860072,0.120948,1.0,0.0,0.0
3890488,0.21285,-0.448046,-0.569841,0.991763,0.023873,-0.716101,-0.490388,0.099834,-0.07952,-0.218816,...,-0.243857,-0.088592,-0.125247,-1.383785,1.631095,-2.346412,-1.033854,1.0,0.0,0.0


In [20]:
df_trips_train_prepared.shape

(100000, 24)

In [21]:
# Initialize the model
tf.random.set_seed(42) # set a tf random seed to make the result reproducible: i.e, the random weights of hidden layers and output layer will be the same everywhere

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=(24,)),
    tf.keras.layers.Dense(1) # output layer
])

# Adam optmizer with lr=0.03
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(loss="mae", optimizer=optimizer, metrics=["RootMeanSquaredError"])


In [22]:
model.summary()

In [23]:
df_trips_train_prepared.shape

(100000, 24)

In [24]:
df_trips_test_prepared.shape

(20000, 24)

#### Add callbacks

In [25]:
# stop training if no progress on the validation set and roll back to the best model after training
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10,
                                                    restore_best_weights=True)

In [26]:
# create log directory for Tensor board (dont profile for each batch to save memory)
from pathlib import Path
from time import strftime

def get_run_logdir(root_logdir="my_logs"):
    return Path(root_logdir) / strftime("run_%Y_%m_%d_%H_%M_%S")

run_logdir = get_run_logdir()

tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir,
                                                profile_batch=(100, 200))

In [27]:

# %load_ext tensorboard # load the tensor board 
# %tensorboard --logdir=./my_logs # start the tensorboard server for my_logs directory

In [28]:
model.fit(df_trips_train_prepared, 
          trips_train_label,
          epochs=100,
          batch_size=32,
          validation_data=(df_trips_test_prepared, trips_test_label),
          callbacks=[early_stopping_cb, tensorboard_cb])

Epoch 1/100
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - RootMeanSquaredError: 62.5874 - loss: 14.1241 - val_RootMeanSquaredError: 68.3179 - val_loss: 10.6892
Epoch 2/100
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - RootMeanSquaredError: 61.1087 - loss: 8.9057 - val_RootMeanSquaredError: 67.5496 - val_loss: 7.2884
Epoch 3/100
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - RootMeanSquaredError: 60.5000 - loss: 6.3962 - val_RootMeanSquaredError: 67.4263 - val_loss: 6.7355
Epoch 4/100
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - RootMeanSquaredError: 60.4149 - loss: 6.0093 - val_RootMeanSquaredError: 67.4207 - val_loss: 6.5952
Epoch 5/100
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - RootMeanSquaredError: 60.4112 - loss: 5.8777 - val_RootMeanSquaredError: 67.4275 - val_loss: 6.5140
Epoch 6/100
[1m3125/3125[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x1e97823b620>

#### Save the model

In [29]:
# saving the model
model.save("models/04-03-MLP-with-no-hidden-layers-ADAM-MAE.keras",)

In [30]:
model = tf.keras.models.load_model("models/04-03-MLP-with-no-hidden-layers-ADAM-MAE.keras")

#### Evaluate model

In [None]:
mae_test, rmse_test = model.evaluate(df_trips_test_prepared, trips_test_label)

In [None]:
# Mean absolute error
mae_test

In [None]:
# Root mean square error
rmse_test

In [None]:
trips_new = df_trips_test_prepared[:5] # predict the first 5
trips_pred = model.predict(trips_new)

In [None]:
print("Predicted",trips_pred)
print("Actual",trips_test_label[:5])