In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import statsmodels.api as smg
import seaborn as sns 

### Load data

In [2]:
# Load pickled data
strat_splits = []
for i in range(10):
    split = []
    for j in range(2):
        split.append(pd.read_pickle(f'pickled-data/df_{i}-{j}.pkl'))
    strat_splits.append(split)

In [None]:
strat_train_set, strat_test_set = strat_splits[0]
strat_train_set.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,trip_duration
5629127,2,2020-01-29 13:36:22,2020-01-29 13:43:14,1.0,1.23,1.0,N,238,166,2,...,1.1,7.2,,,,7.8,,1018.2,,0 days 00:06:52
3950490,2,2020-01-19 12:04:41,2020-01-19 12:08:42,2.0,1.01,1.0,N,141,263,1,...,0.6,7.2,,,,10.2,,1008.9,,0 days 00:04:01


In [4]:
trips = strat_train_set.drop(columns=['trip_duration'])          # predictors
trips_label = strat_train_set["trip_duration"].copy()            # targets
trips_label = trips_label/pd.Timedelta(minutes=1)

In [5]:
trips.head(1)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
5629127,2,2020-01-29 13:36:22,2020-01-29 13:43:14,1.0,1.23,1.0,N,238,166,2,...,3.5,1.1,7.2,,,,7.8,,1018.2,


In [6]:
# Extract features from datetime columns of pickup
trips['pickup_weekday'] = trips['tpep_pickup_datetime'].dt.weekday
trips['pickup_hour'] = trips['tpep_pickup_datetime'].dt.hour
trips['pickup_minute'] = trips['tpep_pickup_datetime'].dt.minute

In [7]:
# a utility function to drop features
def feature_selection(dataframe, attributes=[]): 
    return dataframe.drop(columns=attributes)

In [8]:
def type_casting(dataframe, attribute, type):
    dataframe[f"{attribute}"] = dataframe[[f"{attribute}"]].astype(f"{type}")
    return dataframe

In [9]:
# drop the tpep_pickup_datetime columns and date columns (used for joining)
drop_dates = ["tpep_pickup_datetime", "date"]
trips = feature_selection(trips, drop_dates)

In [10]:
# drop irrelevant data columns
irrelevant_attr = ["payment_type", "VendorID", "RatecodeID"]
trips = feature_selection(trips, irrelevant_attr)

In [11]:
# drop columns with significant missing values i.e., almost equal to the dataset size
significant_nulls = ["wpgt", "snow", "prcp", "tsun", "wdir", "airport_fee"]
trips = feature_selection(trips, significant_nulls)

In [12]:
# cast dates to a numeral
trips = type_casting(trips, "tpep_dropoff_datetime", "int64")

In [13]:
trips.head(2)

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,store_and_fwd_flag,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,...,total_amount,congestion_surcharge,tavg,tmin,tmax,wspd,pres,pickup_weekday,pickup_hour,pickup_minute
5629127,1580305394000000,1.0,1.23,N,238,166,7.0,0.0,0.5,0.0,...,7.8,0.0,3.5,1.1,7.2,7.8,1018.2,2,13,36
3950490,1579435722000000,2.0,1.01,N,141,263,5.5,0.0,0.5,1.32,...,10.12,2.5,4.0,0.6,7.2,10.2,1008.9,6,12,4


### Data processing pipeline

##### Checkin all data is numerical or categorical

In [14]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5123836 entries, 5629127 to 4623401
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   tpep_dropoff_datetime  int64  
 1   passenger_count        float64
 2   trip_distance          float64
 3   store_and_fwd_flag     object 
 4   PULocationID           int64  
 5   DOLocationID           int64  
 6   fare_amount            float64
 7   extra                  float64
 8   mta_tax                float64
 9   tip_amount             float64
 10  tolls_amount           float64
 11  improvement_surcharge  float64
 12  total_amount           float64
 13  congestion_surcharge   float64
 14  tavg                   float64
 15  tmin                   float64
 16  tmax                   float64
 17  wspd                   float64
 18  pres                   float64
 19  pickup_weekday         int32  
 20  pickup_hour            int32  
 21  pickup_minute          int32  
dtypes: float64(15), i

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
# numerical transformer
num_attributes = list(trips.select_dtypes(np.number).columns)
num_pipeline = make_pipeline(SimpleImputer(strategy="mean"),
                              StandardScaler())

# categorical transformer
cat_attributes = ['store_and_fwd_flag']
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), 
                                       OneHotEncoder(handle_unknown="ignore"))

In [17]:
# combined Transformation pipelines
preprocessing = ColumnTransformer([
        ("num", num_pipeline, num_attributes),
        ("cat", cat_pipeline, cat_attributes),
    ])

In [18]:
trips.head(2)

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,store_and_fwd_flag,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,...,total_amount,congestion_surcharge,tavg,tmin,tmax,wspd,pres,pickup_weekday,pickup_hour,pickup_minute
5629127,1580305394000000,1.0,1.23,N,238,166,7.0,0.0,0.5,0.0,...,7.8,0.0,3.5,1.1,7.2,7.8,1018.2,2,13,36
3950490,1579435722000000,2.0,1.01,N,141,263,5.5,0.0,0.5,1.32,...,10.12,2.5,4.0,0.6,7.2,10.2,1008.9,6,12,4


In [19]:
trips_prepared = preprocessing.fit_transform(trips)

In [20]:
df_trips_prepared = pd.DataFrame(
                                trips_prepared, 
                                columns=preprocessing.get_feature_names_out(),
                                index=trips.index
                            )
df_trips_prepared.head()

Unnamed: 0,num__tpep_dropoff_datetime,num__passenger_count,num__trip_distance,num__PULocationID,num__DOLocationID,num__fare_amount,num__extra,num__mta_tax,num__tip_amount,num__tolls_amount,...,num__tmin,num__tmax,num__wspd,num__pres,num__pickup_weekday,num__pickup_hour,num__pickup_minute,cat__store_and_fwd_flag_N,cat__store_and_fwd_flag_Y,cat__store_and_fwd_flag_None
5629127,1.404475,-0.449863,-0.018388,1.118062,0.048046,-0.468797,-0.88498,0.103804,-0.789652,-0.202777,...,-0.089458,-0.078207,-0.816922,-0.334115,-0.533904,-0.151863,0.368525,1.0,0.0,0.0
3950490,0.265921,0.422977,-0.020754,-0.362223,1.435722,-0.592265,-0.88498,0.103804,-0.313534,-0.202777,...,-0.233509,-0.078207,-0.120065,-1.38886,1.627584,-0.321797,-1.476921,1.0,0.0,0.0
3215978,-0.128593,-0.449863,-0.015591,-0.05701,-1.611443,-0.468797,-0.488333,0.103804,-0.789652,-0.202777,...,0.227455,0.752142,-0.003922,-0.685697,0.006468,-2.360998,-0.669538,1.0,0.0,0.0
2139620,-0.700297,-0.449863,0.020017,0.995977,-1.2681,0.354321,1.4949,0.103804,0.707233,-0.202777,...,-0.089458,1.256283,1.128469,1.514523,0.54684,1.377538,1.118237,1.0,0.0,0.0
69250,-1.780301,3.914336,-0.019141,-0.011228,-1.039205,-0.221862,-0.88498,0.103804,-0.789652,-0.202777,...,0.083404,-0.730625,1.941468,-1.468249,-0.533904,-0.831597,0.887557,1.0,0.0,0.0


#### Check there are no null values

In [21]:
df_trips_prepared.isnull().sum()

num__tpep_dropoff_datetime      0
num__passenger_count            0
num__trip_distance              0
num__PULocationID               0
num__DOLocationID               0
num__fare_amount                0
num__extra                      0
num__mta_tax                    0
num__tip_amount                 0
num__tolls_amount               0
num__improvement_surcharge      0
num__total_amount               0
num__congestion_surcharge       0
num__tavg                       0
num__tmin                       0
num__tmax                       0
num__wspd                       0
num__pres                       0
num__pickup_weekday             0
num__pickup_hour                0
num__pickup_minute              0
cat__store_and_fwd_flag_N       0
cat__store_and_fwd_flag_Y       0
cat__store_and_fwd_flag_None    0
dtype: int64

### Regression analysis

#### Linear regression

In [25]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(trips, trips_label)

#### Trying out the linearRegression model

In [26]:
trips_prediction = lin_reg.predict(trips)
print(trips_prediction[:5].round(-2)) # try out on the training set
print(trips_label.iloc[:5].values)

[0. 0. 0. 0. 0.]
[ 6.86666667  4.01666667  7.65       18.13333333 13.4       ]


In [27]:
# check the performance 
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(trips_label, trips_prediction)
lin_mse

3890.262396817238

In [42]:
trips_label.max()

8525.116666666667

In [43]:
trips_label.min()

-2770.366666666667

In [28]:
from sklearn.metrics import root_mean_squared_error
lin_rmse = root_mean_squared_error(trips_label, trips_prediction)
lin_rmse

62.37196803706965

#### Decision tree regressor

In [44]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(trips, trips_label)

In [45]:
# evaluating the model
trips_prediction = tree_reg.predict(trips)
tree_mse = mean_squared_error(trips_label, trips_prediction)
tree_mse

3.9436240474166585e-34

In [46]:
# evaluating the model
trips_prediction = tree_reg.predict(trips)
tree_rmse = root_mean_squared_error(trips_label, trips_prediction)
tree_rmse

1.985855998660693e-17

### Evaluation using cross-validation

In [47]:
from sklearn.model_selection import cross_val_score
tree_rmses = -cross_val_score(tree_reg, trips, trips_label,
                    scoring="neg_root_mean_squared_error", cv=10)

In [48]:
pd.Series(tree_rmses).describe()

count    10.000000
mean     80.462282
std       1.374829
min      78.581266
25%      79.695095
50%      80.029075
75%      81.516521
max      82.899136
dtype: float64