In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

import DataImport
import ProcessTrips

trips = ProcessTrips.transformTrips(DataImport.trips)
zones = DataImport.zones

Original number of rows: 6122322
Number of rows after removing outliers: 6122322


In [2]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6122322 entries, 0 to 7667786
Data columns (total 27 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        int64         
 4   trip_distance          float64       
 5   RatecodeID             int64         
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  pickup_date            date

### Dataset preparation. <br>
- Get dummy variables
- Split dataset to test and train


In [None]:
# Get dummy variables for Ratecode and Weekday
trips = pd.get_dummies(trips, columns=['RatecodeID', 'pickup_weekday'])
trips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6122322 entries, 0 to 7667786
Data columns (total 39 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   VendorID                  int64         
 1   tpep_pickup_datetime      datetime64[ns]
 2   tpep_dropoff_datetime     datetime64[ns]
 3   passenger_count           int64         
 4   trip_distance             float64       
 5   store_and_fwd_flag        object        
 6   PULocationID              int64         
 7   DOLocationID              int64         
 8   payment_type              int64         
 9   fare_amount               float64       
 10  extra                     float64       
 11  mta_tax                   float64       
 12  tip_amount                float64       
 13  tolls_amount              float64       
 14  improvement_surcharge     float64       
 15  total_amount              float64       
 16  congestion_surcharge      float64       
 17  pickup_date  

Split data to train and test 

In [8]:
rng = np.random.default_rng(123)

dt_index = trips.index
smpl_index = rng.choice(dt_index, size=int(np.floor(0.8 * len(dt_index))), replace=False)

dt_train = trips.loc[smpl_index].reset_index(drop=True)
dt_test = trips.drop(smpl_index)  # Drop selected indices to get the test set

print(len(dt_train))
print(len(dt_test))

4897857
1224465


In [5]:
# import scipy
# # For plot size options, see: plotnine.options.figure_size = (., .)
# import matplotlib.pyplot as plt
# # plt.figure(figsize = (., .))
# import seaborn as sns

# #Note: before this I dropped NA values in dataset transformations.
# def reg_coef(x, y, label=None, color=None, **kwargs):
#     # A modified version of https://stackoverflow.com/a/63433499
#     ax = plt.gca()
#     r,p = scipy.stats.pearsonr(x, y)
#     val = 'r = {:.3f}'.format(r)
#     if p <= 0.001:
#         val = val + "***"  
#     elif p <= 0.01:
#         val = val + "**"  
#     elif p <= 0.05:
#         val = val + "*"    
#     ax.annotate(val, xy=(0.5, 0.5), xycoords='axes fraction', ha='center')
#     ax.set_axis_off()
# g = sns.PairGrid(dt_train.iloc[:, [16, 4, 23, 22, 10, 11, 12, 13, 14, 15, 25, 26]], diag_sharey = False)
# tmp_plt = g.map_diag(sns.histplot)
# tmp_plt = g.map_lower(sns.scatterplot)
# tmp_plt = g.map_upper(reg_coef)
# plt.show()

In [13]:
mdl = smf.ols(formula = "total_amount ~ trip_distance + pickup_hour + trip_duration + fare_amount + \
                            extra + mta_tax + tip_amount +  improvement_surcharge + \
                             + PULocationID_encoded + DOLocationID_encoded +\
                            RatecodeID_1 + RatecodeID_2 + RatecodeID_3 + RatecodeID_4 + RatecodeID_5 + RatecodeID_6 + RatecodeID_99 +\
                            pickup_weekday_Monday + pickup_weekday_Tuesday + pickup_weekday_Wednesday +\
                            pickup_weekday_Thursday + pickup_weekday_Saturday", data = dt_train)
mdl_summary = mdl.fit()
print(mdl_summary.summary().tables[1])



                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                            0.5393      0.010     54.845      0.000       0.520       0.559
RatecodeID_1[T.True]                 0.0765      0.013      5.804      0.000       0.051       0.102
RatecodeID_2[T.True]                 0.0714      0.041      1.753      0.080      -0.008       0.151
RatecodeID_3[T.True]                 0.0760      0.078      0.972      0.331      -0.077       0.229
RatecodeID_4[T.True]                 0.0711      0.014      5.020      0.000       0.043       0.099
RatecodeID_5[T.True]                 0.0712      0.013      5.382      0.000       0.045       0.097
RatecodeID_6[T.True]                 0.0705      0.019      3.624      0.000       0.032       0.109
RatecodeID_99[T.True]                0.1027      0.016      6.623      0.000       0.072   

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["variable"] = mdl.exog_names
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(mdl.exog, i) for i in range(len(vif_data["variable"]))]
# drop intercept:
vif_data = vif_data[vif_data['variable'] != 'Intercept']
print(vif_data.T)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


                            1                     2                     3   \
variable  RatecodeID_1[T.True]  RatecodeID_2[T.True]  RatecodeID_3[T.True]   
VIF                        inf                   inf                   inf   

                            4                     5                     6   \
variable  RatecodeID_4[T.True]  RatecodeID_5[T.True]  RatecodeID_6[T.True]   
VIF                        inf                   inf                   inf   

                             7                              8   \
variable  RatecodeID_99[T.True]  pickup_weekday_Monday[T.True]   
VIF                         inf                       1.295708   

                                      9                                 10  \
variable  pickup_weekday_Tuesday[T.True]  pickup_weekday_Wednesday[T.True]   
VIF                             1.373306                          1.394865   

          ...             13           14             15           16  \
variable  ...  trip_dis