# Import modules and read data

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
jan_data = pd.read_parquet('data/yellow_tripdata_2022-01.parquet')
feb_data = pd.read_parquet('data/yellow_tripdata_2022-02.parquet')

Join the data of both months in one dataframe

In [3]:
taxi_data = pd.concat([jan_data, feb_data], axis=0)

# Analyze and prepare data

In [4]:
taxi_data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [5]:
print(taxi_data.shape)

(5443362, 19)


In [6]:
print(taxi_data.dtypes)

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object


Compute column of trip duration

In [7]:
taxi_data['duration'] = taxi_data.tpep_dropoff_datetime - taxi_data.tpep_pickup_datetime
taxi_data['duration'] = taxi_data.duration.astype('timedelta64[m]')

In [8]:
print(taxi_data.describe())

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  5.443362e+06     5.270121e+06   5.443362e+06  5.270121e+06   
mean   1.709109e+00     1.390917e+00   5.635142e+00  1.419827e+00   
std    5.017359e-01     9.836000e-01   6.015312e+02  5.972370e+00   
min    1.000000e+00     0.000000e+00   0.000000e+00  1.000000e+00   
25%    1.000000e+00     1.000000e+00   1.060000e+00  1.000000e+00   
50%    2.000000e+00     1.000000e+00   1.770000e+00  1.000000e+00   
75%    2.000000e+00     1.000000e+00   3.170000e+00  1.000000e+00   
max    6.000000e+00     9.000000e+00   3.487985e+05  9.900000e+01   

       PULocationID  DOLocationID  payment_type   fare_amount         extra  \
count  5.443362e+06  5.443362e+06  5.443362e+06  5.443362e+06  5.443362e+06   
mean   1.657783e+02  1.635005e+02  1.183061e+00  1.304498e+01  1.012510e+00   
std    6.554616e+01  7.058795e+01  4.975932e-01  1.723407e+02  1.235380e+00   
min    1.000000e+00  1.000000e+00  0.000000e+00 -6.000000e+02 

Create features from the datetime columns of pickup and dropoffs 

In [9]:
taxi_data['dropoff_hour'] = taxi_data.tpep_dropoff_datetime.dt.hour
taxi_data['pickup_hour'] = taxi_data.tpep_pickup_datetime.dt.hour

Remove outliers by defining them as trips with duration between 1 and 60 minutes

In [10]:
taxi_data_filtered = taxi_data[taxi_data.duration.between(1,60)].copy(deep=True)
print(f'{taxi_data_filtered.shape[0] / taxi_data.shape[0]} rows were kept')

0.9818786624883665 rows were kept


# Preprocessing steps

Fit the encode on the full data to ensure no ID's are missed

In [11]:
taxi_data_enc = taxi_data_filtered[['PULocationID','DOLocationID']]
enc = OneHotEncoder(drop='if_binary')
enc.fit(taxi_data_enc)

Split the data into train and validation according to the month

In [12]:
X_train = taxi_data_filtered[taxi_data_filtered.tpep_pickup_datetime.dt.month == 1]
X_val = taxi_data_filtered[taxi_data_filtered.tpep_pickup_datetime.dt.month == 2]
Y_train = X_train.duration.copy(deep=True)
Y_val = X_val.duration.copy(deep=True)

Select the only two features to train the model and encode them

In [13]:
X_train = enc.transform(X_train[['PULocationID','DOLocationID']])
X_val = enc.transform(X_val[['PULocationID','DOLocationID']])

Check the number of categories encoded

In [14]:
print(X_val.shape)

(2921348, 519)


# Train Linear Regression

Fit the model to the train data and predict

In [15]:
reg = LinearRegression().fit(X_train, Y_train)

Test the accuracy of the model in the training data

In [16]:
Y_train_pred = reg.predict(X_train)
mean_squared_error(Y_train, Y_train_pred, squared=False)

7.016769818834364

Test the accuracy of the model of the validation data

In [17]:
Y_val_pred = reg.predict(X_val)
mean_squared_error(Y_val, Y_val_pred, squared=False)

7.827335280339048

The score in validation is slightly higher than in trainning meaning we are in overfitting