In [1]:
import pandas as pd

In [2]:
! pip install pyarrow



In [3]:
# January 2023 trips become the training frame, February 2023 trips the validation frame.
df_train, df_val = map(
    pd.read_parquet,
    ('yellow_tripdata_2023-01.parquet', 'yellow_tripdata_2023-02.parquet'),
)



In [4]:
# Show the dtype of every column of the training frame as loaded (cell output).
df_train.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [5]:
def preprocess(df, min_minutes=1, max_minutes=60):
    """Add a trip ``duration`` column (in minutes) and drop out-of-range trips.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records with datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``. The frame passed in is not modified.
    min_minutes, max_minutes : float, optional
        Inclusive bounds on the trip duration to keep. The defaults (1 and 60)
        reproduce the previously hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the extra ``duration`` column, holding
        only the rows whose duration lies within the bounds. Rows whose
        duration is NaN (missing timestamp) are dropped as well.
    """
    # `.dt.total_seconds()` is vectorised; a per-row `apply` with a Python
    # lambda is much slower on millions of rows.
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    duration = elapsed.dt.total_seconds() / 60

    # `assign` builds a new frame instead of writing into the caller's, and the
    # final `copy` detaches the result from the filtered slice so that later
    # column assignments on it do not raise SettingWithCopyWarning.
    in_range = duration.between(min_minutes, max_minutes)
    return df.assign(duration=duration)[in_range].copy()

In [6]:
# Rebind the training frame to its preprocessed version (adds `duration`, keeps trips of 1-60 minutes).
df_train = preprocess(df_train)

In [7]:
# Apply the same preprocessing to the validation frame so both splits share one definition of `duration`.
df_val = preprocess(df_val)

In [8]:
# Pickup/dropoff location IDs are cast to strings so they are treated as
# categories rather than numbers (presumably for the DictVectorizer used later).
categorical = ['PULocationID', 'DOLocationID']

# `astype` with a column -> dtype mapping returns a new frame, so nothing is
# written into the filtered frames returned by `preprocess`; assigning with
# `df[cols] = ...` on such a frame raises pandas' SettingWithCopyWarning.
df_train = df_train.astype(dict.fromkeys(categorical, str))
df_val = df_val.astype(dict.fromkeys(categorical, str))

In [9]:
# Convert the categorical columns of the training frame into a list of per-row dicts.
train_dicts = df_train[categorical].to_dict(orient='records')

In [10]:
# Same per-row dict representation for the validation frame.
val_dicts = df_val[categorical].to_dict(orient='records')

In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [14]:
# Fit a linear-regression baseline on the vectorised pickup/dropoff location
# records and report the RMSE (in minutes) on the training and validation sets.
target = 'duration'

# The vectorizer is fitted on the training records only; the validation records
# are transformed with the feature mapping learned from training.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# RMSE is taken as the square root of the MSE rather than via `squared=False`,
# which scikit-learn deprecated in 1.4 and removed in 1.6.
y_pred = lr.predict(X_train)
rmse = mean_squared_error(y_train, y_pred) ** 0.5
print(f'This is the root mean squared error for the training set: {rmse}')

val_pred = lr.predict(X_val)
val_rmse = mean_squared_error(y_val, val_pred) ** 0.5
print(f'This is the root mean squared error for the validation set: {val_rmse}')

This is the root mean squared error for the training set: 7.64926180044339
This is the root mean squared error for the validation set: 7.8118236307935


In [20]:
# NOTE(review): if this runs after `preprocess`, the count includes the derived
# `duration` column, i.e. one more than the columns read from the parquet file.
print(f'The number of columns is:  {len(df_train.columns)}')

The number of columns is:  20


In [18]:
# Print a summary of the training frame (index range, column names, dtypes).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3009173 entries, 0 to 3066765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           object        
 8   DOLocationID           object        
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [21]:
# NOTE(review): `df_train` was rebound to the output of `preprocess` in an earlier
# cell, so this is the standard deviation of the durations that survived the
# 1-60 minute filter, not of all January trips. Confirm which one is intended.
print(f'The standard deviation of the trips in January is: {df_train.duration.std()}')

The standard deviation of the trips in January is: 9.939385620145579
