In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
pd.__version__

'1.4.2'

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [5]:
# Read the 2023-01 and 2023-02 yellow trip-data parquet files from ./data.
df_jan_2023, df_feb_2023 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/yellow_tripdata_2023-01.parquet',
        './data/yellow_tripdata_2023-02.parquet',
    )
)

In [6]:
# Work on an independent copy: a plain `df = df_jan_2023` is only an alias, so the
# column assignments in the following cells would also rewrite the raw January frame.
df = df_jan_2023.copy()

In [7]:
# df.head()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [9]:
# Trip length as a timedelta: drop-off timestamp minus pick-up timestamp.
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']

In [10]:
df['duration'].describe()

count                      3066766
mean     0 days 00:15:40.139710039
std      0 days 00:42:35.661074517
min              -1 days +23:30:48
25%                0 days 00:07:07
50%                0 days 00:11:31
75%                0 days 00:18:18
max                6 days 23:09:11
Name: duration, dtype: object

In [11]:
# Duration in minutes, recomputed from the raw timestamps instead of converting the
# existing `duration` column in place. This keeps the cell idempotent: re-running it
# would otherwise fail, because `.dt` is unavailable once the column holds floats.
df['duration'] = (
    (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
)

In [12]:
# print(df['duration'].dtype)
# print(type(df['duration'].iloc[0]))

In [13]:
# df['duration'] = df.duration.apply(lambda td: td.total_seconds() / 60)  # td is a timedelta; not needed, since .dt.total_seconds() does the same conversion without a per-row lambda

In [14]:
#df = df[df['RatecodeID'] == 5] # Negotiated fare - https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [15]:
# df.duration.describe(percentiles=[0.95, 0.98, 0.99])

In [16]:
# Share of trips whose duration lies between 1 and 60 minutes (both bounds inclusive).
df['duration'].between(1, 60).mean()

0.9812202822125979

In [17]:
# Keep only trips of 1 to 60 minutes (inclusive); .copy() detaches the result
# from the frame it was sliced from.
df = df.loc[df['duration'].between(1, 60)].copy()

In [18]:
# Columns used as categorical features: they are cast to str and turned into
# per-row dicts for the DictVectorizer in the cells that follow.
categorical = ['PULocationID', 'DOLocationID']

In [19]:
# Numerical feature columns.
# NOTE(review): not part of the feature dicts at the moment — the only line that
# combined this list with `categorical` is commented out.
numerical = ['trip_distance']

In [20]:
# Store the categorical IDs as strings, one column at a time.
for column in categorical:
    df[column] = df[column].astype(str)

In [21]:
# df[categorical + numerical].iloc[:10].to_dict(orient='records')

In [22]:
# One {column: value} dict per trip, built from the categorical columns only.
# To include the numerical columns as well, select `categorical + numerical` instead.
train_dicts = df.loc[:, categorical].to_dict(orient='records')

In [23]:
# Fit a DictVectorizer on the training dicts and encode them in the same call.
# `dv` keeps the fitted vectorizer; `X_train` is the encoded training matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [24]:
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [25]:
X_train.shape

(3009173, 515)

In [26]:
# dv.feature_names_

In [27]:
# Name of the label column, and that column's values as a NumPy array.
target = 'duration'
y_train = df[target].to_numpy()

In [28]:
y_train

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

In [29]:
def preprocess(train_df, val_df, catergorical, numerical, target, flag='train'):
    """Turn raw trip frames into an encoded feature matrix and a target vector.

    Each frame is cleaned the same way: the trip duration in minutes is derived
    from the drop-off and pick-up timestamps, trips outside 1-60 minutes
    (inclusive) are dropped, and the categorical columns are cast to ``str`` and
    converted to one dict per row. A fresh ``DictVectorizer`` is always fitted on
    the cleaned training rows.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame the vectorizer is fitted on. Needs ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime`` and the categorical columns. Not modified.
    val_df : pandas.DataFrame or None
        Frame to encode when ``flag == 'val'``; ignored (and may be ``None``)
        otherwise. Not modified.
    catergorical : sequence of str
        Names of the categorical feature columns. The misspelled parameter name
        is kept so existing keyword callers keep working.
    numerical : sequence of str
        Accepted for interface compatibility; currently unused.
    target : str
        Label column. ``'duration'`` means the duration in minutes computed
        here; any other name is read from the frame itself.
    flag : str, default 'train'
        ``'val'`` returns the encoded validation data; any other value returns
        the encoded training data.

    Returns
    -------
    X
        Output of ``DictVectorizer.fit_transform`` (train) or ``transform`` (val).
    y : numpy.ndarray
        Target values of the rows that survived the duration filter.
    """
    # Use the argument the caller passed, not a same-looking notebook global.
    feature_cols = list(catergorical)

    def _clean(frame):
        # Derive the duration as a standalone Series instead of writing a new
        # column, so the caller's DataFrame is left untouched.
        minutes = (
            frame['tpep_dropoff_datetime'] - frame['tpep_pickup_datetime']
        ).dt.total_seconds() / 60
        keep = (minutes >= 1) & (minutes <= 60)

        dicts = frame.loc[keep, feature_cols].astype(str).to_dict(orient='records')
        if target == 'duration':
            labels = minutes[keep].values
        else:
            labels = frame.loc[keep, target].values
        return dicts, labels

    train_dicts, y_train = _clean(train_df)
    dv = DictVectorizer()

    if flag == 'val':
        # Only the vocabulary learned from the training rows is needed here.
        dv.fit(train_dicts)
        val_dicts, y_val = _clean(val_df)
        return dv.transform(val_dicts), y_val

    # The validation frame is deliberately not touched on the training path.
    return dv.fit_transform(train_dicts), y_train

In [None]:
# Encode the February frame for validation; preprocess() fits its own
# vectorizer on the January frame passed as the first argument.
X_val, y_val = preprocess(
    df_jan_2023,
    df_feb_2023,
    categorical,
    numerical,
    target,
    flag='val',
)

In [None]:
X_val.shape

In [None]:
# Fit a linear regression on the encoded training matrix; fit() returns the
# estimator itself, which is then displayed as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# RMSE on the training data, taken as the explicit square root of the MSE.
# The `squared=False` shortcut of mean_squared_error is deprecated in newer
# scikit-learn releases, while sqrt(MSE) works on every version.
y_pred_train = model.predict(X_train)
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_pred_train)))
print("RMSE on train:", round(rmse_train, 2))

In [None]:
# RMSE on the validation data, taken as the explicit square root of the MSE.
# The `squared=False` shortcut of mean_squared_error is deprecated in newer
# scikit-learn releases, while sqrt(MSE) works on every version.
y_pred_val = model.predict(X_val)
rmse_val = float(np.sqrt(mean_squared_error(y_val, y_pred_val)))
print("RMSE on validation:", round(rmse_val, 2))