In [1]:
import pandas as pd

In [2]:
# Download the yellow_tripdata_2024-01 parquet file into a DataFrame.
df = pd.read_parquet(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
)

In [3]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


## Clean data

In [4]:
# Make sure both timestamp columns hold datetimes so they can be subtracted.
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

In [5]:
# Trip duration in minutes. The vectorised `.dt.total_seconds()` accessor
# replaces a per-row Python lambda, which is needlessly slow on a frame with
# millions of rows.
df['duration'] = (
    df.tpep_dropoff_datetime - df.tpep_pickup_datetime
).dt.total_seconds() / 60

In [6]:
# Keep only trips whose duration lies in [1, 60]; `between` is inclusive on
# both ends by default.
df = df[df.duration.between(1, 60)]

In [7]:
# Model inputs: location IDs are handled as categories, distance as a number.
categorical, numerical = ["PULocationID", "DOLocationID"], ["trip_distance"]

In [2]:
def read_data(url: str) -> pd.DataFrame:
    """Load a parquet file of trip records and prepare it for modelling.

    Reads the file at ``url``, adds a ``duration`` column holding the trip
    length in minutes, keeps only trips lasting between 1 and 60 minutes
    (inclusive), and casts the pickup/drop-off location IDs to strings.

    Args:
        url: Location handed straight to ``pd.read_parquet``.

    Returns:
        A new DataFrame, independent of the frame that was read, containing
        only the filtered trips.
    """
    df = pd.read_parquet(url)

    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (
        df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    ).dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below writes to a real frame rather than to a slice
    # (which would raise SettingWithCopyWarning).
    df = df[df['duration'].between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# January 2024 is loaded first as the training set, February 2024 second as
# the held-out set; both go through the same read_data preparation.
train_data, test_data = (
    read_data(url)
    for url in (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet",
    )
)

In [4]:
# Row counts of the two prepared frames.
train_data.shape[0], test_data.shape[0]

(2898906, 2938060)

In [5]:
# Alias the two frames under the names used below (no copy is made).
df_train, df_val = train_data, test_data

## Preprocessing

In [6]:
from sklearn.feature_extraction import DictVectorizer

In [7]:
# Columns fed to the model, grouped by how they are treated.
numerical = ["trip_distance"]
categorical = ["PULocationID", "DOLocationID"]

# Vectoriser that turns per-row feature dicts into a feature matrix.
dv = DictVectorizer()

In [8]:
# Fit the vectoriser on the training records only, then reuse the fitted
# mapping to encode the validation records.
train_dicts = df_train[[*categorical, *numerical]].to_dict("records")
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[[*categorical, *numerical]].to_dict("records")
X_val = dv.transform(val_dicts)

In [9]:
# Column the models are trained to predict.
target = "duration"

# Plain NumPy arrays of the target for each split.
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [10]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, root_mean_squared_error

**Linear regression** model

In [11]:
# Fit a linear regression on the training matrix (fit() returns the
# estimator itself) and score its predictions on the validation split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_val)

print("mse: ", mean_squared_error(y_val, y_pred))
print("rmse: ", root_mean_squared_error(y_val, y_pred))

mse:  66.04703702738202
rmse:  8.12693281794442


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Overlay the distribution of the predictions with the distribution of the
# true durations. `y_pred` was computed from `X_val`, so the matching
# "actual" series is `y_val` (not `y_train`).
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its documented replacement.
sns.histplot(y_pred, label='prediction', kde=True, stat='density')
sns.histplot(y_val, label='actual', kde=True, stat='density')

plt.legend();

**Lasso regression** model

In [None]:
# Same fit/score routine with an L1-regularised model (alpha=0.001); this
# rebinds `model` and `y_pred`.
model = Lasso(alpha=0.001).fit(X_train, y_train)
y_pred = model.predict(X_val)

print("mse: ", mean_squared_error(y_val, y_pred))
print("rmse: ", root_mean_squared_error(y_val, y_pred))

**Ridge regression** model

In [12]:
# Same fit/score routine with an L2-regularised model (alpha=0.001); this
# rebinds `model` and `y_pred`.
model = Ridge(alpha=0.001).fit(X_train, y_train)
y_pred = model.predict(X_val)

print("mse: ", mean_squared_error(y_val, y_pred))
print("rmse: ", root_mean_squared_error(y_val, y_pred))

mse:  66.00282036022344
rmse:  8.124211983954101


## Save the model

In [19]:
import os
import pickle

# open() does not create missing directories, so make sure ./models exists;
# otherwise this cell fails with FileNotFoundError on a fresh checkout.
os.makedirs('./models', exist_ok=True)

# Persist the fitted vectoriser together with the model so both can be
# restored as a pair when loading.
# NOTE(review): `model` is whichever estimator was fitted last (the Ridge
# cell in this notebook), although the file is named lin_reg.bin. Confirm
# this is the model intended for saving.
with open('./models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)