In [24]:
# import dependencies
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
def read_dataframe(filename):
    """Load a trip-record parquet file and derive the trip duration.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose ``duration`` (minutes between
        ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``) lies in the
        closed interval [1, 60], with ``PULocationID`` and ``DOLocationID``
        cast to ``str``.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df.lpep_dropoff_datetime)
    df['lpep_pickup_datetime'] = pd.to_datetime(df.lpep_pickup_datetime)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the parent frame, so the column
    # assignment below writes to a real frame and does not trigger pandas'
    # SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are cast to strings so they are handled as labels rather
    # than as numeric quantities.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [11]:
# Load the 2021-01 file as the training set and the 2021-02 file as the
# validation set, both through the same preprocessing function.
train_path = 'data/green_tripdata_2021-01.parquet'
val_path = 'data/green_tripdata_2021-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [12]:
# Row counts after filtering, as (train, validation).
tuple(len(frame) for frame in (df_train, df_val))

(73908, 61921)

In [17]:
# Build a combined "pickup_dropoff" key on both frames so that each
# pickup/drop-off pair can be treated as a single category.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [18]:
# Feature set: the combined pickup/drop-off key as the only categorical
# column and the trip distance as the only numerical one.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Turn every row into a {column: value} dict, the input DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows only; the validation rows are
# transformed with the already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [19]:
# Regression target: the trip duration column of each frame as an array.
y_train, y_val = (frame['duration'].values for frame in (df_train, df_val))

In [28]:
# Baseline: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

# Take the square root explicitly: the `squared=False` keyword was deprecated
# in scikit-learn 1.4 and later removed, so relying on it breaks on upgrade.
train_rmse = mean_squared_error(y_train, y_pred) ** 0.5
val_rmse = mean_squared_error(y_val, val_pred) ** 0.5

# These values are root-mean-squared errors, so they are labelled as RMSE
# rather than "mse" / "pred".
print('rmse score:', train_rmse)
print('r2-score:', r2_score(y_train, y_pred))

print('\n val rmse: ', val_rmse)

mse score: 5.6995641181989765
r2-score: 0.7570393923885601

 val pred:  7.758715203341164


In [26]:
# LASSO: linear regression with an L1 penalty (alpha=0.0001).
lr = Lasso(alpha=0.0001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

# Take the square root explicitly: the `squared=False` keyword was deprecated
# in scikit-learn 1.4 and later removed, so relying on it breaks on upgrade.
train_rmse = mean_squared_error(y_train, y_pred) ** 0.5
val_rmse = mean_squared_error(y_val, val_pred) ** 0.5

# These values are root-mean-squared errors, so they are labelled as RMSE
# rather than "mse" / "pred".
print('rmse score:', train_rmse)
print('r2-score:', r2_score(y_train, y_pred))

print('\n val rmse: ', val_rmse)

mse score: 5.108197838551504
r2-score: 0.8048412644602487

 val pred:  7.616617761096093


In [33]:
# RIDGE: linear regression with an L2 penalty (alpha=0.001).
lr = Ridge(alpha=0.001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

# Take the square root explicitly: the `squared=False` keyword was deprecated
# in scikit-learn 1.4 and later removed, so relying on it breaks on upgrade.
train_rmse = mean_squared_error(y_train, y_pred) ** 0.5
val_rmse = mean_squared_error(y_val, val_pred) ** 0.5

# These values are root-mean-squared errors, so they are labelled as RMSE
# rather than "mse" / "pred".
print('rmse score:', train_rmse)
print('r2-score:', r2_score(y_train, y_pred))

print('\n val rmse: ', val_rmse)

mse score: 4.944550134760984
r2-score: 0.817145291823711

 val pred:  7.510909603240781


In [39]:
# Persist the fitted vectorizer together with the most recently fitted model
# (`lr`) as a single pickled tuple, so both can be restored as a pair.
model_path = Path('model/lr1.bin')

# Create the output directory first: opening a file for writing inside a
# missing directory raises FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as file:
    pickle.dump((dv, lr), file)