In [None]:
import pickle
from pathlib import Path

import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


In [None]:
pip install pyarrow

In [None]:
# Read the green_tripdata_2021-01 parquet file directly from its URL
# (needs network access) using the pyarrow engine.
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet"
df = pd.read_parquet(url, engine="pyarrow")

In [None]:
# Quick look at the first three rows.
df.head(3)

In [None]:
# Ensure both timestamp columns are datetime-typed so they can be subtracted
# (converts them if they arrive as strings).
df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])
df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])

In [None]:
# Distinct values present in the trip_type column.
df.trip_type.unique().tolist()

In [None]:
# New column: trip duration in minutes.
# Subtracting the timestamps yields a timedelta Series; the vectorised .dt
# accessor converts it to seconds in one pass instead of calling a Python
# lambda once per row.
df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

In [None]:
#df = df[df.trip_type == 2] # interested in this type of trip

In [None]:
# Summary statistics of trip duration (minutes), including the 95th, 98th and
# 99th percentiles.
df.duration.describe(percentiles= [0.95,0.98,0.99 ])

In [None]:
# From a business point of view this is important to note: the 98th percentile of trip duration is around 1 hour.

In [None]:
# Extra filtering: only pay attention to trips that last at least one minute
# (and at most 60). The mean of the boolean mask is the fraction of trips kept.

((df.duration >= 1) & (df.duration <= 60)).mean()


In [None]:
# Keep only trips lasting between 1 and 60 minutes (inclusive).
# .copy() makes the result an independent frame, so the column casts applied to
# `df` in later cells do not raise SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
df.head()  # preview only; all trip types are kept (no trip_type filter is applied)

In [None]:
# Model inputs: the location IDs are used as categories, the trip distance as a number.
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

In [None]:
# let's train a model, 

In [None]:
# Check the current dtypes of the location-ID columns.
df[categorical].dtypes

In [None]:
# To implement one-hot encoding we can use a dictionary vectorizer (DictVectorizer).
# DictVectorizer treats every value that is not a number as a categorical variable.

In [None]:
# The location IDs are labels, not quantities, so they should not be fed to the
# model as numbers. Casting them to str lets DictVectorizer handle them as
# categorical values.
# This cell only previews the resulting dtypes; nothing is assigned yet.

df[categorical].astype(str).dtypes

In [None]:
# Store the string-typed location IDs back into the frame.
df[categorical] = df[categorical].astype(str)

In [None]:
# We now have a DataFrame, but the vectorizer needs a list of dicts.

In [None]:
dv = DictVectorizer() # turns a list of feature dicts into a numeric feature matrix

In [None]:
# Preview: the first ten rows as a list of per-row dicts ('records' orientation).
df[categorical + numerical].iloc[:10].to_dict(orient='records')

In [None]:
# Apply the same conversion to the entire DataFrame: one dict per trip.
train_dicts= df[categorical + numerical].to_dict(orient='records')

In [None]:
# Feed the row dicts to a fresh DictVectorizer: fit it and encode the rows
# into the training feature matrix in one step.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [None]:
# Inspect the encoded training matrix.
X_train

In [None]:
# Names of the feature columns the vectorizer generated.
dv.feature_names_

In [None]:
# Regression target: trip duration in minutes, as a plain array.
target = 'duration'
y_train= df[target].values

In [None]:
# Inspect the target values.
y_train

In [None]:
# Baseline Model --> Linear Regression

In [None]:
# Fit a plain linear regression on the encoded features.
lr = LinearRegression()
lr.fit(X_train,y_train)

In [None]:
# Predict on the same rows the model was fitted on, so any error computed from
# these predictions is a training error.
y_pred = lr.predict(X_train)

In [None]:
# Compare the distribution of predicted vs. actual durations.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is its
# documented replacement for a normalised histogram plus density curve.
fig, ax = plt.subplots()
sns.histplot(y_pred, label='prediction', kde=True, stat='density', ax=ax)
sns.histplot(y_train, label='actual', kde=True, stat='density', ax=ax)

ax.set(xlabel='duration (minutes)', ylabel='density', title='Predicted vs. actual trip duration')
ax.legend();  # trailing ';' hides the Legend repr in the cell output

In [None]:
# The predictions differ from the actual values; let's quantify that.

In [None]:
# RMSE on the training data, computed as sqrt(MSE). The `squared=False`
# shortcut is deprecated in recent scikit-learn releases, so this form works
# on both old and new versions.
mean_squared_error(y_train, y_pred) ** 0.5

In [None]:
# On average our model is wrong by about 9 minutes, which is not ideal:
# you expect to reach your destination in 30 minutes, but it turns out that it takes 40.

In [None]:
# - Lasso

In [None]:
# Recreate everything we did above and put it all inside a function.

In [None]:
# All the preprocessing we need to do: 

In [None]:
def read_dataframe(url, min_duration=1, max_duration=60):
    """Load one parquet file of trip records and prepare it for modelling.

    Steps: read the file, derive ``duration`` (in minutes) from the pickup and
    dropoff timestamps, keep only trips whose duration lies in
    ``[min_duration, max_duration]``, and cast the location-ID columns to
    ``str`` so they are handled as categories rather than numbers.

    Args:
        url: Path or URL of the parquet file, passed to ``pd.read_parquet``.
        min_duration: Shortest trip to keep, in minutes (inclusive).
        max_duration: Longest trip to keep, in minutes (inclusive).

    Returns:
        A new DataFrame, independent of the frame that was read, with the
        added ``duration`` column and string-typed ``PULocationID`` /
        ``DOLocationID`` columns.
    """
    df = pd.read_parquet(url, engine="pyarrow")

    # Make sure both timestamp columns are datetime-typed before subtracting.
    df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])
    df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])

    # Vectorised timedelta -> minutes (replaces a per-row Python lambda).
    trip_time = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the column
    # assignment below (and any columns the caller adds later) does not raise
    # SettingWithCopyWarning.
    in_range = df["duration"].between(min_duration, max_duration)
    df = df[in_range].copy()

    # Only the categorical columns need preprocessing: IDs are labels, not numbers.
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

In [None]:
# January 2021 data is used for training.
df_train = read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")

# February 2021 data is held out for validation.
df_val = read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")


In [None]:
# Number of rows in each split after the duration filter.
len(df_train), len(df_val)

In [None]:
# Feature columns: pickup/drop-off IDs as categories, trip distance as a number.
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

# The vectorizer is fitted on the training split only ...
train_dicts= df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# ... and the validation split is encoded with that already-fitted vectorizer.
val_dicts= df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [None]:
# Targets (trip duration in minutes) for the training and validation splits.
target = 'duration'
y_train= df_train[target].values

y_val = df_val[target].values

In [None]:
# Baseline: linear regression fitted on the training split and scored on the
# validation split.

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
# Validation RMSE as sqrt(MSE); avoids the `squared=False` argument, which is
# deprecated in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Lasso (alpha=0.001) fitted on the training split and scored on the
# validation split.

lr = Lasso(alpha=0.001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
# Validation RMSE as sqrt(MSE); avoids the `squared=False` argument, which is
# deprecated in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Ridge with default settings, fitted on the training split and scored on the
# validation split.

lr = Ridge()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
# Validation RMSE as sqrt(MSE); avoids the `squared=False` argument, which is
# deprecated in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
#### Do Some Experimentation

In [None]:
# We can create another feature,
# e.g. combine the pickup and drop-off locations.

In [None]:
# Combined pickup/drop-off feature of the form '<PULocationID>_<DOLocationID>'.
# String concatenation works here because read_dataframe already cast both ID
# columns to str.
df_train['PU_DO'] = df_train['PULocationID'] + '_'+  df_train['DOLocationID']

df_val['PU_DO'] = df_val['PULocationID'] + '_'+  df_val['DOLocationID']

In [None]:
# Same pipeline as before, but with the single combined PU_DO category.
categorical =  ['PU_DO'] # replaces the separate ['PULocationID','DOLocationID'] columns
numerical = ['trip_distance']

dv = DictVectorizer()

# Fit the vectorizer on the training split only ...
train_dicts= df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# ... and encode the validation split with the fitted vectorizer.
val_dicts= df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [None]:
# Targets (trip duration in minutes) for the training and validation splits.
target = 'duration'
y_train= df_train[target].values

y_val = df_val[target].values

In [None]:
# Baseline linear regression again, now on the combined PU_DO feature
# (original note: the result became much better here).

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
# Validation RMSE as sqrt(MSE); avoids the `squared=False` argument, which is
# deprecated in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Lasso (alpha=0.0001) on the combined PU_DO feature, scored on the
# validation split.

lr = Lasso(alpha=0.0001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
# Validation RMSE as sqrt(MSE); avoids the `squared=False` argument, which is
# deprecated in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Ridge with default settings on the combined PU_DO feature, scored on the
# validation split.

lr = Ridge()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
# Validation RMSE as sqrt(MSE); avoids the `squared=False` argument, which is
# deprecated in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# We reduced the error quite significantly.

In [None]:
# save the model

In [None]:
import pickle

In [None]:
# Save the fitted DictVectorizer together with the model so the same feature
# encoding can be re-applied when the model is loaded.
# NOTE(review): `lr` is whichever model was fitted last. In this notebook that
# is the Ridge model, even though the file is named `lin_reg` - confirm which
# model is meant to be saved.
model_path = Path('models/lin_reg.bin')
# open() does not create missing directories, so make sure `models/` exists.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f_out:
    pickle.dump((dv, lr), f_out)