In [None]:
import pandas as pd
import numpy as np
# Route every pandas `.plot` call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

### NOTE: All outputs have been cleared because the notebook is too large to push to GitHub with its outputs included. To see the outputs, please run the notebook yourself.

Answers to the questions:  
1- 18  
2- 46.445295712725304  
3- 0.9827547930522406  
4- 515  
5- 6.986190837370544  
6- 7.786409085078911  

### Load the data

In [None]:
# Read the 2022-01 trip file; these rows are used to fit the model further down.
data = pd.read_parquet("../yellow_tripdata_2022-01.parquet")
data.head(2)

### Calculate Duration and its Standard Deviation (two methods are shown)

In [None]:
# `duration` is the drop-off minus pick-up Timedelta;
# `duration_minutes` is that same span expressed in fractional minutes.
data = data.assign(
    duration=data["tpep_dropoff_datetime"] - data["tpep_pickup_datetime"],
    duration_minutes=lambda trips: trips["duration"].dt.total_seconds() / 60,
)

In [None]:
# Method 1: pandas. Series.std() defaults to ddof=1 (sample standard deviation),
# so it is not identical to np.std, which defaults to ddof=0.
data["duration_minutes"].std()

In [None]:
# Method 2: NumPy. np.std defaults to ddof=0 (population standard deviation);
# on a dataset this large the difference from the pandas result is tiny.
np.std(data["duration_minutes"])

### Drop Outliers and get durations between [1, 60] minutes.  

I have used plotly to create a box plot, since it is more informative about the distribution than the seaborn histogram plot.

In [None]:
# Box plot of the trip durations, drawn by the configured pandas plotting backend.
data["duration_minutes"].plot.box()

In [None]:
import seaborn as sns

# Density-normalised histogram of the trip durations with a KDE curve on top.
sns.histplot(
    data["duration_minutes"],
    label="duration in minutes",
    kde=True,
    stat="density",
    linewidth=0,
)

Rides with a duration in [1, 60] minutes make up about 98.3% of the data.

In [None]:
# Fraction of all trips whose duration lies in the closed interval [1, 60] minutes.
data.query("1 <= duration_minutes <= 60.0").shape[0] / data.shape[0]

In [None]:
# Keep only trips lasting between 1 and 60 minutes (both ends inclusive).
data = data.loc[data["duration_minutes"].between(1, 60.0)]

### One-Hot Encoding

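This time we are using `DictVectorizer`. It one-hot encodes only the string values in each dictionary, so the location IDs are cast to `str` first.  
It has some downsides: values left as numbers are passed through unchanged, which would treat the IDs as ordinal quantities.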

In [None]:
# One dict per trip holding the pick-up / drop-off location IDs as strings,
# which is the record format handed to DictVectorizer below.
train_dict_categories = (
    data
    .filter(items=['PULocationID', 'DOLocationID'])
    .astype(str)
    .to_dict(orient='records')
)

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Fit the vectorizer on the training records and encode them in one step.
# The fitted `dv` is kept so the validation records can be encoded the same way.
dv = DictVectorizer()
train = dv.fit_transform(train_dict_categories)
# (number of trips, number of encoded feature columns)
train.shape

In [None]:
# Regression target: trip duration in minutes.
y_train = data["duration_minutes"]

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the one-hot encoded location features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(train, y_train)

# In-sample predictions, used for the training error in the next cell.
y_pred = lr.predict(train)

In [None]:
from sklearn.metrics import mean_squared_error

# Training RMSE. The `squared=False` flag was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the square root is taken explicitly; this form works on
# every scikit-learn version.
np.sqrt(mean_squared_error(y_train, y_pred))

In [None]:
# Drop the training-only objects; the fitted `dv` and `lr` are still needed
# for the validation section.
del data
del train_dict_categories, train
del y_train, y_pred

### Validation Set

In [None]:
def read_clean_data(filename, categorical=None, min_minutes=1.0, max_minutes=60.0):
    """Load a trip parquet file, add durations, drop outliers and build feature dicts.

    The file must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
    columns plus every column named in ``categorical``.

    Parameters
    ----------
    filename : str or path-like
        Parquet file passed to ``pd.read_parquet``.
    categorical : sequence of str, optional
        Columns to export as string-valued records. Defaults to
        ``['PULocationID', 'DOLocationID']``.
    min_minutes, max_minutes : float, optional
        Inclusive bounds on ``duration_minutes``; rows outside the interval
        (or with a missing duration) are dropped. Defaults are 1 and 60.

    Returns
    -------
    (DataFrame, list of dict)
        The filtered frame, with ``duration`` and ``duration_minutes`` columns
        added, and one ``{column: str(value)}`` dict per remaining row.

    Raises
    ------
    KeyError
        If any column named in ``categorical`` is missing from the file.
    """
    if categorical is None:
        categorical = ['PULocationID', 'DOLocationID']
    else:
        categorical = list(categorical)

    df = pd.read_parquet(filename)
    df = df.assign(
        duration=lambda df_: df_.tpep_dropoff_datetime - df_.tpep_pickup_datetime,
        duration_minutes=lambda df_: df_.duration.dt.total_seconds() / 60,
    )
    # `between` is inclusive on both ends, matching "min <= duration <= max".
    df = df.loc[df.duration_minutes.between(min_minutes, max_minutes)]

    # Plain column selection instead of `.filter`: `.filter` silently skips
    # missing columns, which would produce empty feature dicts without any error.
    dicts_categorical = df[categorical].astype(str).to_dict(orient='records')
    return df, dicts_categorical

In [None]:
# Apply the same cleaning pipeline to the 2022-02 file to obtain the validation set.
data_val, val_dicts_categories = read_clean_data(
    filename="../yellow_tripdata_2022-02.parquet"
)

In [None]:
# Validation target, and the validation records encoded with the already-fitted
# vectorizer (transform only, no refit), so the columns line up with training.
y_val = data_val["duration_minutes"]
val = dv.transform(val_dicts_categories)

In [None]:
# Validation RMSE. The square root is taken explicitly because
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6.
y_pred_val = lr.predict(val)
print(np.sqrt(mean_squared_error(y_val, y_pred_val)))
# Release the validation artefacts together with the fitted model and vectorizer.
del val, val_dicts_categories, lr, dv, y_pred_val