Imports

In [17]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import math
import os

Constants

In [18]:
# Data directory is resolved relative to the current working directory:
# <cwd>/../data. Both monthly parquet files live directly inside it.
DATA_LOCATION = Path(os.getcwd()).joinpath('..', 'data')
JAN_DATA = DATA_LOCATION.joinpath('yellow_tripdata_2022-01.parquet')
FEV_DATA = DATA_LOCATION.joinpath('yellow_tripdata_2022-02.parquet')

Load datasets

In [19]:
# Read the January file and record its column count *before* any derived
# columns are added to the frame in later cells.
jan_data = pd.read_parquet(JAN_DATA)
col_num = jan_data.shape[1]

## Exercises

### Exercise 1

In [20]:
# Exercise 1: report the number of columns in the raw January frame.
print(
    f"There exists a total of {col_num} column in the January dataset"
)

There exists a total of 19 column in the January dataset


### Exercise 2

In [21]:
# Trip duration in minutes: (dropoff - pickup) as seconds, divided by 60.
# The new column is written onto jan_data itself, so the frame gains a
# 'duration' column from this cell onward.
jan_data['duration'] = (
    jan_data['tpep_dropoff_datetime']
    .sub(jan_data['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)
# Standard deviation of the duration column over the unfiltered data.
std_deviation = jan_data['duration'].std()

In [22]:
# Exercise 2: the '.4' format spec prints four significant digits.
print(
    f"The standard deviation for the duration column is {std_deviation:.4}"
)

The standard deviation for the duration column is 46.45


### Exercise 3

In [23]:
# Keep only trips whose duration lies in [1, 60] minutes (both bounds
# inclusive); everything else is treated as an outlier.
outlier_free_data = jan_data.loc[jan_data['duration'].between(1, 60)]
# Share of rows that survived the filter, expressed as a percentage.
pct_kept = len(outlier_free_data) / len(jan_data) * 100

In [24]:
# Exercise 3: int() truncates, so the percentage is shown without decimals
# and is not rounded up.
print(
    f"The fraction of records left after outlier removal is {int(pct_kept)}%"
)

The fraction of records left after outlier removal is 98%


### Exercise 4

In [25]:
# Display the column index of the filtered frame as the cell output
# (it now includes the derived 'duration' column).
outlier_free_data.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [26]:
# Exercise 4: one-hot encode the pickup and dropoff location IDs.
cat_cols = ['PULocationID', 'DOLocationID']
# Cast the IDs to strings and turn each row into a {column: value} dict,
# which is the input format DictVectorizer consumes.
train_data = (
    outlier_free_data[cat_cols]
    .astype('str')
    .to_dict(orient="records")
)
vectorizer = DictVectorizer()
# Fit the vocabulary on the January rows; the fitted vectorizer is reused
# later to encode the February data with the same columns.
X_data = vectorizer.fit_transform(train_data)
_, nr_cols = X_data.shape

In [27]:
# Report the width of the encoded feature matrix.
print(
    f"After one-hot enconding we have {nr_cols} columns"
)

After one-hot enconding we have 515 columns


### Exercise 5

In [28]:
# Exercise 5: fit a plain linear regression on the encoded January data.
# Target is the trip duration (minutes) of the filtered rows.
y_data = outlier_free_data['duration'].values
# fit() returns the estimator itself, so construction and fitting chain.
lr = LinearRegression().fit(X_data, y_data)
# In-sample predictions, used for the training RMSE in the next cell.
y_pred = lr.predict(X_data)

In [29]:
# Training RMSE: square root of the mean squared error on the January data.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here;
# this form works on every scikit-learn version and returns a plain float.
# NOTE: `y_pred` is rebound by the Exercise 6 cell further down, so this cell
# must be run before that one (or after re-running the Exercise 5 fit cell).
math.sqrt(mean_squared_error(y_data, y_pred))

6.986190836477672

### Exercise 6

In [30]:
# Exercise 6: apply the January model to the February data.
# Same preparation as for January: duration in minutes, then keep only
# trips lasting between 1 and 60 minutes (inclusive).
fev_data = (
    pd.read_parquet(FEV_DATA)
    .assign(
        duration=lambda df: df['tpep_dropoff_datetime']
        .sub(df['tpep_pickup_datetime'])
        .dt.total_seconds()
        .div(60)
    )
    .loc[lambda df: df['duration'].between(1, 60)]
)
y_test = fev_data['duration']
# transform (not fit_transform): reuse the vocabulary learned on January so
# the February matrix has the same columns the model was trained on.
X_test = vectorizer.transform(
    fev_data[cat_cols].astype(str).to_dict(orient="records")
)
# Rebinds y_pred: from here on it holds the February predictions.
y_pred = lr.predict(X_test)

In [31]:
# Validation RMSE on the February data.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here;
# this form works on every scikit-learn version and returns a plain float.
math.sqrt(mean_squared_error(y_test, y_pred))

7.78640879016696