In [None]:
import pandas as pd #data manipulation
import numpy as np

In [None]:
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
# plotly is not imported directly; it is used through the pandas plotting backend

In [None]:
from sklearn.feature_extraction import DictVectorizer #modelling
from sklearn.linear_model import LinearRegression #modelling
from sklearn.metrics import mean_squared_error #modelling

In [None]:
# Notebook-wide configuration: pandas display limits, the plotting backend
# used by DataFrame.plot, and the NumPy random seed.
pd.set_option('display.max_rows', 200)  # render up to 200 rows before truncating
pd.set_option('display.precision', 3)   # 3 decimal places in rendered frames

# display.max_columns and display.max_colwidth are left at the pandas defaults.
# A bare read of an option in the middle of a cell neither changes nor displays
# it, so no such statements are kept here.

# Route DataFrame.plot through plotly; the plotting cells call .show() on the
# figure object that this backend returns. Requires plotly to be installed.
pd.set_option('plotting.backend', 'plotly')

# Seed NumPy's legacy global random number generator.
np.random.seed(42)

In [None]:
# Read the January 2021 FHV trip records; this month is used as the training set.
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')

# Table dimensions as (rows, columns)
print(df.shape)

# Question 1: Number of records in Jan 2021 FHV data
print(len(df))

# Preview the first rows
df.head()

In [None]:
# Response variable: trip duration in minutes, i.e. drop-off time minus pickup
# time, converted from a timedelta to seconds and then divided by 60.
df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

In [None]:
# Question 2: Average duration in Jan 2021 FHV
# Mean trip length in minutes over all records; in notebook order this runs
# before the outlier filter is applied.
df['duration'].mean()

In [None]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive);
# anything outside that range is treated as an outlier and dropped.
df = df.query("duration >= 1 & duration <= 60")

# Summary statistics of the durations that remain
df['duration'].describe()


In [None]:
# Inspect the column dtypes of the training frame.
df.dtypes

In [None]:
# Pull out the pickup / drop-off location ID columns; missing IDs are replaced
# by the sentinel value -1.
PU_DO = df.loc[:, ['PUlocationID', 'DOlocationID']].fillna(-1)

PU_DO.head(5)

In [None]:
# Question 3: Fraction of missing values
# Share of rows, expressed in percent, whose pickup location ID was missing
# (identified by the -1 sentinel written by fillna).
len(PU_DO.query('PUlocationID == -1')) / len(df) * 100

In [None]:
# Cast every location ID column to string so the IDs are encoded as categories
# (one indicator per distinct value) rather than as numeric features.
PU_DO = PU_DO.astype({column: str for column in PU_DO.columns})

PU_DO.dtypes

In [None]:
# Convert the frame into one {column: value} dict per row, which is the input
# format DictVectorizer expects.
PU_DO_dicts = PU_DO.to_dict(orient='records')

# Preview only the first few records: echoing the whole list would write one
# dict per trip into the cell output.
PU_DO_dicts[:5]

In [None]:
# Question 4: Dimensionality after OHE
# Fit the vectorizer on the training records and encode them in one pass. The
# fitted `dv` is kept so the validation month can be encoded the same way.
dv = DictVectorizer()
PU_DO_ohe = dv.fit_transform(X=PU_DO_dicts)

# (number of rows, number of encoded features)
PU_DO_ohe.shape

In [None]:
# Baseline model: plain linear regression of trip duration on the one-hot
# encoded pickup / drop-off location IDs.
X_train = PU_DO_ohe
Y_train = df['duration'].to_numpy()

lr = LinearRegression().fit(X_train, Y_train)

# In-sample predictions, i.e. for the same rows the model was fitted on.
Y_pred = lr.predict(X_train)

In [None]:
# Pair each observed training duration with its in-sample prediction.
actual_preds = pd.DataFrame(
    data={
        'Actual': Y_train,
        'Predicted': Y_pred,
    }
)

actual_preds.head(5)

In [None]:
# Question 5: RMSE on train
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print(np.sqrt(mean_squared_error(actual_preds.Actual, actual_preds.Predicted)))


# Compare the distributions of actual and predicted durations. The result of
# .plot.hist() is not the last expression of the cell, so it is rendered
# explicitly via .show().
fig = actual_preds.plot.hist()
fig.show()

In [None]:
# Read the February 2021 FHV trip records, used as the validation set.
df_val = pd.read_parquet('./data/fhv_tripdata_2021-02.parquet')

# Response variable, built the same way as for training: trip duration in
# minutes (drop-off minus pickup, in seconds, divided by 60).
df_val['duration'] = (df_val.dropOff_datetime - df_val.pickup_datetime).dt.total_seconds() / 60


In [None]:
# Apply the same preprocessing to the validation month as to the training month.

# Same outlier rule as for training: keep trips of 1 to 60 minutes inclusive.
df_val = df_val.query("duration >= 1 & duration <= 60")

# Location IDs with missing values replaced by -1, then cast to strings.
# Each intermediate stage gets its own name so that PU_DO_val is only ever
# bound to the final encoded matrix; if a later step fails, the downstream
# prediction cell cannot pick up a half-processed object under that name.
PU_DO_val_ids = df_val[['PUlocationID', 'DOlocationID']].fillna(-1).astype(str)

print(PU_DO_val_ids.dtypes)

PU_DO_val_dicts = PU_DO_val_ids.to_dict(orient='records')

# transform (not fit_transform): reuse the encoding learned on the training
# records so the validation matrix has the columns the model was fitted on.
PU_DO_val = dv.transform(PU_DO_val_dicts)

# Number of encoded feature columns
print(PU_DO_val.shape[1])

In [None]:
# Score the validation month with the model fitted on the training month.
Y_val = df_val['duration'].to_numpy()
Y_pred_val = lr.predict(PU_DO_val)

In [None]:
# Pair each observed validation duration with its prediction.
actual_preds_val = pd.DataFrame({'Actual_val': Y_val, 'Predicted_val': Y_pred_val})


# Question 6: RMSE on validation
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print(np.sqrt(mean_squared_error(actual_preds_val.Actual_val,
                                 actual_preds_val.Predicted_val)))


# Compare the distributions of actual and predicted durations. The result of
# .plot.hist() is not the last expression of the cell, so it is rendered
# explicitly via .show().
fig = actual_preds_val.plot.hist()
fig.show()