In [1]:
import pandas as pd

In [2]:
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## Q1. Downloading the data

We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records"

Download the data for January and February 2021

Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records".

Read the data for January. How many records are there?

In [7]:
df_train = pd.read_parquet('fhv_tripdata_2021-01.parquet')
df_val = pd.read_parquet('fhv_tripdata_2021-02.parquet')

target = ["duration"]
cat = ["PUlocationID", "DOlocationID"]

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 61.6+ MB


## Q2. Computing duration

Now let's compute the `duration` variable. It should contain the duration of a ride in minutes.

What's the average trip duration in January?

### Train

In [9]:
df_train['duration'] = df_train.dropOff_datetime - df_train.pickup_datetime 
df_train.duration = df_train.duration.apply(lambda td: td.total_seconds() / 60)

In [10]:
df_train['duration'].mean()

19.167224093791006

In [11]:
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]

### Validation

In [12]:
df_val['duration'] = df_val.dropOff_datetime - df_val.pickup_datetime 
df_val.duration = df_val.duration.apply(lambda td: td.total_seconds() / 60)

In [13]:
df_val["duration"].mean()

20.706986225199763

In [14]:
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]

## Q3. Missing values

The features we'll use for our model are the pickup and dropoff location IDs.

But they have a lot of missing values there. Let's replace them with "-1"

What's the factions of missing values for the pickup location ID? (Or the fraction of "-1"s after you filled the NAs)

In [15]:
df_train.PUlocationID.isna().mean()

0.8352732770722617

### Filling train and validation data data

In [16]:
df_train = df_train[cat+target].fillna(-1).copy()

In [17]:
df_val = df_val[cat+target].fillna(-1) 

In [18]:
for item in cat:
    df_train[item] = df_train[item].astype(str)
    df_val[item] = df_train[item].astype(str)

## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer
* Get a feature matrix from it

In [19]:
dv = DictVectorizer()

train_dicts = df_train[cat].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[cat].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [20]:
X_train.shape

(1109826, 525)

## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

* Train a plain linear regression model with default parameters
* Calculate the RMSE of the model on the training data

In [21]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [22]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)

mean_squared_error(y_train, y_pred_train, squared=False)

10.528519367437616

## Q6. Evaluating the model

Now let's apply this model to the validation dataset (Feb 2021).

What's the RMSE on validation?

In [23]:
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

12.923581093608595