In [78]:
import pandas as pd
import datetime

# Q1. Downloading the data

In [88]:
# Load the two monthly FHV trip files: `jan` is used to fit the model below,
# `feb` is held back and only used for validation in Q6.
jan = pd.read_parquet('fhv_tripdata_2021-01.parquet')
feb = pd.read_parquet('fhv_tripdata_2021-02.parquet')

## Read the data for January. How many records are there?

In [89]:
# (rows, columns) of the January frame; the row count is the Q1 answer.
jan.shape

(1154112, 7)

# Q2. Computing duration
## Now let's compute the duration variable. It should contain the duration of a ride in minutes.
## What's the average trip duration in January?

In [90]:
# Check column types: both timestamp columns must be datetime64 so they can be subtracted.
jan.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
dtype: object

In [91]:
# Ride duration in minutes: dropoff minus pickup gives a timedelta, and
# dividing by a one-minute Timedelta turns it into a plain float.
jan['duration'] = (
    jan['dropOff_datetime']
    .sub(jan['pickup_datetime'])
    .div(pd.Timedelta(minutes=1))
)

In [92]:
# Average trip duration in minutes, rounded to 3 decimals (Q2 answer).
round(jan['duration'].mean(),3)

19.167

# Q3. Missing values
## The features we'll use for our model are the pickup and dropoff location IDs.

## But they have a lot of missing values there. Let's replace them with "-1".

## What's the fraction of missing values for the pickup location ID? I.e., the fraction of "-1"s after you fill the NAs.

In [93]:
# Number of missing values per column, taken before any filling is applied.
jan.isna().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               958267
DOlocationID               162220
SR_Flag                   1154112
Affiliated_base_number        885
duration                        0
dtype: int64

In [94]:
# Replace missing pickup/dropoff location IDs with the sentinel -1.
# The filled column is assigned back rather than calling
# `jan[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a column selection, is deprecated in pandas 2.x, and leaves `jan` unchanged
# once Copy-on-Write is enabled.
jan['PUlocationID'] = jan['PUlocationID'].fillna(-1)
jan['DOlocationID'] = jan['DOlocationID'].fillna(-1)
# Frequency of each pickup location ID, including the -1 sentinel.
jan['PUlocationID'].value_counts()

-1.0      958267
 221.0      8551
 206.0      7128
 129.0      5455
 92.0       5224
           ...  
 111.0         6
 34.0          5
 27.0          4
 2.0           2
 110.0         1
Name: PUlocationID, Length: 262, dtype: int64

In [98]:
# Fraction of rides whose pickup location is the -1 sentinel (Q3 answer).
# A boolean mean is count(-1) / len(jan), the same value as looking -1 up in
# value_counts(), but it does not rely on an integer key matching a float
# index label and yields 0.0 instead of a KeyError when no -1 is present.
(jan['PUlocationID'] == -1).mean()

0.8303067639882438

# Q4. One-hot encoding
## Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

## Turn the dataframe into a list of dictionaries
## Fit a dictionary vectorizer
## Get a feature matrix from it
## What's the dimensionality of this matrix? (The number of columns).

In [99]:
from sklearn.feature_extraction import DictVectorizer

In [100]:
# Keep only the two location columns used as model features.
df = jan[['PUlocationID', 'DOlocationID']]

In [101]:
# Cast the location IDs to strings so each ID is treated as a category
# rather than a numeric value by the vectorizer.
df = df.astype(str)
# One {column name: value} dict per ride.
X_dict = df.to_dict('records')


In [102]:
# DictVectorizer (same import as in the cell at the start of Q4)
from sklearn.feature_extraction import DictVectorizer
# Create the vectorizer; it is fitted on the January dicts in the next cell
# and passed to prepare_df() later to encode the validation data.
dv_X = DictVectorizer() 


In [103]:
# Fit dv_X on the January dicts and encode them in one step.
X_encoded = dv_X.fit_transform(X_dict)
# Display the matrix summary; the number of columns is the Q4 answer.
X_encoded

<1154112x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2308224 stored elements in Compressed Sparse Row format>

# Q5. Training a model
## Now let's use the feature matrix from the previous step to train a model.

## Train a plain linear regression model with default parameters
## Calculate the RMSE of the model on the training data
## What's the RMSE on train?

In [104]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

# Target: ride duration in minutes, one value per row of X_encoded.
y = jan['duration'].values
lr = LinearRegression()
lr.fit(X_encoded,y)
# Training MSE (the RMSE is its square root, computed in the next cell).
# Arguments follow sklearn's (y_true, y_pred) order, matching the other
# mse() calls in this notebook; MSE is symmetric, so the value is unchanged.
print(mse(y, lr.predict(X_encoded)))

158837.48794601538


In [105]:
# RMSE on the training data: square root of the MSE (Q5 answer).
print(mse(y,lr.predict(X_encoded))**0.5)

398.5442107797018


# Q6. Evaluating the model
## Now let's apply this model to the validation dataset (Feb 2021).

## What's the RMSE on validation?

In [109]:
def prepare_df(df,dv_X):
    """Build the feature matrix and target vector for one month of trips.

    Applies the same preprocessing used for the training data: missing
    pickup/dropoff location IDs become -1, the IDs are cast to strings, and
    each row is turned into a dict that `dv_X` encodes.

    The input frame is not modified: no columns are filled in place and no
    `duration` column is added to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `pickup_datetime`, `dropOff_datetime`, `PUlocationID`
        and `DOlocationID`.
    dv_X : object with a `transform(list_of_dicts)` method
        Only `transform` is called here, so the vectorizer is expected to
        have been fitted beforehand (on the training data).

    Returns
    -------
    X_encoded
        Whatever `dv_X.transform` returns for the row dicts.
    y : numpy.ndarray
        Ride duration in minutes, one value per row of `df`.
    """
    location_cols = ['PUlocationID', 'DOlocationID']

    # Duration in minutes, computed without writing a column back to `df`.
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    y = (elapsed / pd.Timedelta(minutes=1)).values

    # fillna() returns a new frame, so the caller's data stays untouched and
    # the deprecated chained `df[col].fillna(..., inplace=True)` is avoided.
    features = df[location_cols].fillna(-1).astype(str)

    # One {column name: value} dict per ride.
    X_dict = features.to_dict(orient='records')

    X_encoded = dv_X.transform(X_dict)

    return X_encoded, y

In [110]:
# Encode the February data with the vectorizer that was fitted on January.
X_test, y_test = prepare_df(feb,dv_X)

In [111]:
# Display the validation matrix summary (shape and stored-element count).
X_test

<1037692x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2075381 stored elements in Compressed Sparse Row format>

In [112]:
# RMSE on the validation data (Q6 answer). The square root is taken
# explicitly, as in the training RMSE cell, because the `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6.
mse(y_test, lr.predict(X_test)) ** 0.5

161.00634039296506