In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [2]:
# !wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet

In [3]:
# !wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet -O ../data/fhv_tripdata_2021-02.parquet

In [4]:
df = pd.read_parquet(f'../data/fhv_tripdata_2021-01.parquet')
df.head(3)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013


In [5]:
len(df)

1154112

# Trip Duration

In [6]:
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [7]:
df.head(3)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0


In [8]:
df.duration.mean()

19.167224093791006

# Remove Outliers

In [9]:
df = df[(df.duration >= 1) & (df.duration <= 60)]
len(df)

1109826

# Perc missing values in PUlocationID

In [10]:
len(df[df['PUlocationID'].isna()]) / len(df)

0.8352732770722617

In [11]:
df['PUlocationID'] = df['PUlocationID'].fillna(-1)
df['PUlocationID'].value_counts()

-1.0      927008
 221.0      8330
 206.0      6797
 129.0      5379
 115.0      4082
           ...  
 111.0         5
 27.0          4
 34.0          3
 2.0           2
 110.0         1
Name: PUlocationID, Length: 262, dtype: int64

In [12]:
df['DOlocationID'] = df['DOlocationID'].fillna(-1)
df['DOlocationID'].value_counts()

-1.0      147907
 76.0      26375
 217.0     19488
 265.0     18628
 17.0      18422
           ...  
 27.0         18
 30.0         13
 2.0          11
 105.0         4
 199.0         1
Name: DOlocationID, Length: 263, dtype: int64

In [13]:
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].astype(str)

In [14]:
df['DOlocationID'].value_counts()

-1.0     147907
76.0      26375
217.0     19488
265.0     18628
17.0      18422
          ...  
27.0         18
30.0         13
2.0          11
105.0         4
199.0         1
Name: DOlocationID, Length: 263, dtype: int64

In [15]:
train_dicts = df[['PUlocationID', 'DOlocationID']].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

In [16]:
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train) # use model to predict labels of X_train

mean_squared_error(y_train, y_pred, squared=False)

10.528519426833792

In [18]:
df_feb = pd.read_parquet(f'../data/fhv_tripdata_2021-02.parquet')

In [19]:
def prepare_val_data(df):
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    df['PUlocationID'] = df['PUlocationID'].fillna(-1)
    df['DOlocationID'] = df['DOlocationID'].fillna(-1)

    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [27]:
def get_val_mse(df, dict_vectorizer, lr_model):
    val_dicts = df[['PUlocationID', 'DOlocationID']].to_dict(orient='records')

    # dv = DictVectorizer()
    X_val = dv.transform(train_dicts)

    target = 'duration'
    y_val = df[target].values
    # lr = LinearRegression()
    # lr.fit(X_train, y_train)

    y_pred = lr_model.predict(X_val) # use model to predict labels of X_train

    print(y_va
    mse = mean_squared_error(y_val, y_pred, squared=False)
    return mse

In [22]:
df_feb = prepare_val_data(df_feb)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PUlocationID'] = df['PUlocationID'].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOlocationID'] = df['DOlocationID'].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)


In [28]:
get_val_mse(df_feb, dv, lr)

ValueError: Found input variables with inconsistent numbers of samples: [990113, 1109826]