In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
!ls

for-hire_duration.ipynb


In [3]:
df = pd.read_parquet('fhv_tripdata_2021-01.parquet')

df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [4]:
df.shape

(1154112, 7)

In [5]:
df.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
dtype: object

In [6]:
df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
df['duration'] = df['duration'].apply(lambda x : x.total_seconds() / 60)

df['duration'].mean()

19.1672240937939

In [7]:
df['duration'].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [8]:
#remove outliers
df_clean = df[(df['duration'] >=1) & (df['duration']<=60)]


In [9]:
#number of trips with a duration <1 min or >60min
df.shape[0] - df_clean.shape[0]

44286

In [10]:
df_clean.isna().sum()/df_clean.shape[0]

dispatching_base_num      0.000000
pickup_datetime           0.000000
dropOff_datetime          0.000000
PUlocationID              0.835273
DOlocationID              0.133270
SR_Flag                   1.000000
Affiliated_base_number    0.000697
duration                  0.000000
dtype: float64

In [12]:
df_clean[['PUlocationID', 'DOlocationID']] = df_clean[['PUlocationID', 'DOlocationID']].fillna(value=-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[['PUlocationID', 'DOlocationID']] = df_clean[['PUlocationID', 'DOlocationID']].fillna(value=-1)


In [13]:
df_clean.isna().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID                    0
DOlocationID                    0
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [14]:
df_clean[['PUlocationID', 'DOlocationID']] = df_clean[['PUlocationID', 'DOlocationID']].astype(str)
df_clean.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[['PUlocationID', 'DOlocationID']] = df_clean[['PUlocationID', 'DOlocationID']].astype(str)


dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                      object
DOlocationID                      object
SR_Flag                           object
Affiliated_base_number            object
duration                         float64
dtype: object

In [15]:
train_dicts = df_clean[['PUlocationID', 'DOlocationID']].to_dict(orient='records')
dv = DictVectorizer()

X_train = dv.fit_transform(train_dicts)
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [16]:
y_train = df_clean['duration'].values

In [17]:
y_train

array([17.        , 17.        ,  8.28333333, ..., 16.2       ,
       19.43333333, 36.        ])

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [19]:
y_pred = lr.predict(X_train)

In [20]:
mean_squared_error(y_train, y_pred, squared = False)

10.5285191072072

In [21]:
def read_df(filename):
    df = pd.read_parquet(filename)
    df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = df['duration'].apply(lambda x : x.total_seconds() / 60)
    
    df = df[(df['duration']>=1) & (df['duration']<=60)]
    
    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].fillna(value=-1)
    
    df[categorical] = df[categorical].astype(str)
    
    return df

In [22]:
df_train = read_df('fhv_tripdata_2021-01.parquet')
df_val = read_df('fhv_tripdata_2021-02.parquet')

In [23]:
val_dicts = df_val[['PUlocationID', 'DOlocationID']].to_dict(orient='records')


X_val = dv.transform(val_dicts)
X_val

<990113x525 sparse matrix of type '<class 'numpy.float64'>'
	with 1980223 stored elements in Compressed Sparse Row format>

In [24]:
y_val = df_val['duration'].values

In [25]:
y_pred_val = lr.predict(X_val)

In [26]:
mean_squared_error(y_val, y_pred_val, squared = False)

11.014283163400654