In [1]:
!python -V

Python 3.9.9


In [66]:
import pandas as pd

In [67]:
import pickle

In [68]:
import seaborn as sns
import matplotlib.pyplot as plt

In [69]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## Q1: Downloading the data

In [70]:
# Read the data for January. How many records are there?

df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')

print(df.shape)


# numerical = ['trip_distance']

# df[categorical] = df[categorical].astype(str)

(1154112, 7)


In [71]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [72]:
## Q2: Computing duration

In [73]:
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

In [74]:
# What's the average trip duration in January?
print(df['duration'].mean())

19.167224093791006


In [75]:
num_records_before_filtering = len(df)
df = df[(df.duration >= 1) & (df.duration <= 60)]
num_records_after_filtering = len(df)

#How many records did you drop?
print(num_records_before_filtering - num_records_after_filtering)

44286


## Q3 missing values

In [76]:
df.fillna(-1, inplace=True)

In [77]:
len(df[df['PUlocationID'] == -1])/ len(df) 

0.8352732770722617

## Q4 one-hot encoding

In [78]:
df.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [79]:
categorical = ['PUlocationID', 'DOlocationID']

df[categorical] = df[categorical].astype(str)

train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [80]:
print(X_train.shape)

(1109826, 525)


## Q5

In [81]:
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

10.528519107212144

## Q6

In [82]:
def read_dataframe(filename):
    if filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
        
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

    df.fillna(-1, inplace=True)

    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [83]:
df_train = read_dataframe('./data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')

In [84]:
dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [85]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [86]:
# lr = LinearRegression()
# lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

160.985574812224

In [None]:
sns.distplot(y_pred, label='prediction')
sns.distplot(y_train, label='actual')

plt.legend()

