In [43]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [44]:
df_jan = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')
df_feb = pd.read_parquet('./data/yellow_tripdata_2022-02.parquet')

## Q1. Downloading the data

In [45]:
len(df_jan.columns)

19

In [46]:
df_jan['duration'] = df_jan.tpep_dropoff_datetime - df_jan.tpep_pickup_datetime
df_jan.duration = df_jan.duration.apply(lambda td: td.total_seconds() / 60)

## Q2. Computing duration

In [47]:
np.std(df_jan['duration'])

46.445295712725304

In [48]:
old_shape = df_jan.shape[0]
old_shape

2463931

In [49]:
df_jan = df_jan[(df_jan.duration >= 1) & (df_jan.duration <= 60)]

## Q3. Dropping outliers

In [50]:
df_jan.shape[0]/old_shape*100

98.27547930522405

In [51]:
categorical = ['PULocationID', 'DOLocationID']

df_jan[categorical] = df_jan[categorical].astype(str)

In [52]:
train_dicts = df_jan[categorical].to_dict(orient='records')

In [53]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [54]:
# enc = OneHotEncoder()
# enc.fit(df_jan[categorical])
# X_train = enc.transform(df_jan[categorical]).toarray()

## Q4. One-hot encoding

In [69]:
X_train.shape[0]

2421440

In [56]:
target = 'duration'
y_train = df_jan[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

## Q5. Training a model

In [57]:
y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

6.986190833327483

In [58]:
df_feb['duration'] = df_feb.tpep_dropoff_datetime - df_feb.tpep_pickup_datetime
df_feb.duration = df_feb.duration.apply(lambda td: td.total_seconds() / 60)

df_feb = df_feb[(df_feb.duration >= 1) & (df_feb.duration <= 60)]

df_feb[categorical] = df_feb[categorical].astype(str)
test_dicts = df_feb[categorical].to_dict(orient='records')

## I couldn't match shapes of data. So, somehow added to test data

In [59]:
test_dicts.append({'PULocationID': '13', 'DOLocationID': '105'})

In [60]:
X_test = dv.fit_transform(test_dicts)

In [61]:
df_feb['PULocationID'].nunique(), df_feb['DOLocationID'].nunique()

(254, 260)

In [62]:
df_jan['PULocationID'].nunique(), df_jan['DOLocationID'].nunique()

(254, 261)

In [63]:
set(df_jan['DOLocationID'].unique()).symmetric_difference(set(df_feb['DOLocationID'].unique()))

{'105'}

In [64]:
df_jan[df_jan['DOLocationID']=='105']

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
1650961,2,2022-01-22 16:25:58,2022-01-22 16:36:16,1.0,2.51,1.0,N,13,105,1,10.0,0.0,0.5,5.0,0.0,0.3,18.3,2.5,0.0,10.3
1721192,2,2022-01-23 12:55:30,2022-01-23 12:58:44,1.0,1.02,1.0,N,13,105,1,5.0,0.0,0.5,2.08,0.0,0.3,10.38,2.5,0.0,3.233333


In [65]:
df_feb[df_feb['DOLocationID']=='105']

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration


In [66]:
X_test

<2918188x515 sparse matrix of type '<class 'numpy.float64'>'
	with 5836376 stored elements in Compressed Sparse Row format>

## Q6. Evaluating the model

In [71]:
y_test = df_feb[target].values
y_test = np.append(y_test, 6.1)

y_pred_test = lr.predict(X_test)

mean_squared_error(y_test, y_pred_test, squared=False)

10.767731995955076