In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the yellow-taxi trip records: January 2022 into df1, February 2022 into df2.
df1, df2 = map(
    pd.read_parquet,
    ('yellow_tripdata_2022-01.parquet', 'yellow_tripdata_2022-02.parquet'),
)

In [3]:
#Q1: checking number of columns
df1

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.80,1.0,N,142,236,1,14.50,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.10,1.0,N,236,42,1,8.00,0.5,0.5,4.00,0.0,0.3,13.30,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.50,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.00,0.5,0.5,0.00,0.0,0.3,11.80,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.30,1.0,N,68,163,1,23.50,0.5,0.5,3.00,0.0,0.3,30.30,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2463926,2,2022-01-31 23:36:53,2022-01-31 23:42:51,,1.32,,,90,170,0,8.00,0.0,0.5,2.39,0.0,0.3,13.69,,
2463927,2,2022-01-31 23:44:22,2022-01-31 23:55:01,,4.19,,,107,75,0,16.80,0.0,0.5,4.35,0.0,0.3,24.45,,
2463928,2,2022-01-31 23:39:00,2022-01-31 23:50:00,,2.10,,,113,246,0,11.22,0.0,0.5,2.00,0.0,0.3,16.52,,
2463929,2,2022-01-31 23:36:42,2022-01-31 23:48:45,,2.92,,,148,164,0,12.40,0.0,0.5,0.00,0.0,0.3,15.70,,


In [4]:
#calculation of trip duration in minutes
# Add a 'dur_min' column (drop-off minus pick-up, in minutes) to both months.
# The vectorised .dt accessor converts the whole timedelta column at once
# instead of invoking a Python lambda for every row.
for trips in (df1, df2):
    trips['dur_min'] = (
        trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    ).dt.total_seconds() / 60

In [5]:
#Q2: standard deviation of trip duration (minutes) in the Jan 2022 Yellow Taxi Trip data
# NumPy's default ddof=0 applies, i.e. this is the population standard deviation.
df1['dur_min'].to_numpy().std()

46.445295712725304

In [6]:
# Trips shorter than 1 minute or longer than 60 minutes are treated as outliers.
lower_threshold = 1.0
upper_threshold = 60.0

# Create bool flags for outliers (boolean Series aligned to df1's index).
# Selecting rows with a mask is correct for any index, whereas passing
# np.where() positions to drop(index=...) only works when the index labels
# happen to equal the row positions.
upper_out = df1['dur_min'] > upper_threshold
lower_out = df1['dur_min'] < lower_threshold

# Keep the original df1 untouched and build the filtered frame as an
# independent copy. Rows whose duration is NaN fail both comparisons and are
# therefore kept, exactly as before.
df1_without_out = df1.loc[~(upper_out | lower_out)].copy()

In [7]:
#Q3: fraction of the January records left after the outliers were dropped
df1_without_out.shape[0] / df1.shape[0]

0.9827547930522406

In [8]:
# Build one-hot ("dummy") feature matrices from the pickup / drop-off zone IDs.
from sklearn.feature_extraction import DictVectorizer

cat = ['PULocationID', 'DOLocationID']

# Cast the zone IDs to strings in both frames so the vectorizer treats them
# as categories rather than numeric values.
for frame in (df1_without_out, df2):
    frame[cat] = frame[cat].astype(str)

# One {column: value} dict per trip.
train_dict = df1_without_out[cat].to_dict(orient = 'records')
test_dict = df2[cat].to_dict(orient = 'records')

# Fit the vocabulary on the training rows only, then reuse it for df2.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_test = dv.transform(test_dict)

In [9]:
#Q4: check dimension of the matrix
# The sparse-matrix repr reports rows x columns; the column count is the
# number of features produced by the vectorizer.
X_train

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

In [10]:
# Feature matrix for df2, built with the vectorizer fitted on the training rows.
X_test

<2979431x515 sparse matrix of type '<class 'numpy.float64'>'
	with 5958853 stored elements in Compressed Sparse Row format>

In [11]:
# Target variable: trip duration in minutes, extracted as NumPy arrays
# (training rows from the filtered January frame, test rows from df2).
target = 'dur_min'
y_train, y_test = (
    frame[target].to_numpy() for frame in (df1_without_out, df2)
)

In [12]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

In [13]:
#train linear regression model
# Fits in place on the one-hot training matrix and the duration target.
model.fit(X_train, y_train)

LinearRegression()

In [14]:
#prediction on train dataset for RMSE calculation
predictions_train = model.predict(X_train)
#Q5: calculate RMSE for train data set
# Take the square root of the MSE explicitly: the `squared=False` shortcut is
# deprecated since scikit-learn 1.4 and removed in 1.6, while this form works
# on every version and gives the same value for a single-output target.
np.sqrt(metrics.mean_squared_error(y_train, predictions_train))

6.986191076063921

In [15]:
#prediction on validation dataset for RMSE calculation
predictions_test = model.predict(X_test)
#Q6: calculate RMSE for validation data set
# NOTE(review): in the cells shown, df2 is never passed through the
# 1-60 minute duration filter that was applied to the training data, so this
# score includes outlier trips. Confirm whether the validation set should be
# filtered the same way before trusting this number.
# The square root of the MSE is taken explicitly because `squared=False` is
# deprecated since scikit-learn 1.4 and removed in 1.6.
np.sqrt(metrics.mean_squared_error(y_test, predictions_test))

46.87727738698599