In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Q1. Downloading the data

In [4]:
# Training data: the 2023-01 yellow trip-data parquet file, read straight from the CDN.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
df_train = pd.read_parquet(train_url)

In [5]:
# Validation data: the 2023-02 yellow trip-data parquet file, read straight from the CDN.
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
df_val = pd.read_parquet(val_url)

In [10]:
# Number of columns in the raw training frame (second entry of the shape tuple).
df_train.shape[1]

19

# Q2. Computing duration

In [13]:
# Make sure both timestamp columns are datetimes before subtracting them.
df_train['tpep_dropoff_datetime'] = pd.to_datetime(df_train['tpep_dropoff_datetime'])
df_train['tpep_pickup_datetime'] = pd.to_datetime(df_train['tpep_pickup_datetime'])

# Trip duration in minutes (drop-off minus pick-up).
# The vectorised `.dt.total_seconds()` accessor replaces a per-row Python
# lambda, which is needlessly slow on a frame with millions of rows.
df_train['duration'] = (
    df_train['tpep_dropoff_datetime'] - df_train['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [15]:
# Summary statistics of the trip duration (minutes) before any filtering.
df_train['duration'].describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

# Q3. Dropping Outliers

In [16]:
# Share of trips whose duration lies in [1, 60] minutes, bounds inclusive.
# The mean of a boolean mask is the fraction of True entries.
trip_minutes = df_train['duration']
within_bounds = (trip_minutes >= 1) & (trip_minutes <= 60)
within_bounds.mean()

0.9812202822125979

# Q4. One-hot encoding

In [18]:
# Keep only trips lasting between 1 and 60 minutes (inclusive).
# `.copy()` detaches the result from the unfiltered frame so that later
# column assignments on df_train do not raise SettingWithCopyWarning.
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()

In [19]:
# Columns used as categorical features: they are cast to str and fed to
# the DictVectorizer in the cells that follow.
categorical = [
    'PULocationID',
    'DOLocationID',
]

In [None]:
# Cast the ID columns to str -- presumably so DictVectorizer one-hot encodes
# them instead of passing them through as numbers (confirm against sklearn docs).
# `astype` with a column mapping returns a new frame, which avoids assigning
# into a filtered slice (the source of SettingWithCopyWarning).
df_train = df_train.astype({col: str for col in categorical})

In [21]:
# One {column: value} dict per training row, restricted to the categorical columns.
train_dicts = (
    df_train[categorical]
    .to_dict(orient='records')
)

In [22]:
# Fit the vectorizer on the training dicts and encode them in a single pass.
# The fitted `dv` is reused further down to transform the validation dicts,
# so both matrices share the same feature columns.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)

In [23]:
# Number of feature columns the fitted vectorizer produced.
n_features = len(dv.feature_names_)
print(n_features)

515


# Q5. Training a model

In [25]:
# Fit a plain linear regression on the one-hot features to predict duration.
target = 'duration'
y_train = df_train[target].values

lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_train)

# Training RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
mean_squared_error(y_train, y_pred) ** 0.5

7.6492610279057605

# Q6. Evaluating the model

In [27]:
# Prepare the validation frame with the same steps used for the training frame.

# Make sure both timestamp columns are datetimes before subtracting them.
df_val['tpep_dropoff_datetime'] = pd.to_datetime(df_val['tpep_dropoff_datetime'])
df_val['tpep_pickup_datetime'] = pd.to_datetime(df_val['tpep_pickup_datetime'])

# Trip duration in minutes, computed with the vectorised `.dt` accessor
# rather than a per-row Python lambda.
df_val['duration'] = (
    df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']
).dt.total_seconds() / 60

# Keep trips lasting 1-60 minutes (inclusive). `.copy()` detaches the result
# from the unfiltered frame so the column assignment below does not raise
# SettingWithCopyWarning.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()

# Same str cast as applied to the training categorical columns.
df_val[categorical] = df_val[categorical].astype(str)

In [28]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit), so the columns line up with what `lr` was trained on.
val_dicts = df_val[categorical].to_dict(orient='records')
x_val = dv.transform(val_dicts)
y_val = df_val[target].values

y_pred = lr.predict(x_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
mean_squared_error(y_val, y_pred) ** 0.5

7.81183265470218