In [1]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this is the shell's interpreter, which may not be the kernel's — confirm if it matters.
!python --version

Python 3.12.3


In [2]:
# Download the January and February 2023 yellow-taxi trip files into ../data.
# `wget -O` does not create missing parent directories, so make sure the
# target directory exists first (no-op when it is already there).
!mkdir -p ../data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet \
    -O ../data/yellow_tripdata_2023-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet \
    -O ../data/yellow_tripdata_2023-02.parquet

--2025-05-19 10:54:12--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.164.82.197, 3.164.82.112, 3.164.82.160, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.164.82.197|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘../data/yellow_tripdata_2023-01.parquet’


2025-05-19 10:54:17 (9.23 MB/s) - ‘../data/yellow_tripdata_2023-01.parquet’ saved [47673370/47673370]

--2025-05-19 10:54:18--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.164.82.40, 3.164.82.197, 3.164.82.112, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.164.82.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) 

In [3]:
import pandas as pd

In [4]:
# Peek at the January data: first rows, then the dtype of every column.
df = pd.read_parquet("../data/yellow_tripdata_2023-01.parquet")
print(df.head(), df.dtypes, sep="\n")

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2023-01-01 00:32:10   2023-01-01 00:40:36              1.0   
1         2  2023-01-01 00:55:08   2023-01-01 01:01:27              1.0   
2         2  2023-01-01 00:25:04   2023-01-01 00:37:49              1.0   
3         1  2023-01-01 00:03:48   2023-01-01 00:13:25              0.0   
4         2  2023-01-01 00:10:29   2023-01-01 00:21:19              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           0.97         1.0                  N           161           141   
1           1.10         1.0                  N            43           237   
2           2.51         1.0                  N            48           238   
3           1.90         1.0                  N           138             7   
4           1.43         1.0                  N           107            79   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [5]:
# A DataFrame's shape is (rows, columns); the second entry is the column count.
columns_cnt = df.shape[1]
print(f"Columns count: {columns_cnt}")

Columns count: 19


In [6]:
# Trip duration in minutes (drop-off minus pick-up).
# The vectorized `.dt.total_seconds()` accessor converts the whole timedelta
# column at once instead of calling a Python lambda for every row.
df["duration"] = (
    df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
).dt.total_seconds() / 60

In [7]:
# Spread of the trip durations (in minutes) before any outlier filtering.
duration_std = df["duration"].std()
# `.2f` prints two decimal places; a bare `2f` would only set a minimum
# field width of 2 and keep the default six decimals.
print(f"Standard Deviation of the trips durations in Jan: {duration_std:.2f}")

Standard Deviation of the trips durations in Jan: 42.594351


In [8]:
# Keep trips lasting between 1 and 60 minutes; `between` includes both bounds.
filtered_df = df[df["duration"].between(1, 60)]

In [9]:
# Share of rows that survived the duration filter, truncated to a whole percent.
frac = int(filtered_df.shape[0] / df.shape[0] * 100)
print(f"Fraction of the records left: {frac} %")

Fraction of the records left: 98 %


In [10]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Columns used as model inputs: preprocess() casts them to strings and
# onehot_encode() turns them into the feature matrix.
features = ["PULocationID", "DOLocationID"]

In [11]:
def preprocess(filename: str, features: list) -> pd.DataFrame:
    """Load a trip-data parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only the trips lasting between 1 and 60 minutes (both bounds included),
    and casts the ``features`` columns to strings.

    Args:
        filename: Path of the parquet file to read.
        features: Names of the columns to cast to ``str``.

    Returns:
        A new DataFrame holding the filtered rows, with the ``duration``
        column added and the feature columns converted to strings.
    """
    df = pd.read_parquet(filename)

    # Vectorized timedelta -> minutes conversion; avoids a per-row Python call.
    df["duration"] = (
        df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    ).dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice of the
    # unfiltered one (which triggers pandas' SettingWithCopyWarning).
    df = df[df["duration"].between(1, 60)].copy()

    df[features] = df[features].astype(str)

    return df

In [12]:
# Shared vectorizer: onehot_encode() fits it when training=True and reuses
# the fitted instance when training=False.
dv = DictVectorizer()


def onehot_encode(df: pd.DataFrame, features: list, training: bool = True) -> tuple:
    """Vectorize the feature columns with the module-level ``dv``.

    Args:
        df: Frame containing the ``features`` columns and a ``duration`` column.
        features: Names of the columns to pass to the vectorizer.
        training: When true, ``dv`` is fitted on this data and then applied;
            otherwise the already-fitted ``dv`` is only applied.

    Returns:
        A ``(X_data, y_data)`` pair: the vectorizer output and the values of
        the ``duration`` column.
    """
    records = df[features].to_dict(orient="records")

    # Pick the vectorizer method once, then apply it to the row dicts.
    encode = dv.fit_transform if training else dv.transform
    X_data = encode(records)

    y_data = df["duration"].values
    return X_data, y_data

In [13]:
# Shared regressor: every call to fit() re-trains this same instance.
model = LinearRegression()


def fit(x_train, y_train, x_val=None, y_val=None) -> float:
    """Fit the module-level ``model`` and return the RMSE of its predictions.

    The model is always (re)trained on ``x_train`` / ``y_train``. It is then
    scored on the validation pair when one is supplied, and on the training
    data otherwise.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.
        x_val: Optional validation feature matrix.
        y_val: Optional validation targets; must accompany ``x_val``.

    Returns:
        Root mean squared error on the validation data if given, else on the
        training data.

    Raises:
        ValueError: If only one of ``x_val`` / ``y_val`` is provided, since
            predictions and targets would then come from different datasets.
    """
    if (x_val is None) != (y_val is None):
        raise ValueError("x_val and y_val must be provided together")

    model.fit(x_train, y_train)

    # Score on the validation pair when present, otherwise on the training data.
    if x_val is None:
        x_eval, y_eval = x_train, y_train
    else:
        x_eval, y_eval = x_val, y_val

    y_pred = model.predict(x_eval)

    return root_mean_squared_error(y_true=y_eval, y_pred=y_pred)

In [14]:
# January is the training month, February the validation month.
train_df, val_df = (
    preprocess(path, features=features)
    for path in (
        "../data/yellow_tripdata_2023-01.parquet",
        "../data/yellow_tripdata_2023-02.parquet",
    )
)

In [15]:
# Order matters: the training call fits the shared vectorizer `dv`, and the
# validation call (training=False) only applies that fitted vectorizer.
X_train, y_train = onehot_encode(train_df, features=features, training=True)
X_val, y_val = onehot_encode(val_df, features=features, training=False)

In [16]:
# Both matrices come from the same fitted vectorizer.
print(
    f"Dimensionality of training matrix: {X_train.shape}",
    f"Dimensionality of validation matrix: {X_val.shape}",
    sep="\n",
)

Dimensionality of training matrix: (3009173, 515)
Dimensionality of validation matrix: (2855951, 515)


In [17]:
# No validation pair is passed, so fit() scores the model on the training data.
rmse_train = fit(x_train=X_train, y_train=y_train)
print(f"RMSE on Training set: {rmse_train}")

RMSE on Training set: 7.649261822035489


In [18]:
# fit() re-trains on the training data, then scores the validation pair.
rmse_val = fit(x_train=X_train, y_train=y_train, x_val=X_val, y_val=y_val)
print(f"RMSE on Validation set: {rmse_val}")

RMSE on Validation set: 7.811821332387183
