# Batch Monitoring Example baseline model

In [1]:
import requests
import pandas as pd
import datetime
from tqdm import tqdm
from joblib import load, dump

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
files = [
    ("green_tripdata_2022-02.parquet", "./data"),
    ("green_tripdata_2022-01.parquet", "./data"),
]

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp = requests.get(url, stream=True)
    save_path = f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(
            resp.iter_content(),
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=int(resp.headers["Content-Length"]),
        ):
            handle.write(data)

Download files:


green_tripdata_2022-02.parquet: 100%|██████████| 1428262/1428262 [00:06<00:00, 226700.52it/s, save to ./data/green_tripdata_2022-02.parquet]
green_tripdata_2022-01.parquet: 100%|██████████| 1254291/1254291 [00:03<00:00, 358365.05it/s, save to ./data/green_tripdata_2022-01.parquet]


In [3]:
jan_data = pd.read_parquet("./data/green_tripdata_2022-01.parquet")
jan_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2022-01-01 00:14:21,2022-01-01 00:15:33,N,1.0,42,42,1.0,0.44,3.5,0.5,0.5,0.0,0.0,,0.3,4.8,2.0,1.0,0.0
1,1,2022-01-01 00:20:55,2022-01-01 00:29:38,N,1.0,116,41,1.0,2.1,9.5,0.5,0.5,0.0,0.0,,0.3,10.8,2.0,1.0,0.0
2,1,2022-01-01 00:57:02,2022-01-01 01:13:14,N,1.0,41,140,1.0,3.7,14.5,3.25,0.5,4.6,0.0,,0.3,23.15,1.0,1.0,2.75
3,2,2022-01-01 00:07:42,2022-01-01 00:15:57,N,1.0,181,181,1.0,1.69,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2022-01-01 00:07:50,2022-01-01 00:28:52,N,1.0,33,170,1.0,6.26,22.0,0.5,0.5,5.21,0.0,,0.3,31.26,1.0,1.0,2.75


In [8]:
jan_data["duration"] = (
    jan_data["lpep_dropoff_datetime"] - jan_data["lpep_pickup_datetime"]
)
jan_data["duration"] = jan_data["duration"].apply(
    lambda x: float(f"{x.total_seconds() / 60:.2f}")
)
jan_data = jan_data[(jan_data["duration"] >= 1) & (jan_data["duration"] <= 60)]
jan_data.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_data['duration'] = jan_data['lpep_dropoff_datetime'] - jan_data['lpep_pickup_datetime']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_data['duration'] = jan_data['duration'].apply(lambda x: float(f'{x.total_seconds() / 60:.2f}'))


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
19589,1,2022-01-11 21:06:14,2022-01-11 21:34:04,N,1.0,106,91,1.0,0.0,25.2,...,0.5,0.0,0.0,,0.3,26.0,1.0,1.0,0.0,27.83
44318,2,2022-01-25 07:33:17,2022-01-25 07:44:39,N,1.0,42,69,1.0,2.16,10.0,...,0.5,0.0,0.0,,0.3,10.8,2.0,1.0,0.0,11.37
39035,2,2022-01-22 03:27:07,2022-01-22 03:34:30,N,5.0,223,129,1.0,2.72,20.0,...,0.0,4.06,0.0,,0.3,24.36,1.0,2.0,0.0,7.38
32617,2,2022-01-19 08:39:07,2022-01-19 08:45:30,N,1.0,41,75,1.0,0.87,6.0,...,0.5,1.0,0.0,,0.3,7.8,1.0,1.0,0.0,6.38
54963,2,2022-01-31 12:18:33,2022-01-31 12:51:55,N,1.0,52,49,1.0,2.28,19.5,...,0.5,0.0,0.0,,0.3,20.3,2.0,1.0,0.0,33.37
