# Baseline model for batch monitoring 

In [2]:
import pandas as pd 
import requests 
import datetime
import pyarrow
from joblib import load, dump
from tqdm import tqdm
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from evidently.metrics import ColumnQuantileMetric
from evidently.report import Report

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.feature_extraction import DictVectorizer

In [3]:
# Load the raw dataset (no preprocessing!)
# The parquet file is read straight from the URL below on every run; judging
# by the file name it holds the green-taxi trip records for March 2024.
url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet'
march_data_2024 = pd.read_parquet(url)


## Q1: Shape of the downloaded data

In [4]:
# Q1 answer: (rows, columns) of the frame exactly as it was loaded.
march_data_2024.shape

(57457, 20)

## Q2: What metric did you choose?

I chose to monitor the median (50th percentile) of the fare_amount column using the ColumnQuantileMetric.

## Q3: What is the maximum value of metric quantile = 0.5 on the "fare_amount" column during March 2024 (calculated daily)?

In [6]:
# Daily median (quantile = 0.5) of fare_amount for March 2024, computed with
# Evidently's ColumnQuantileMetric, followed by the day on which it peaks.

# Month being monitored; the values match the printed labels further down.
report_year = 2024
report_month = 3

# Extract pickup date (for grouping). The column is added to the loaded frame
# so that it can serve as the groupby key.
pickup_ts = pd.to_datetime(march_data_2024['lpep_pickup_datetime'])
march_data_2024['lpep_pickup_date'] = pickup_ts.dt.date

# Keep only trips picked up inside the monitored month *before* building any
# report, so no Evidently report is computed for days that would be thrown
# away afterwards. Rows with a missing pickup timestamp compare False here.
in_month = (pickup_ts.dt.year == report_year) & (pickup_ts.dt.month == report_month)
march_trips = march_data_2024.loc[in_month]

# Store daily median fare_amount values as (date string, median) pairs
daily_medians = []
# Days whose report did not expose the expected result keys
skipped_days = []

for day, day_df in march_trips.groupby('lpep_pickup_date'):
    report = Report(metrics=[
        ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)
    ])
    # The same frame is passed as reference and current data; only the
    # "current" value is read from the result below.
    report.run(reference_data=day_df, current_data=day_df)

    result = report.as_dict()

    try:
        median_value = result["metrics"][0]["result"]["current"]["value"]
    except KeyError:
        # Best effort: keep going, but remember the day instead of hiding it.
        skipped_days.append(str(day))
        continue
    daily_medians.append((str(day), median_value))

if skipped_days:
    print("Skipped days with no quantile result:", ", ".join(skipped_days))

# Convert to DataFrame with a proper datetime 'date' column
median_df = pd.DataFrame(daily_medians, columns=["date", "median_fare"])
median_df['date'] = pd.to_datetime(median_df['date'])

# idxmax() on an empty column fails with an unhelpful message; fail clearly.
if median_df.empty:
    raise ValueError("No daily median fare_amount values were computed for March 2024")

# Find the date with the highest median fare in March
max_day = median_df.loc[median_df['median_fare'].idxmax()]

print(" Max median fare in March 2024:", max_day['date'].date())
print(" Max median fare_amount:", max_day['median_fare'])

 Max median fare in March 2024: 2024-03-03
 Max median fare_amount: 14.2


## Q4: Where to place a dashboard config file?

- `project_folder/config` (here: `05-monitoring/config`)