In [1]:
# To check the installed scikit-learn version, run in a bash terminal: pip freeze | grep scikit-learn

In [2]:
# Import required libraries
import os
import pickle

import pandas as pd

## Q1. Notebook

What is the standard deviation of the predicted durations for the February 2022 data?

In [3]:
# Open trained model
def get_model(model_path):
    """Load the pickled two-element artifact stored at ``model_path``.

    Parameters
    ----------
    model_path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    tuple
        ``(dv, model)`` exactly as unpacked from the pickled object.
        Presumably a fitted vectorizer and a regression model, judging by
        how ``get_predictions`` uses them -- confirm against the training code.

    Raises
    ------
    ValueError
        If the unpickled object does not unpack into exactly two values.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(model_path, 'rb') as model_file:
        artifacts = pickle.load(model_file)
    dv, model = artifacts
    return dv, model

In [4]:
# Function to read and preprocess data for predictions 
def read_data(input_file):
    """Read a trip parquet file and prepare it for prediction.

    Steps:
      1. Compute ``duration`` in minutes as dropoff minus pickup time.
      2. Keep only trips whose duration is between 1 and 60 minutes
         (both bounds inclusive); the original row index is preserved.
      3. Fill missing pickup/dropoff location ids with -1 and convert
         both columns to integer-formatted strings.

    Parameters
    ----------
    input_file : str or path-like
        Parquet file containing ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the added ``duration`` column.
    """
    location_cols = ['PULocationID', 'DOLocationID']

    trips = pd.read_parquet(input_file)

    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60
    within_range = trips.duration.between(1, 60)
    trips = trips[within_range].copy()

    location_ids = trips[location_cols].fillna(-1)
    trips[location_cols] = location_ids.astype('int').astype('str')
    return trips

In [5]:
# Make predictions
def get_predictions(df, model_path):
    """Predict with the pickled model for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``PULocationID`` and ``DOLocationID`` columns.
    model_path : str or path-like
        Passed to ``get_model`` to load the ``(dv, model)`` pair.

    Returns
    -------
    Whatever ``model.predict`` returns for the transformed records
    (presumably one predicted duration per row -- confirm against the model).
    """
    # One {'PULocationID': ..., 'DOLocationID': ...} dict per trip
    records = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')
    dv, model = get_model(model_path)
    features = dv.transform(records)
    return model.predict(features)

In [6]:
# Scoring period and file locations.
# The data directory defaults to the original local path but can be overridden
# with the YELLOW_TRIPDATA_DIR environment variable, so the notebook can run on
# another machine without editing this cell.
month = 2
year = 2022
data_dir = os.environ.get(
    'YELLOW_TRIPDATA_DIR',
    'C:/Users/Camila/OneDrive/Escritorio/mlops zoomcamp/data/yellow',
)
input_file = f'{data_dir}/yellow_tripdata_{year:04d}-{month:02d}.parquet'
model_path = 'model.bin'

In [7]:
# Load the yellow-taxi trips for the configured month (February 2022).
# read_data adds the 'duration' column (minutes) and keeps only trips lasting
# between 1 and 60 minutes, so the displayed frame is already filtered.
df = read_data(input_file)
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2022-02-01 00:06:58,2022-02-01 00:19:24,1.0,5.40,1.0,N,138,252,1,17.00,1.75,0.5,3.90,0.00,0.3,23.45,0.0,1.25,12.433333
1,1,2022-02-01 00:38:22,2022-02-01 00:55:55,1.0,6.40,1.0,N,138,41,2,21.00,1.75,0.5,0.00,6.55,0.3,30.10,0.0,1.25,17.550000
2,1,2022-02-01 00:03:20,2022-02-01 00:26:59,1.0,12.50,1.0,N,138,200,2,35.50,1.75,0.5,0.00,6.55,0.3,44.60,0.0,1.25,23.650000
3,2,2022-02-01 00:08:00,2022-02-01 00:28:05,1.0,9.88,1.0,N,239,200,2,28.00,0.50,0.5,0.00,3.00,0.3,34.80,2.5,0.00,20.083333
4,2,2022-02-01 00:06:48,2022-02-01 00:33:07,1.0,12.16,1.0,N,138,125,1,35.50,0.50,0.5,8.11,0.00,0.3,48.66,2.5,1.25,26.316667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979426,2,2022-02-28 23:50:00,2022-03-01 00:06:00,,3.40,,,163,193,0,14.37,0.00,0.5,0.00,0.00,0.3,17.67,,,16.000000
2979427,2,2022-02-28 23:06:57,2022-02-28 23:19:12,,3.48,,,141,4,0,14.51,0.00,0.5,2.00,0.00,0.3,19.81,,,12.250000
2979428,2,2022-02-28 23:48:13,2022-03-01 00:03:33,,3.05,,,161,151,0,14.38,0.00,0.5,3.81,0.00,0.3,21.49,,,15.333333
2979429,2,2022-02-28 23:56:41,2022-03-01 00:04:57,,2.62,,,141,226,0,12.53,0.00,0.5,1.71,0.00,0.3,17.54,,,8.266667


In [8]:
# Predict durations with the saved model, then report the standard deviation
# of the *predicted* durations (the answer to Q1).
# NOTE(review): if `predictions` is a NumPy array, .std() defaults to the
# population standard deviation (ddof=0) -- confirm this is the intended one.
predictions = get_predictions(df, model_path)
std_trip = predictions.std()
std_trip

5.28140357655334

## Q2. Preparing the output

Write the ride id and the predictions to a dataframe with results. Save it as a parquet file.



In [9]:

def process_results(df, month, year, predictions, output_file=None):
    """Build the ride-id/prediction table and save it as a parquet file.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips that were scored. Its index values are used to build the ids.
        Side effect: a ``ride_id`` column is added to (or overwritten in)
        this frame in place.
    month : int
        Month of the data, formatted as two digits in the id and file name.
    year : int
        Year of the data, formatted as four digits in the id and file name.
    predictions : array-like
        One prediction per row of ``df``, in row order.
    output_file : str or path-like, optional
        Destination parquet file. When omitted, the original hard-coded
        location ``.../HW4/yellow_tripdata_{year}-{month}.parquet`` is used.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``ride_id`` and ``predictions``; it keeps the
        index of ``df`` (the index is not written to the parquet file).
    """
    if output_file is None:
        # Default kept identical to the previous hard-coded destination.
        output_file = f'C:/Users/Camila/OneDrive/Escritorio/mlops zoomcamp/HW4/yellow_tripdata_{year:04d}-{month:02d}.parquet'

    # ride_id looks like '2022/02_<index value>'
    df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

    df_result = pd.DataFrame({'ride_id': df['ride_id']})
    # Assigned separately so predictions are attached the same way as before
    # (positionally for arrays/lists).
    df_result['predictions'] = predictions

    df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)
    return df_result

    

In [10]:
# Build the ride_id/prediction table, write it to parquet, and display it.
# Note: process_results also adds a 'ride_id' column to `df` in place.
results = process_results(df, month, year, predictions)
results

Unnamed: 0,ride_id,predictions
0,2022/02_0,18.527783
1,2022/02_1,23.065782
2,2022/02_2,33.686359
3,2022/02_3,23.757436
4,2022/02_4,21.492904
...,...,...
2979426,2022/02_2979426,12.038225
2979427,2022/02_2979427,11.441569
2979428,2022/02_2979428,11.890459
2979429,2022/02_2979429,15.102681
