In [1]:
# Show the installed scikit-learn version; the pickled model loaded later in this
# notebook is sensitive to it (see scikit-learn's model-persistence notes).
!pip freeze | grep scikit-learn

scikit-learn==1.2.2


In [2]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.10.12


In [3]:
import os
import pickle

import numpy as np
import pandas as pd

In [4]:
# Load the (dv, model) pair stored in the pickle file. `dv` is later used via
# `.transform` on a list of dicts (presumably a DictVectorizer — confirm) and
# `model` via `.predict`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only load
# model files that come from a trusted source.
# NOTE(review): '/content/model.bin' is a hard-coded absolute path; confirm it
# exists in the environment where this notebook is run.
with open('/content/model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Columns treated as categorical features; read_data converts them to strings.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip-data parquet file and prepare it for scoring.

    Steps:
      1. Read the parquet file at ``filename`` with pandas.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
      4. For each column in ``categorical``: fill missing values with -1, cast to
         int, then to str.

    Args:
        filename: Path or URL accepted by ``pd.read_parquet``.

    Returns:
        A new DataFrame containing the filtered rows. The original row index is
        kept (it is not reset), so it may contain gaps.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta first, then expressed in minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends by default: 1 <= duration <= 60.
    within_range = trips.duration.between(1, 60)
    trips = trips[within_range].copy()

    # Missing location IDs become the string '-1'.
    as_strings = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_strings

    return trips

In [6]:
# Year and month of the trip-data file to score. The URL is derived from these
# values so it cannot drift from the ride_id prefix, which is built from the
# same year/month further down in the notebook.
year = 2023
month = 3
input_file = (
    'https://d37ci6vzurychx.cloudfront.net/trip-data/'
    f'yellow_tripdata_{year:04d}-{month:02d}.parquet'
)
df = read_data(input_file)

In [7]:
# Display the filtered trips; for a large frame pandas renders only the first
# and last rows.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.00,1.0,N,238,42,2,8.60,1.0,0.5,0.00,0.0,1.0,11.10,0.0,0.00,10.000000
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.40,1.0,N,138,231,1,52.70,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.30,1.0,N,140,186,1,18.40,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.00,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.90,1.0,N,140,43,1,15.60,3.5,0.5,4.10,0.0,1.0,24.70,2.5,0.00,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.20,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.00,3.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,12.13,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,40.92,0.0,0.5,8.98,0.0,1.0,53.90,,,39.366667
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,24.02,0.0,0.5,0.00,0.0,1.0,28.02,,,23.133333
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,8.51,0.0,0.5,3.50,0.0,1.0,16.01,,,8.083333


In [8]:
# Turn the categorical columns into one dict per row, vectorize those dicts with
# the loaded `dv`, and run the loaded `model` on the resulting feature matrix.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [9]:
# Spread of the predicted durations. np.std uses ddof=0 by default, i.e. the
# population standard deviation. Units are presumably minutes, matching the
# `duration` column — confirm against how the model was trained.
std_dev_predicted_durations = np.std(y_pred)
print(f'Standard Deviation of Predicted Durations: {std_dev_predicted_durations}')

Standard Deviation of Predicted Durations: 6.247488852238703


In [10]:
# Create the ride_id column, formatted as "<YYYY>/<MM>_<row index>".
# The row index is the one left after the duration filter in read_data (it is
# not reset there), so the numeric suffixes are not guaranteed to be contiguous.
# Note: this adds the column to `df` in place.
year = 2023
month = 3
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [11]:
# Build the output frame: one row per trip, holding its ride_id and the
# corresponding prediction (same row order and index as `df`).
df_result = df[['ride_id']].assign(predictions=y_pred)

In [12]:
# Display the ride_id / prediction pairs.
df_result

Unnamed: 0,ride_id,predictions
0,2023/03_0,16.245906
1,2023/03_1,26.134796
2,2023/03_2,11.884264
3,2023/03_3,11.997720
4,2023/03_4,10.234486
...,...,...
3403761,2023/03_3403761,11.952635
3403762,2023/03_3403762,20.049958
3403763,2023/03_3403763,11.595336
3403764,2023/03_3403764,13.113178


In [13]:
# Write the predictions to an uncompressed parquet file via pyarrow; the
# DataFrame index is not stored.
output_file = 'df_result.parquet'
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

In [14]:
# Report the on-disk size of the written parquet file, converting bytes to
# mebibytes (bytes / 1024**2) and printing with one decimal place.
file_size = os.path.getsize(output_file)
print(f"Size of the output file: {file_size / (1024 * 1024):.1f}M")

Size of the output file: 65.5M
