In [17]:
!pip freeze | grep scikit-learndd

In [18]:
# Print the version of whichever `python` executable the shell resolves.
!python -V

Python 3.9.19


In [19]:
import pickle
import pandas as pd
import numpy as np
import os

In [20]:
# Unpickle the (dv, model) pair stored in model.bin; both objects are used by
# the scoring cell (dv.transform, then model.predict).
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load model.bin when it comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [21]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename, year=2023, month=2):
    """Load one parquet file of taxi trips and prepare it for scoring.

    Parameters
    ----------
    filename : str
        Path or URL of the parquet file; passed straight to ``pd.read_parquet``.
    year : int, optional
        Year used in the ``ride_id`` prefix. Defaults to 2023, the value that
        used to be hard-coded, so existing calls behave as before.
    month : int, optional
        Month used in the ``ride_id`` prefix. Defaults to 2 for the same
        reason. Pass the period of the file actually being read so the ids
        match the data.

    Returns
    -------
    pandas.DataFrame
        Only the trips whose duration is between 1 and 60 minutes (inclusive),
        with two added columns: ``duration`` (minutes) and ``ride_id``
        (``'YYYY/MM_<index label>'``). The columns listed in ``categorical``
        are converted to strings, with missing values encoded as ``'-1'``.
    """
    df = pd.read_parquet(filename)

    # Trip length in minutes, derived from the pickup/dropoff timestamps.
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    # .copy() so the column assignments below operate on an independent frame.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

    # The index is not reset after filtering, so each id keeps the index label
    # the row had when the file was read.
    df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

    return df

In [31]:
# NOTE(review): this URL points at the 2023-04 file, yet read_data() as called
# here stamps every ride_id with the 2023/02 prefix, and the outputs recorded
# further down show 2023-02 pickups. The execution count (31) is also later
# than the cells that follow, so this cell looks like it was re-run out of
# order — confirm which month is intended.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet')

In [23]:
# One dict per ride, holding only the categorical columns.
dicts = df[categorical].to_dict(orient='records')
# Encode the dicts with the unpickled dv, then score them with the unpickled model.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [24]:
# Spread of the predictions; np.std uses the population formula (ddof=0) by default.
std_dev = np.std(y_pred)
print(f'What is the standard deviation of the predicted duration for this dataset? {std_dev:.2f}')

What is the standard deviation of the predicted duration for this dataset? 6.15


In [25]:

# read_data() already attaches ride_id, so check that it is present instead of
# rebuilding it here with a second copy of the hard-coded prefix.
assert 'ride_id' in df.columns

In [26]:
# Display the prepared frame (the rendering is truncated to the first and last rows).
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.30,1.0,N,142,163,2,...,3.50,0.5,0.00,0.0,1.0,9.40,2.5,0.00,1.683333,2023/02_0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.80,1.0,N,132,26,1,...,2.25,0.5,0.00,0.0,1.0,74.65,0.0,1.25,32.083333,2023/02_3
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,...,1.00,0.5,3.30,0.0,1.0,25.30,2.5,0.00,13.300000,2023/02_4
5,1,2023-02-01 00:52:40,2023-02-01 01:07:18,1.0,5.10,1.0,N,148,236,1,...,3.50,0.5,5.35,0.0,1.0,32.25,2.5,0.00,14.633333,2023/02_5
6,1,2023-02-01 00:12:39,2023-02-01 00:40:36,1.0,8.90,1.0,N,137,244,1,...,3.50,0.5,3.50,0.0,1.0,50.00,2.5,0.00,27.950000,2023/02_6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913950,2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,...,0.00,0.5,4.84,0.0,1.0,29.06,,,19.000000,2023/02_2913950
2913951,2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,...,0.00,0.5,2.65,0.0,1.0,20.31,,,11.133333,2023/02_2913951
2913952,2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,...,0.00,0.5,0.00,0.0,1.0,21.64,,,14.000000,2023/02_2913952
2913953,2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,...,0.00,0.5,2.63,0.0,1.0,20.19,,,7.000000,2023/02_2913953


In [27]:
# Keep only the two columns that get written out, on the same index as df.
# NOTE(review): 'duration' is the observed trip duration; the y_pred array
# computed earlier is not stored here — confirm whether the predictions were
# meant to be saved instead.
df_result = df[['duration', 'ride_id']].copy()
df_result

Unnamed: 0,duration,ride_id
0,1.683333,2023/02_0
3,32.083333,2023/02_3
4,13.300000,2023/02_4
5,14.633333,2023/02_5
6,27.950000,2023/02_6
...,...,...
2913950,19.000000,2023/02_2913950
2913951,11.133333,2023/02_2913951
2913952,14.000000,2023/02_2913952
2913953,7.000000,2023/02_2913953


In [28]:
# Write the result frame uncompressed via pyarrow, leaving the index out of the file.
df_result.to_parquet('output.parquet', engine='pyarrow', compression=None, index=False)

In [29]:
# Number of rows that were written out.
df_result.shape[0]

2855951

In [30]:

# Report the on-disk size of the parquet file.
file_path = './output.parquet'
file_size_bytes = os.path.getsize(file_path)

# Express the byte count in 1024-based units; only the MB figure is printed.
file_size_kb, file_size_mb, file_size_gb = (
    file_size_bytes / 1024 ** power for power in (1, 2, 3)
)

print(f'Size of the output.parquet file is: {file_size_mb:.2f} MB')

Size of the output.parquet file is: 55.22 MB


Now let's turn the notebook into a script.

Which command do you need to execute for that?

jupyter nbconvert --to script starter.ipynb


What's the first hash for the Scikit-Learn dependency?

sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c

What's the mean predicted duration?

14.29
