In [42]:
!pip freeze | grep scikit-learn

scikit-learn==1.2.2


In [1]:
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Print pandas' environment report (Python, OS and dependency versions).
pd.show_versions()




INSTALLED VERSIONS
------------------
commit           : 37ea63d540fd27274cad6585082c91b1283f963d
python           : 3.10.9.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 22.4.0
Version          : Darwin Kernel Version 22.4.0: Mon Mar  6 21:00:17 PST 2023; root:xnu-8796.101.5~3/RELEASE_X86_64
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : None.UTF-8

pandas           : 2.0.1
numpy            : 1.23.5
pytz             : 2022.7
dateutil         : 2.8.2
setuptools       : 67.8.0
pip              : 23.1.2
Cython           : None
pytest           : 7.1.2
hypothesis       : None
sphinx           : 5.0.2
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.9.1
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.1.2
IPython          : 8.10.0
pandas_datareader: None
bs4              : 4.11.1


In [66]:
# Location-id columns that are converted to string labels when the data is read.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    A ``duration`` column (minutes between pickup and dropoff) is added, only
    trips lasting between 1 and 60 minutes inclusive are kept, and the
    ``categorical`` columns are turned into strings, with missing ids
    becoming ``'-1'``.

    Args:
        filename: Path or URL passed straight to ``pd.read_parquet``.

    Returns:
        A new DataFrame holding the surviving rows, original index labels kept.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends by default.
    within_range = trips['duration'].between(1, 60)
    trips = trips[within_range].copy()

    id_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = id_labels

    return trips

In [67]:
def load_model(model_path='model.bin'):
    """Load the pickled ``(dv, model)`` pair used for scoring.

    Args:
        model_path: Path of the pickle file. Defaults to ``'model.bin'``
            (resolved against the current working directory), so existing
            ``load_model()`` calls behave exactly as before.

    Returns:
        The two objects stored in the pickle, in order: ``(dv, model)``.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(model_path, 'rb') as f_in:
        dv, model = pickle.load(f_in)
    return dv, model

In [68]:
def get_input_output_url(year, month):
    """Build the source URL and the output path for one month of trips.

    Args:
        year: Integer year, rendered zero-padded to four digits.
        month: Integer month, rendered zero-padded to two digits.

    Returns:
        ``(input_file, output_file)``: the parquet URL to read and the
        relative parquet path to write.
    """
    source_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
    destination = f'output/yellow/{year:04d}-{month:02d}.parquet'
    return source_url, destination


In [69]:
def apply_model(input_file, output_file):
    """Predict trip durations for the rides stored in ``input_file``.

    The rides are loaded with ``read_data``, the ``categorical`` columns are
    turned into one dict per ride, and those dicts are passed through the
    vectorizer and model returned by ``load_model``.

    Args:
        input_file: Location of the parquet file handed to ``read_data``.
        output_file: Accepted but not used by this function.

    Returns:
        The result of ``model.predict`` on the vectorized rides. The loaded
        DataFrame itself is not returned.
    """
    trips = read_data(input_file)
    ride_records = trips[categorical].to_dict(orient='records')

    vectorizer, regressor = load_model()
    features = vectorizer.transform(ride_records)
    return regressor.predict(features)

In [70]:
# Resolve the source URL and the output path for the chosen period, then
# predict durations for that month's trips.
# NOTE(review): `year` and `month` are not assigned in any cell visible in this
# notebook; they must be defined (as integers) before this cell runs.
input_file, output_file = get_input_output_url(year, month)
y_pred = apply_model(input_file, output_file)

In [71]:
# `apply_model` keeps its DataFrame local, so no notebook-level `df` exists on a
# fresh kernel. Load the same trips here so `df` is defined for this cell and
# the ones below, row-for-row in the order used to produce `y_pred`.
# NOTE: this reads `input_file` a second time.
df = read_data(input_file)
y_val = df.duration.values

In [72]:
def get_matrix(y_val, y_pred):
    """Compute the error metrics of ``y_pred`` against ``y_val``.

    Args:
        y_val: Observed target values.
        y_pred: Predicted values, aligned with ``y_val``.

    Returns:
        ``(rmse, mae)``: root mean squared error and mean absolute error.
    """
    # squared=False asks mean_squared_error for the root of the MSE.
    # NOTE(review): later scikit-learn releases deprecate `squared`; it is
    # valid for the 1.2.2 release this notebook reports.
    return (
        mean_squared_error(y_val, y_pred, squared=False),
        mean_absolute_error(y_val, y_pred),
    )

In [73]:
# Score the predictions against the target values held in `y_val`.
rmse, mae = get_matrix(y_val, y_pred)

In [75]:
# Display the root mean squared error returned by get_matrix.
rmse

7.786409085078911

# Q2. Preparing the output

In [76]:
def write_prediction(df, y_pred, output_file, year=None, month=None):
    """Write one predicted duration per ride to a parquet file.

    Each ride gets an id of the form ``YYYY/MM_<index label>``, built from
    ``year``, ``month`` and the row's label in ``df.index``.

    Args:
        df: Frame whose index labels identify the rides. It is not modified.
        y_pred: Predicted durations, one per row of ``df`` in the same order.
        output_file: Destination handed to ``DataFrame.to_parquet``. For a
            local path, missing parent directories are created first.
        year: Integer year for the ride-id prefix. When omitted, the
            notebook-level ``year`` variable is used.
        month: Integer month for the ride-id prefix. When omitted, the
            notebook-level ``month`` variable is used.

    Raises:
        NameError: If ``year`` or ``month`` is omitted and no notebook-level
            variable of that name exists.
    """
    import os

    def _notebook_value(name):
        # Keeps existing three-argument calls working: they relied on the
        # notebook-level `year` / `month` variables.
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    if year is None:
        year = _notebook_value('year')
    if month is None:
        month = _notebook_value('month')

    # Create the parent directory for local paths only; URLs and file-like
    # objects are handed to to_parquet untouched.
    if isinstance(output_file, (str, os.PathLike)):
        target = os.fspath(output_file)
        if isinstance(target, str) and '://' not in target:
            parent_dir = os.path.dirname(target)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    # Build the ids separately so the caller's frame gains no extra column.
    ride_ids = f'{year:04d}/{month:02d}_' + df.index.astype('str')
    df_result = pd.DataFrame(
        {'ride_id': ride_ids, 'predicted_duration': y_pred},
        index=df.index,
    )

    df_result.to_parquet(
        output_file,
        engine='pyarrow',
        compression=None,
        index=False
    )

In [77]:
# Save the ride ids and their predicted durations to `output_file`.
write_prediction(df, y_pred, output_file)