In [1]:
import os
import uuid
import pickle
import shutil
import os

import pandas as pd

import mlflow
import joblib

import mlflow
# Point the MLflow client at the tracking server at 127.0.0.1:5000 so that
# later mlflow calls in this notebook talk to that server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

import uuid

In [2]:
# Load the May 2025 green trip-data Parquet file from the working directory
# and preview its first rows.
may = pd.read_parquet('./green_tripdata_2025-05.parquet')
may.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee
0,2,2025-05-01 00:17:04,2025-05-01 00:56:06,N,1.0,25,216,1.0,9.34,44.3,...,0.5,0.0,0.0,,1.0,46.8,1.0,1.0,0.0,0.0
1,2,2025-05-01 00:56:16,2025-05-01 01:10:26,N,1.0,160,129,1.0,2.95,16.3,...,0.5,0.0,0.0,,1.0,18.8,2.0,1.0,0.0,0.0
2,1,2025-05-01 00:24:49,2025-05-01 00:42:29,N,1.0,260,179,1.0,3.0,18.4,...,1.5,0.0,0.0,,1.0,20.9,2.0,1.0,0.0,0.0
3,2,2025-05-01 00:27:11,2025-05-01 00:33:21,N,1.0,130,216,1.0,1.61,9.3,...,0.5,0.0,0.0,,1.0,11.8,2.0,1.0,0.0,0.0
4,2,2025-05-01 00:32:59,2025-05-01 00:41:34,N,1.0,244,151,2.0,3.44,15.6,...,0.5,4.52,0.0,,1.0,22.62,1.0,1.0,0.0,0.0


In [3]:
# Narrow the raw frame to the pickup/drop-off timestamps, the two location
# IDs and the trip distance -- the only columns the later cells read.
trip_columns = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'trip_distance',
]
data = may[trip_columns]
 


In [4]:
# Derive each trip's duration in minutes: drop-off minus pickup gives a
# timedelta, which is converted to seconds and divided by 60.
# `assign` returns a new frame, so the frame selected from `may` is not mutated.
elapsed = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']
data = data.assign(duration=elapsed.dt.total_seconds() / 60)


In [5]:
# Remove outliers: keep trips whose duration is within [1, 62] minutes and
# whose trip_distance is within [1, 20] (both bounds inclusive).
valid_trip = data['duration'].between(1, 62) & data['trip_distance'].between(1, 20)

# Take an explicit copy of the filtered rows. Assigning columns on the result
# of a boolean filter is the pattern that triggers pandas'
# SettingWithCopyWarning; copying first makes the assignment below unambiguous.
data = data.loc[valid_trip].copy()

# Cast the location IDs to strings.
# NOTE(review): presumably so the preprocessor treats them as categorical
# values rather than numbers -- confirm against the training pipeline.
location_cols = ['PULocationID', 'DOLocationID']
data[location_cols] = data[location_cols].astype('str')

# Select final columns
data = data[['PULocationID', 'DOLocationID', 'trip_distance', 'duration']]

data.head()

Unnamed: 0,PULocationID,DOLocationID,trip_distance,duration
0,25,216,9.34,39.033333
1,160,129,2.95,14.166667
2,260,179,3.0,17.666667
3,130,216,1.61,6.166667
4,244,151,3.44,8.583333


In [6]:
import os
import shutil
import mlflow
import joblib
import pandas as pd
import uuid

# === Step 1: Identify the MLflow run ===
# One run ID drives both the preprocessor artifact URI and the model URI, so
# the two cannot drift apart when the run is changed.
run_id = "ab7ac267f5e945cba9566b7213e58524"

# === Step 2: Download and load the preprocessor ===
# NOTE(review): the "1" in this URI is the experiment ID segment of the
# artifact location; it must match the experiment that owns `run_id`.
artifact_uri = f"mlflow-artifacts:/1/{run_id}/artifacts/preprocessor/preprocessing.pkl"
target_folder = "./artifacts"
os.makedirs(target_folder, exist_ok=True)

# download_artifacts returns a local path; keep a copy under ./artifacts.
temp_path = mlflow.artifacts.download_artifacts(artifact_uri=artifact_uri)
destination_path = os.path.join(target_folder, "preprocessing.pkl")
shutil.copy(temp_path, destination_path)
print(f"Preprocessor saved to: {destination_path}")

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts from a tracking server you trust.
preprocessor = joblib.load(destination_path)

# === Step 3: Load the model ===
logged_model = f"runs:/{run_id}/nyc-duration-model"
loaded_model = mlflow.pyfunc.load_model(logged_model)

# === Step 4: Transform and predict ===
batch_data = data[['PULocationID', 'DOLocationID', 'trip_distance']]
preprocessed_data = preprocessor.transform(batch_data)
result = loaded_model.predict(pd.DataFrame(preprocessed_data))

# === Step 5: Format the output ===
# Start from a copy of the feature/target columns (keeps the index of `data`)
# instead of assigning several columns into an empty DataFrame.
df_result = data[['PULocationID', 'DOLocationID', 'trip_distance', 'duration']].copy()
df_result['predicted_duration'] = result
df_result['difference'] = df_result['duration'] - df_result['predicted_duration']


def generate_uuids(n):
    """Return a list of ``n`` random UUID4 strings, one per ride."""
    return [str(uuid.uuid4()) for _ in range(n)]


df_result['ride_id'] = generate_uuids(len(df_result))
df_result['model_version'] = run_id

# === Step 6 (optional): Save results ===
#df_result.to_csv("predictions_output.csv", index=False)

# Nothing is written to disk in this cell (the save above is commented out),
# so the message must not claim a file was saved.
print("Batch prediction complete.")


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Preprocessor saved to: ./artifacts/preprocessing.pkl


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Batch prediction complete. Saved to predictions_output.csv


In [7]:
# Preview the first rows of the prediction frame built in the previous cell:
# actual vs. predicted duration, their difference, ride_id and model_version.
df_result.head()

Unnamed: 0,PULocationID,DOLocationID,trip_distance,duration,predicted_duration,difference,ride_id,model_version
0,25,216,9.34,39.033333,29.302652,9.730681,6056a39a-4fa9-445e-8663-d58058bee3fd,ab7ac267f5e945cba9566b7213e58524
1,160,129,2.95,14.166667,17.526724,-3.360057,89469ae6-9f35-4962-91d8-858c7e2d56e8,ab7ac267f5e945cba9566b7213e58524
2,260,179,3.0,17.666667,16.008978,1.657689,b2aff515-8a0d-42cc-a5f3-d4593059915a,ab7ac267f5e945cba9566b7213e58524
3,130,216,1.61,6.166667,10.345655,-4.178989,594be2a0-7836-46b3-9160-c2c591e78285,ab7ac267f5e945cba9566b7213e58524
4,244,151,3.44,8.583333,15.393679,-6.810345,63917892-fb6b-4f29-b15c-ccced10a035c,ab7ac267f5e945cba9566b7213e58524


In [8]:
%%writefile batch_processing.py

import os
import uuid
import argparse
import shutil

import pandas as pd
import joblib
import mlflow


def generate_uuids(n):
    """Return a list of ``n`` freshly generated random UUID4 strings."""
    ride_ids = []
    for _ in range(n):
        ride_ids.append(str(uuid.uuid4()))
    return ride_ids


def main(data_path: str, run_id: str, output_path: str = "predictions_output.csv"):
    """Score trips from a Parquet file with a model logged in MLflow.

    Reads the trips at ``data_path``, derives the trip duration in minutes,
    keeps trips with duration in [1, 62] and trip_distance in [1, 20],
    applies the run's preprocessor and model, and writes the predictions to
    ``output_path`` as CSV.

    Args:
        data_path: Path to the Parquet file. It must contain the columns
            lpep_pickup_datetime, lpep_dropoff_datetime, PULocationID,
            DOLocationID and trip_distance.
        run_id: MLflow run that logged both the ``preprocessor/preprocessing.pkl``
            artifact and the ``nyc-duration-model`` model.
        output_path: Destination CSV file. Defaults to the file name this
            script has always written.
    """
    # Honour MLFLOW_TRACKING_URI when it is set; otherwise fall back to the
    # local tracking server this script was written against.
    tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
    mlflow.set_tracking_uri(tracking_uri)

    # Step 1: Load data
    print(f"[INFO] Reading data from: {data_path}")
    df = pd.read_parquet(data_path)

    df = df[['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']].copy()

    # Duration in minutes: drop-off minus pickup, converted from seconds.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Filter outliers (bounds inclusive). The explicit copy keeps the column
    # assignment below from writing to a slice of the unfiltered frame, which
    # is what triggers pandas' SettingWithCopyWarning.
    keep = df['duration'].between(1, 62) & df['trip_distance'].between(1, 20)
    df = df.loc[keep].copy()

    # Convert location IDs to string
    location_cols = ['PULocationID', 'DOLocationID']
    df[location_cols] = df[location_cols].astype(str)

    df = df[['PULocationID', 'DOLocationID', 'trip_distance', 'duration']]

    # Step 2: Load preprocessor
    # Resolve the artifact through the run itself rather than hard-coding the
    # experiment ID in an mlflow-artifacts:/ path, so a run from any
    # experiment works.
    preprocessor_uri = f"runs:/{run_id}/preprocessor/preprocessing.pkl"
    target_folder = "./artifacts"
    os.makedirs(target_folder, exist_ok=True)

    print("Downloading preprocessor...")
    temp_path = mlflow.artifacts.download_artifacts(artifact_uri=preprocessor_uri)
    destination_path = os.path.join(target_folder, "preprocessing.pkl")
    shutil.copy(temp_path, destination_path)
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only point this at a tracking server you trust.
    preprocessor = joblib.load(destination_path)
    print("Preprocessor loaded.")

    # Step 3: Load model
    logged_model = f"runs:/{run_id}/nyc-duration-model"
    print(f"Loading model from: {logged_model}")
    model = mlflow.pyfunc.load_model(logged_model)

    # Step 4: Transform and predict
    features = df[['PULocationID', 'DOLocationID', 'trip_distance']]
    transformed = preprocessor.transform(features)
    predictions = model.predict(pd.DataFrame(transformed))

    # Step 5: Create result
    df_result = df.copy()
    df_result['predicted_duration'] = predictions
    df_result['difference'] = df_result['duration'] - df_result['predicted_duration']
    df_result['ride_id'] = generate_uuids(len(df_result))
    df_result['model_version'] = run_id

    # Step 6: Save to CSV
    df_result.to_csv(output_path, index=False)
    print(f" Batch prediction complete. Output saved to: {output_path}")


if __name__ == "__main__":
    # Command-line entry point: both options are mandatory and are passed
    # straight through to main().
    cli = argparse.ArgumentParser(description="Batch predictor for ride duration")
    for flag, help_text in (
        ("--data_path", "Path to the Parquet data file"),
        ("--run_id", "MLflow run ID for model and preprocessor"),
    ):
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()

    main(data_path=options.data_path, run_id=options.run_id)


Overwriting batch_processing.py


In [9]:
# Run the script written by the previous cell on the May 2025 file, using the
# same MLflow run ID as the interactive prediction cell above.
!python batch_processing.py \
  --data_path ./green_tripdata_2025-05.parquet \
  --run_id ab7ac267f5e945cba9566b7213e58524


[INFO] Reading data from: ./green_tripdata_2025-05.parquet
Downloading preprocessor...
Downloading artifacts: 100%|█████████████████████| 1/1 [00:00<00:00, 841.72it/s]
Preprocessor loaded.
Loading model from: runs:/ab7ac267f5e945cba9566b7213e58524/nyc-duration-model
Downloading artifacts: 100%|█████████████████████| 7/7 [00:00<00:00, 457.31it/s]
 Batch prediction complete. Output saved to: predictions_output.csv


In [11]:
import pandas as pd
# Read back the CSV written by batch_processing.py and preview its first rows
# as a check that the script produced output.
pd.read_csv('predictions_output.csv').head()

Unnamed: 0,PULocationID,DOLocationID,trip_distance,duration,predicted_duration,difference,ride_id,model_version
0,25,216,9.34,39.033333,29.302652,9.730681,d0d70cb7-dafc-4a35-9b89-8578ff5055db,ab7ac267f5e945cba9566b7213e58524
1,160,129,2.95,14.166667,17.526724,-3.360057,427d3a77-ca2a-4747-8fa2-680c93165aaa,ab7ac267f5e945cba9566b7213e58524
2,260,179,3.0,17.666667,16.008978,1.657689,5f10e18e-cdb3-44b9-a237-8a2d61708186,ab7ac267f5e945cba9566b7213e58524
3,130,216,1.61,6.166667,10.345655,-4.178989,4951f868-6391-4a03-98ea-85f2ca69374d,ab7ac267f5e945cba9566b7213e58524
4,244,151,3.44,8.583333,15.393679,-6.810345,4515559d-10b4-4add-b8c9-aed8fb7ccf04,ab7ac267f5e945cba9566b7213e58524
