In [1]:
import os
import uuid
import pickle
import shutil
import os

import pandas as pd

import mlflow
import joblib
from xgboost import XGBRegressor


import uuid

In [2]:
# Load the green_tripdata_2025-05 Parquet file from the notebook's directory
# and preview its first rows.
may = pd.read_parquet('./green_tripdata_2025-05.parquet')
may.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee
0,2,2025-05-01 00:17:04,2025-05-01 00:56:06,N,1.0,25,216,1.0,9.34,44.3,...,0.5,0.0,0.0,,1.0,46.8,1.0,1.0,0.0,0.0
1,2,2025-05-01 00:56:16,2025-05-01 01:10:26,N,1.0,160,129,1.0,2.95,16.3,...,0.5,0.0,0.0,,1.0,18.8,2.0,1.0,0.0,0.0
2,1,2025-05-01 00:24:49,2025-05-01 00:42:29,N,1.0,260,179,1.0,3.0,18.4,...,1.5,0.0,0.0,,1.0,20.9,2.0,1.0,0.0,0.0
3,2,2025-05-01 00:27:11,2025-05-01 00:33:21,N,1.0,130,216,1.0,1.61,9.3,...,0.5,0.0,0.0,,1.0,11.8,2.0,1.0,0.0,0.0
4,2,2025-05-01 00:32:59,2025-05-01 00:41:34,N,1.0,244,151,2.0,3.44,15.6,...,0.5,4.52,0.0,,1.0,22.62,1.0,1.0,0.0,0.0


In [3]:
# Narrow the raw trips down to the columns used in the rest of the notebook:
# the two timestamps (for the duration), both location IDs, and the
# distance / fare / passenger fields.
data = may.loc[:, [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'trip_distance',
    'fare_amount',
    "total_amount",
    "passenger_count",
]]



In [4]:
# Work on an independent copy so adding a column never writes into a slice of `may`.
data = data.copy()

# Trip duration in minutes: (drop-off - pick-up) as seconds, divided by 60.
data['duration'] = (
    (data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']).dt.total_seconds() / 60
)


In [5]:
# Remove outliers, cast the location IDs and keep the final columns in one chain.
#
# * Filter: keep trips with duration in [0, 60] minutes, trip_distance in [0, 20]
#   and passenger_count in [0, 5] (bounds inclusive). Rows with a missing value in
#   any of these columns do not satisfy the comparison and are dropped as well.
# * Cast: the location IDs become strings (not the pandas `category` dtype) —
#   presumably so the saved preprocessor treats them as labels; confirm against
#   the training pipeline.
# * The chain builds a new frame instead of assigning columns into a filtered
#   slice, which is the pattern that triggers pandas' SettingWithCopyWarning.
data = (
    data
    .loc[lambda trips: trips['duration'].between(0, 60)
         & trips['trip_distance'].between(0, 20)
         & trips['passenger_count'].between(0, 5)]
    .astype({'PULocationID': 'str', 'DOLocationID': 'str'})
    .loc[:, ["passenger_count", 'trip_distance', 'fare_amount', "total_amount",
             'PULocationID', 'DOLocationID', 'duration']]
)

data.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,PULocationID,DOLocationID,duration
0,1.0,9.34,44.3,46.8,25,216,39.033333
1,1.0,2.95,16.3,18.8,160,129,14.166667
2,1.0,3.0,18.4,20.9,260,179,17.666667
3,1.0,1.61,9.3,11.8,130,216,6.166667
4,2.0,3.44,15.6,22.62,244,151,8.583333


In [6]:
# Batch prediction inside the notebook.
# `uuid`, `joblib`, `pd` and `XGBRegressor` come from the import cell at the top of
# the notebook, so they are not re-imported here.

# --- load preprocessor and model ---
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
preprocessor = joblib.load("preprocessing.pkl")

model = XGBRegressor()
model.load_model("my_model.ubj")   # loads IN-PLACE; do not assign the return value

# --- prepare features ---
FEATURES = ["passenger_count","trip_distance","fare_amount","total_amount","PULocationID","DOLocationID"]
batch_data = data[FEATURES]  # `data` is the cleaned frame produced by the cells above

# --- transform & predict ---
X = preprocessor.transform(batch_data)   # passed to the model exactly as the preprocessor returns it
pred = model.predict(X)

# --- format output ---
# One row per trip: features, actual duration, prediction, residual and a fresh ride id.
df_result = data[FEATURES + ["duration"]].copy()
df_result["predicted_duration"] = pred
df_result["difference"] = df_result["duration"] - df_result["predicted_duration"]
df_result["ride_id"] = [str(uuid.uuid4()) for _ in range(len(df_result))]

print("Batch prediction complete.")


Batch prediction complete.


In [7]:
%%writefile batch_processing.py
import os
import uuid
import argparse
import pandas as pd
import joblib
from xgboost import XGBRegressor

def generate_uuids(n):
    """Return a list of ``n`` newly generated UUID4 values, each as a string."""
    ids = []
    for _ in range(n):
        ids.append(str(uuid.uuid4()))
    return ids


def main(data_path: str, output_path: str = "predictions_output.csv"):
    """Score a Parquet file of trips and write the predictions to a CSV file.

    The function reads the trips, derives the trip duration in minutes, drops
    rows outside the accepted duration / distance / passenger ranges, runs the
    saved preprocessor and XGBoost model on the remaining rows and stores the
    features, the actual duration, the predicted duration, their difference and
    a generated ride id for every row.

    Args:
        data_path: Path to the input Parquet file. It must contain the columns
            selected below (pickup/dropoff timestamps, location IDs, distance,
            fare, total amount and passenger count).
        output_path: Destination CSV file. Defaults to
            "predictions_output.csv", the path this script used before the
            parameter existed.

    Side effects:
        Loads "preprocessing.pkl" and "my_model.ubj" via relative paths, prints
        progress messages and writes ``output_path``.
    """
    # Step 1: Load data
    print(f"[INFO] Reading data from: {data_path}")
    df = pd.read_parquet(data_path)

    # Select required columns
    df = df[['lpep_pickup_datetime', 'lpep_dropoff_datetime',
             'PULocationID', 'DOLocationID',
             'trip_distance', 'fare_amount',
             'total_amount', 'passenger_count']].copy()

    # Calculate duration in minutes
    df['duration'] = (
        (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.total_seconds() / 60
    )

    # Filter outliers (bounds inclusive). Rows with a missing value in any of
    # the three columns fail the comparison and are dropped too.
    in_range = (
        df['duration'].between(0, 60)
        & df['trip_distance'].between(0, 20)
        & df['passenger_count'].between(0, 5)
    )
    # Copy so the column assignment below does not write into a filtered slice
    # (the pattern behind pandas' SettingWithCopyWarning).
    df = df.loc[in_range].copy()

    # Convert location IDs to string
    df[['PULocationID', 'DOLocationID']] = df[['PULocationID', 'DOLocationID']].astype(str)

    # Final columns
    df = df[['passenger_count', 'trip_distance', 'fare_amount',
             'total_amount', 'PULocationID', 'DOLocationID', 'duration']]

    # Step 2: Load preprocessor and model (from local files) — exactly once.
    # NOTE(review): joblib.load unpickles the file — only use trusted artifacts.
    preprocessor = joblib.load("preprocessing.pkl")
    model = XGBRegressor()
    # load_model() fills `model` in place; assigning its return value would
    # replace the regressor with None.
    model.load_model("my_model.ubj")
    print("[INFO] Preprocessor and Model loaded.")

    # Step 3: Transform and Predict
    features = df[['passenger_count', 'trip_distance', 'fare_amount',
                   'total_amount', 'PULocationID', 'DOLocationID']]
    transformed = preprocessor.transform(features)
    predictions = model.predict(transformed)

    # Step 4: Create result
    df_result = df.copy()
    df_result['predicted_duration'] = predictions
    df_result['difference'] = df_result['duration'] - df_result['predicted_duration']
    df_result['ride_id'] = generate_uuids(len(df_result))

    # Step 5: Save to CSV
    df_result.to_csv(output_path, index=False)
    print(f"Batch prediction complete. Output saved to: {output_path}")


if __name__ == "__main__":
    # Command-line entry point: a single mandatory option names the input file,
    # which is handed straight to main().
    cli = argparse.ArgumentParser(description="Batch predictor for ride duration")
    cli.add_argument(
        "--data_path",
        required=True,
        help="Path to the Parquet data file",
    )
    main(data_path=cli.parse_args().data_path)


Overwriting batch_processing.py


In [8]:
# Run the script written by the %%writefile cell above on the same Parquet file
# used in this notebook; on success it writes predictions_output.csv.
!python batch_processing.py \
  --data_path ./green_tripdata_2025-05.parquet 


[INFO] Reading data from: ./green_tripdata_2025-05.parquet
[INFO] Preprocessor and Model loaded.
[INFO] Preprocessor and Model loaded.
Batch prediction complete. Output saved to: predictions_output.csv


In [10]:
# Sanity check: preview the first rows of the CSV written by batch_processing.py.
# pandas is imported here as well, so this cell does not depend on the import cell.
import pandas as pd
pd.read_csv('predictions_output.csv').head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,PULocationID,DOLocationID,duration,predicted_duration,difference,ride_id
0,1.0,9.34,44.3,46.8,25,216,39.033333,34.087833,4.9455,8c49699f-ce92-48f2-a9a2-4123fe5aac26
1,1.0,2.95,16.3,18.8,160,129,14.166667,13.25475,0.911917,374de948-81a5-48d3-878d-b0146c5a6fb6
2,1.0,3.0,18.4,20.9,260,179,17.666667,17.131167,0.5355,9a6db519-a573-47a5-8cea-a9562d0a2be9
3,1.0,1.61,9.3,11.8,130,216,6.166667,6.811333,-0.644667,2b00a3b8-d66c-4a06-beb4-68dff154d5ce
4,2.0,3.44,15.6,22.62,244,151,8.583333,10.274125,-1.690792,d6b8cf11-cee6-4cd5-8248-36612839dc6f
