In [1]:
import os
from datetime import datetime
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# Project root is configurable through the TRAFFIC_PROJECT_ROOT environment
# variable so the notebook is not tied to a single machine; the default keeps
# the location that was previously hardcoded.
PROJECT_ROOT = Path(os.environ.get("TRAFFIC_PROJECT_ROOT", "D:/traffic-congestion-predictor"))
XB_DATA_PATH = PROJECT_ROOT / "data" / "processed" / "xbdata.csv"
MODEL_PATH = PROJECT_ROOT / "notebooks" / "xgboost_congestion_model"

# Load reference data and model (do this once)
xb_data = pd.read_csv(XB_DATA_PATH, parse_dates=["pickup_hour_dt"])
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
model = joblib.load(MODEL_PATH)

# Define rush hour and midweek
RUSH_HOURS = {7, 8, 9, 16, 17, 18, 19}
MIDWEEK_DAYS = {"Tuesday", "Wednesday", "Thursday"}

# Minimum predicted speed for each congestion label (the notebook reports
# predicted speeds in mph). Anything below the moderate threshold is "High".
LOW_CONGESTION_MIN_SPEED = 10
MODERATE_CONGESTION_MIN_SPEED = 5

# Day names behind the one-hot `pickup_day_*` columns, in model column order.
DAY_NAMES = (
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
)

# Columns copied unchanged from the matched reference record.
RECORD_FEATURES = (
    "PULocationID", "DOLocationID", "PUZone_freq", "DOZone_freq",
    "PUBorough_freq", "DOBorough_freq", "temp", "prcp", "wspd", "snowed", "coco",
)

# Feature columns in the EXACT order the model expects.
FEATURE_ORDER = (
    "pickup_hour",
    *(f"pickup_day_{day}" for day in DAY_NAMES),
    "is_rush_hour",
    "is_midweek",
    *RECORD_FEATURES,
)


def _latest_zone_record(data, PUZone, DOZone, pickup_hour):
    """Return the most recent reference row for a zone pair, preferring rows at `pickup_hour`."""
    zone_rows = data[(data["PUZone"] == PUZone) & (data["DOZone"] == DOZone)]
    if zone_rows.empty:
        raise ValueError(f"No records found for zone pair {PUZone} -> {DOZone}")

    hour_rows = zone_rows[zone_rows["pickup_hour_dt"].dt.hour == pickup_hour]
    if not hour_rows.empty:
        return hour_rows.sort_values("pickup_hour_dt", ascending=False).iloc[0]

    # No row at the requested hour: fall back to the latest row for the pair.
    record = zone_rows.sort_values("pickup_hour_dt", ascending=False).iloc[0]
    print(f"⚠️ Using most recent available record (hour {record['pickup_hour_dt'].hour})")
    return record


def predict_congestion(pickup_datetime, PUZone, DOZone, *, reference_data=None, estimator=None):
    """Predict average speed for a trip between zones at a specific datetime.

    The hour and weekday features come from `pickup_datetime`. The location,
    frequency and weather features are copied from the most recent reference
    row for the zone pair (at the same hour of day when one exists), so they
    describe that historical row rather than the requested date.

    Parameters
    ----------
    pickup_datetime : str or datetime-like
        Pickup time; converted with `pd.to_datetime`.
    PUZone, DOZone : str
        Pickup and drop-off zone names, compared against the `PUZone` and
        `DOZone` columns of the reference data.
    reference_data : pandas.DataFrame, optional
        Reference rows to match against. Defaults to the module-level
        `xb_data`. Must contain `PUZone`, `DOZone`, a datetime
        `pickup_hour_dt` column and every column in `RECORD_FEATURES`.
    estimator : object with a `predict` method, optional
        Model used for the prediction. Defaults to the module-level `model`.

    Returns
    -------
    tuple of (float, str)
        The predicted speed rounded to two decimals, and the congestion label
        ("Low", "Moderate" or "High").

    Raises
    ------
    ValueError
        If the reference data has no row for the zone pair.
    """
    data = xb_data if reference_data is None else reference_data
    predictor = model if estimator is None else estimator

    # 1. Parse datetime and extract features
    pickup_datetime = pd.to_datetime(pickup_datetime)
    pickup_hour = pickup_datetime.hour
    pickup_day = pickup_datetime.strftime("%A")

    # 2. Find the reference record that supplies the non-temporal features
    record = _latest_zone_record(data, PUZone, DOZone, pickup_hour)

    # 3. Construct input feature vector in EXACT order the model expects
    features = {"pickup_hour": pickup_hour}
    features.update({f"pickup_day_{day}": int(pickup_day == day) for day in DAY_NAMES})
    features["is_rush_hour"] = int(pickup_hour in RUSH_HOURS)
    features["is_midweek"] = int(pickup_day in MIDWEEK_DAYS)
    features.update({column: record[column] for column in RECORD_FEATURES})
    input_df = pd.DataFrame([features])[list(FEATURE_ORDER)]

    # 4. Make prediction
    # Convert to a built-in float first: rounding a NumPy float32 keeps the
    # float32 type, which then prints as e.g. 12.779999732971191.
    speed = float(predictor.predict(input_df)[0])
    rounded_speed = round(speed, 2)

    if speed >= LOW_CONGESTION_MIN_SPEED:
        congestion = "Low"
    elif speed >= MODERATE_CONGESTION_MIN_SPEED:
        congestion = "Moderate"
    else:
        congestion = "High"

    return rounded_speed, congestion

In [3]:
# Example: predict the speed and congestion level for a taxi trip from
# Upper West Side North to Morningside Heights on Jan 18, 2025 at 8:30 pm.
speed, congestion = predict_congestion(
    pickup_datetime="2025-01-18 20:30:00",
    PUZone="Upper West Side North",
    DOZone="Morningside Heights",
)
print(f"Congestion level: {congestion}")
print(f"Predicted speed: {speed} mph")
    

Congestion level: Low 
Predicted speed: 12.779999732971191 mph
