In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib
import os

def train_demand_model(data_path="/content/historical_incidents_500.csv",
                       output_dir=os.path.join('..', 'routing')):
    """
    Train a Gradient Boosting Regressor on historical incident data and save it.

    The CSV at ``data_path`` must provide ``timestamp``, ``latitude`` and
    ``longitude`` columns. Hour, day-of-week and month are derived from the
    timestamp, a synthetic ``incident_count`` target is built from them, and
    the fitted model is written to ``<output_dir>/demand_model.pkl``.

    Args:
        data_path: Location of the historical incidents CSV file.
        output_dir: Directory that receives ``demand_model.pkl``; it is
            created if it does not exist. A relative path is resolved
            against the current working directory.

    Returns:
        None. Progress and evaluation metrics are printed. If the dataset
        file is missing or lacks a required column, an error is printed and
        the function returns before any training or file output happens.
    """
    print("Starting demand forecasting model training...")

    # --- 1. Load and Prepare Data ---
    # Keep the try body to the single call that can raise FileNotFoundError.
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        # Point the user at the location that was actually searched.
        expected_dir = os.path.dirname(data_path) or os.curdir
        expected_name = os.path.basename(data_path)
        print(f"Error: The dataset was not found at {data_path}")
        print(f"Please ensure '{expected_name}' is in the '{expected_dir}' directory.")
        return

    # Fail with a readable message instead of a KeyError further down.
    required_columns = ['timestamp', 'latitude', 'longitude']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: The dataset is missing required column(s): {', '.join(missing_columns)}")
        return

    print(f"Loaded {len(df)} historical incident records.")

    # --- 2. Feature Engineering ---
    # Convert timestamp to datetime objects
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Extract temporal features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek  # Monday=0, Sunday=6
    df['month'] = df['timestamp'].dt.month

    # For simplicity in this example, we create a synthetic target variable.
    # In a real-world scenario, this would be a more complex aggregation,
    # e.g., counting incidents per hour in a given geographical grid cell.
    # NOTE: the value is fully deterministic (not random): it depends only on
    # the hour, the day of week and the row's index label.
    df['incident_count'] = 1 + (df['hour'] // 6) + (df['day_of_week'] % 3) + df.index % 5

    print("Temporal features (hour, day_of_week, month) extracted.")

    # Define features (X) and target (y)
    features = ['latitude', 'longitude', 'hour', 'day_of_week', 'month']
    target = 'incident_count'

    X = df[features]
    y = df[target]

    # --- 3. Model Training ---
    # Fixed random_state keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

    # Initialize and train the model
    gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    gbr.fit(X_train, y_train)
    print("Model training completed.")

    # --- 4. Evaluation ---
    y_pred = gbr.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\n--- Model Evaluation ---")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {mse**0.5:.4f}")

    # --- 5. Serialization ---
    # Save the trained model, creating the target directory when needed.
    os.makedirs(output_dir, exist_ok=True)

    model_path = os.path.join(output_dir, 'demand_model.pkl')
    joblib.dump(gbr, model_path)

    print(f"\nTrained demand model saved to: {model_path}")
    print("\nTraining process finished successfully.")

if __name__ == "__main__":
    # Run the training pipeline only when this file is executed directly,
    # not when it is imported as a module.
    # This script is intended to be run from within the `ai_models/routing` directory
    # (for example: `python train_demand_model.py`), because the trained model is
    # written to a path that is relative to the current working directory.
    train_demand_model()

Starting demand forecasting model training...
Loaded 57 historical incident records.
Temporal features (hour, day_of_week, month) extracted.
Training on 45 samples, testing on 12 samples.
Model training completed.

--- Model Evaluation ---
Mean Squared Error (MSE): 5.6969
Root Mean Squared Error (RMSE): 2.3868

Trained demand model saved to: ../routing/demand_model.pkl

Training process finished successfully.
