In [1]:
pip install gym stable-baselines3[extra] numpy pandas




In [2]:
import gym
import numpy as np
import pandas as pd
from gym import spaces
from stable_baselines3 import PPO  # or DQN, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from sklearn.preprocessing import StandardScaler

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [3]:
pip install "shimmy>=2.0" gym==0.26.2 gymnasium



In [4]:
import gym
import numpy as np
import pandas as pd
from gym import spaces
from stable_baselines3 import PPO  # or DQN, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Example inputs you already have:
# - fleet_df: DataFrame with vehicle rows (vehicle_id, capacity, availability, lat, lon)
# - demand_df: DataFrame with demand rows (demand_id, time, lat, lon, demand_size_kg, traffic_speed_kmh)
# - state_matrix: N x F array where each row = [capacity, availability, demand_size_kg, adjusted_traffic, eta, distance]
#   BUT we will restructure to per-demand access to all vehicles.
# -----------------------------

# For the env we need per-demand features for all vehicles.
# Build a 3D array: demands x vehicles x features_per_pair
# (If you already have pair rows, you can pivot/group them accordingly.)
#
# For demonstration we'll create such a structure from your earlier variables:
# state_vectors was list of [capacity, availability, demand_size_kg, adjusted_traffic, eta, distance]

# --- Dummy code to reconstruct shapes (replace with your actual data)
# Suppose you have M vehicles and T demands; create arrays like below.
# vehicles = fleet_df.reset_index(drop=True)
# demands = demand_df.reset_index(drop=True)
#
# pair_features[d_idx, v_idx, :] = feature vector for (vehicle v_idx, demand d_idx)

# -----------------------------
# Gym Env: choose vehicle for each demand (sequential)
# -----------------------------
class FleetAssignmentEnv(gym.Env):
    """
    Each episode: iterate through a sequence of demands.
    Observation: vector of shape (num_vehicles, features_per_pair) flattened (or keep as matrix via Box).
    Action: select vehicle index (Discrete(num_vehicles)).
    Reward: +ve for good assignment (negative ETA), heavy penalty if capacity insufficient, small cost per step.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, pair_features, vehicle_capacities, demand_sizes, max_episode_demands=None, normalize=True):
        """
        pair_features: numpy array shape (num_demands, num_vehicles, feat_dim)
        vehicle_capacities: array shape (num_vehicles,)
        demand_sizes: array shape (num_demands,)
        """
        super().__init__()
        self.pair_features = pair_features
        self.vehicle_capacities = np.array(vehicle_capacities)
        self.demand_sizes = np.array(demand_sizes)
        self.num_demands, self.num_vehicles, self.feat_dim = pair_features.shape

        # Observation: we flatten the matrix into a vector for simplicity
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.num_vehicles * self.feat_dim,), dtype=np.float32)
        self.action_space = spaces.Discrete(self.num_vehicles)

        self.normalize = normalize
        if normalize:
            # fit scaler over flattened pair_features
            self.scaler = StandardScaler()
            flat = pair_features.reshape(-1, self.feat_dim)
            self.scaler.fit(flat)
        else:
            self.scaler = None

        self.max_episode_demands = max_episode_demands or self.num_demands
        self.current_idx = 0
        self.assigned = []  # optional: record assignments

    def reset(self):
        # Start at a random demand or sequentially from 0
        self.current_idx = 0
        self.assigned = []
        return self._get_obs()

    def _get_obs(self):
        feats = self.pair_features[self.current_idx]  # shape (num_vehicles, feat_dim)
        if self.normalize:
            feats = self.scaler.transform(feats)
        return feats.flatten().astype(np.float32)

    def step(self, action):
        """
        action: index of vehicle chosen
        """
        done = False
        info = {}

        v_idx = int(action)
        # retrieve pair features for chosen vehicle
        pair_feat = self.pair_features[self.current_idx, v_idx]  # [capacity, availability, demand_size, adjusted_traffic, eta, distance]
        capacity = self.vehicle_capacities[v_idx]
        demand_size = self.demand_sizes[self.current_idx]
        eta = pair_feat[4]  # assuming eta is at index 4
        availability = pair_feat[1]  # assumed

        # Reward design:
        # - reward = - eta (minimize ETA)
        # - large negative penalty if vehicle can't carry the demand
        # - small penalty for unavailable vehicles
        reward = -float(eta)

        if capacity < demand_size:
            reward -= 100.0  # infeasible assignment heavy penalty

        if availability == 0:
            reward -= 10.0  # discourage using unavailable vehicle

        # Optionally mark vehicle as used / reduce capacity (for this simple env we won't update capacity across steps)
        self.assigned.append((self.current_idx, v_idx, reward))

        # Move to next demand
        self.current_idx += 1
        if self.current_idx >= min(self.max_episode_demands, self.num_demands):
            done = True

        obs = self._get_obs() if not done else np.zeros(self.observation_space.shape, dtype=np.float32)

        return obs, reward, done, info

    def render(self, mode='human'):
        print(f"Demand idx: {self.current_idx}, last assignments: {self.assigned[-3:]}")

    def close(self):
        pass

# -----------------------------
# Example: prepare pair_features from your state_vectors
# -----------------------------
# Replace the following with your actual grouping logic.
# Suppose you have lists: vehicles_list, demands_list, and you computed a state vector per pair in order.
# Here we show a tiny standard example.

def build_demo_pair_features(fleet_df, demand_df):
    """
    Builds pair_features array consistent with state vector layout:
    [capacity, availability, demand_size, adjusted_traffic, eta, distance]
    """
    M = len(fleet_df)      # num vehicles
    T = len(demand_df)     # num demands
    feat_dim = 6
    pair_features = np.zeros((T, M, feat_dim), dtype=np.float32)
    for d_idx, drow in demand_df.reset_index(drop=True).iterrows():
        for v_idx, vrow in fleet_df.reset_index(drop=True).iterrows():
            capacity = vrow['capacity']
            availability = vrow.get('availability', 1)
            demand_size = drow['demand_size_kg']
            adjusted_traffic = drow.get('traffic_speed_kmh', 30.0)  # example
            # compute distance and eta example (replace with your actual values)
            distance = np.linalg.norm([vrow['lat'] - drow['lat'], vrow['lon'] - drow['lon']])
            # small constant to avoid div by zero
            eta = distance / (adjusted_traffic / 60.0 + 1e-6)  # minutes approx
            pair_features[d_idx, v_idx] = np.array([capacity, availability, demand_size, adjusted_traffic, eta, distance])
    return pair_features

# -----------------------------
# Example usage
# -----------------------------
# Load or use your actual fleet_df and demand_df
# For demo we will create tiny toy frames:
fleet_df = pd.DataFrame([
    {"vehicle_id": "V1", "capacity": 500, "lat": 12.9716, "lon": 77.5946, "availability": 1},
    {"vehicle_id": "V2", "capacity": 300, "lat": 12.9352, "lon": 77.6245, "availability": 1},
    {"vehicle_id": "V3", "capacity": 700, "lat": 13.0186, "lon": 77.5560, "availability": 0}
])

demand_df = pd.DataFrame([
    {"demand_id": "D1", "demand_size_kg": 200, "lat": 12.96, "lon": 77.62, "traffic_speed_kmh": 25},
    {"demand_id": "D2", "demand_size_kg": 400, "lat": 13.02, "lon": 77.56, "traffic_speed_kmh": 30},
    {"demand_id": "D3", "demand_size_kg": 100, "lat": 12.97, "lon": 77.70, "traffic_speed_kmh": 28},
])

pair_features = build_demo_pair_features(fleet_df, demand_df)
vehicle_capacities = fleet_df['capacity'].values
demand_sizes = demand_df['demand_size_kg'].values

env = FleetAssignmentEnv(pair_features, vehicle_capacities, demand_sizes)
vec_env = DummyVecEnv([lambda: env])  # SB3 requires vectorized envs

# Train a PPO policy
model = PPO("MlpPolicy", vec_env, verbose=1, n_steps=64, batch_size=32, ent_coef=0.01)
model.learn(total_timesteps=5000)

# Save / load
model.save("fleet_assign_ppo")
# inference example
obs = env.reset()
action, _states = model.predict(obs, deterministic=True)
print("Action chosen (vehicle idx):", action)




Using cpu device
----------------------------
| time/              |     |
|    fps             | 392 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 64  |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 323          |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 128          |
| train/                  |              |
|    approx_kl            | 0.0028219586 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | -0.000411    |
|    learning_rate        | 0.0003       |
|    loss                 | 644          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0215      |
|    value_loss           | 1.31e+03     |
------------------------------------------
------