# Exported from a Jupyter notebook cell.
"""
Supply Chain & Inventory Management using Reinforcement Learning (Lab Project)

This is an inventory control simulation where an RL agent
learns how much to order each day to minimize total cost.

Features:
- Environment with inventory, demand, optional lead time pipeline
- Q-Learning agent with discrete states/actions
- Baselines: random policy, order-up-to (base-stock) policy
- Training + evaluation + plots
"""

from dataclasses import dataclass
from collections import deque
import random
import math
import numpy as np
import matplotlib.pyplot as plt


# -----------------------------
# 1) Config
# -----------------------------
@dataclass
class Config:
    """Settings for the inventory simulation and the Q-learning agent.

    Raises:
        ValueError: on construction, if a setting would leave the action
            list empty, make a bin size non-positive, or invert the demand
            range (these would otherwise fail later with obscure errors).
    """

    days_per_episode: int = 60       # simulated days per episode
    train_episodes: int = 10000      # number of Q-learning training episodes
    eval_episodes: int = 200         # episodes averaged per policy in evaluation

    # inventory settings
    max_inventory: int = 200         # on-hand stock is capped at this level
    max_order: int = 100             # upper bound for a single order
    order_step: int = 10             # spacing between discrete order quantities
    init_inventory: int = 60         # on-hand stock after reset

    # demand settings: daily demand is a uniform integer in [demand_min, demand_max]
    demand_min: int = 5
    demand_max: int = 25

    # lead time (0 means immediate arrival)
    lead_time: int = 2

    # cost settings
    holding_cost: float = 0.5        # per unit per day
    stockout_cost: float = 30.0      # per unit of unmet demand
    order_fixed_cost: float = 5.0    # charged once for any non-zero order
    order_unit_cost: float = 0.1     # per unit ordered

    # q-learning settings
    gamma: float = 0.95              # discount factor
    alpha: float = 0.1               # learning rate
    epsilon_start: float = 1.0
    epsilon_end: float = 0.01        # floor for epsilon
    epsilon_decay: float = 0.9995    # multiplicative decay factor

    # discretization (state bins) - smaller bins give higher resolution
    inv_bin_size: int = 5
    pipeline_bin_size: int = 10

    seed: int = 42

    def __post_init__(self):
        """Fail fast on settings that cannot produce a usable environment."""
        if self.order_step < 1:
            # range(0, max_order + 1, order_step) needs a positive step
            raise ValueError(f"order_step must be >= 1, got {self.order_step}")
        if self.max_order < 0:
            # a negative bound would leave the action list empty
            raise ValueError(f"max_order must be >= 0, got {self.max_order}")
        if self.inv_bin_size < 1:
            raise ValueError(f"inv_bin_size must be >= 1, got {self.inv_bin_size}")
        if self.pipeline_bin_size < 1:
            raise ValueError(
                f"pipeline_bin_size must be >= 1, got {self.pipeline_bin_size}"
            )
        if self.demand_min > self.demand_max:
            raise ValueError(
                f"demand_min ({self.demand_min}) must not exceed "
                f"demand_max ({self.demand_max})"
            )


# -----------------------------
# 2) Environment
# -----------------------------
class InventoryEnv:
    """
    Single-item inventory simulation with discrete order quantities.

    State (returned by reset/step), discretized for tabular Q-learning:
      - bin index of on_hand inventory
      - bin index of pipeline inventory (sum of outstanding orders arriving in future)
    Action:
      - index into ``self.actions`` (order quantities 0, step, 2*step, ... up to max_order)
    """
    def __init__(self, cfg: Config):
        self.cfg = cfg
        # private RNG so demand draws do not depend on the global random state
        self.rng = random.Random(cfg.seed)
        self.actions = list(range(0, cfg.max_order + 1, cfg.order_step))

        self.on_hand = 0
        self.day = 0
        self.pipeline = deque([0] * cfg.lead_time)  # quantities arriving over next days

        # logs (for plotting in a single episode); cleared by reset()
        self.log_on_hand = []
        self.log_demand = []
        self.log_order = []
        self.log_cost = []
        self.log_pipeline = []

    def reset(self):
        """Start a new episode: restore initial stock, empty the pipeline and
        the logs, and return the initial discretized state."""
        self.on_hand = self.cfg.init_inventory
        self.day = 0
        self.pipeline = deque([0] * self.cfg.lead_time)

        for log in (self.log_on_hand, self.log_demand, self.log_order,
                    self.log_cost, self.log_pipeline):
            log.clear()

        return self._get_state()

    def step(self, action_idx: int):
        """
        One day transition:
        1) receive arriving order (if lead time > 0)
        2) place a new order (goes into pipeline, or arrives at once if lead time is 0)
        3) demand occurs, fulfill from on_hand
        4) compute reward = - total_cost

        Returns:
            (next_state, reward, done, info) where ``info`` holds the cost
            breakdown plus the day's ``unmet`` and ``demand``.

        Raises:
            ValueError: if ``action_idx`` is not a valid index into ``self.actions``.
        """
        # Explicit check instead of `assert`, which is removed under `python -O`
        # and would then let negative indices wrap around silently.
        if not 0 <= action_idx < len(self.actions):
            raise ValueError(
                f"action_idx must be in [0, {len(self.actions)}), got {action_idx}"
            )
        cfg = self.cfg
        order_qty = self.actions[action_idx]

        # 1) + 2) arrivals and order placement
        if cfg.lead_time > 0:
            arrived = self.pipeline.popleft()
            # stock above max_inventory is discarded by the cap
            self.on_hand = min(cfg.max_inventory, self.on_hand + arrived)
            self.pipeline.append(order_qty)
        else:
            self.on_hand = min(cfg.max_inventory, self.on_hand + order_qty)

        # 3) demand
        demand = self.rng.randint(cfg.demand_min, cfg.demand_max)
        sold = min(self.on_hand, demand)
        unmet = demand - sold
        self.on_hand -= sold

        # 4) costs (holding is charged on end-of-day stock)
        holding = cfg.holding_cost * self.on_hand
        stockout = cfg.stockout_cost * unmet
        order_fixed = cfg.order_fixed_cost if order_qty > 0 else 0.0
        order_var = cfg.order_unit_cost * order_qty
        total_cost = holding + stockout + order_fixed + order_var

        reward = -total_cost

        # logging
        self.log_on_hand.append(self.on_hand)
        self.log_demand.append(demand)
        self.log_order.append(order_qty)
        self.log_cost.append(total_cost)
        self.log_pipeline.append(self._pipeline_total())

        # done?
        self.day += 1
        done = (self.day >= cfg.days_per_episode)

        next_state = self._get_state()
        info = {
            "holding_cost": holding,
            "stockout_cost": stockout,
            "order_cost": order_fixed + order_var,
            "unmet": unmet,
            "demand": demand
        }
        return next_state, reward, done, info

    def _pipeline_total(self):
        """Units ordered but not yet received (0 when there is no lead time)."""
        return sum(self.pipeline) if self.cfg.lead_time > 0 else 0

    def _pipeline_capacity(self):
        """Upper bound used when binning the pipeline sum."""
        return self.cfg.max_order * max(1, self.cfg.lead_time)

    def _get_state(self):
        """Discretize inventory + pipeline into bins for tabular Q-learning."""
        cfg = self.cfg
        # both indices are clamped to the last bin
        inv_bin = min(cfg.max_inventory // cfg.inv_bin_size,
                      self.on_hand // cfg.inv_bin_size)
        pipe_bin = min(self._pipeline_capacity() // cfg.pipeline_bin_size,
                       self._pipeline_total() // cfg.pipeline_bin_size)

        return (int(inv_bin), int(pipe_bin))

    def state_space_size(self):
        """Return (number of inventory bins, number of pipeline bins)."""
        inv_bins = (self.cfg.max_inventory // self.cfg.inv_bin_size) + 1
        pipe_bins = (self._pipeline_capacity() // self.cfg.pipeline_bin_size) + 1
        return inv_bins, pipe_bins

    def action_space_size(self):
        """Return the number of discrete order quantities."""
        return len(self.actions)


# -----------------------------
# 3) Q-Learning Agent
# -----------------------------
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent.

    The Q-table is indexed as ``Q[inventory_bin, pipeline_bin, action]``.
    """

    def __init__(self, env: InventoryEnv, cfg: Config):
        n_inv_bins, n_pipe_bins = env.state_space_size()
        self.n_actions = env.action_space_size()
        self.cfg = cfg
        self.epsilon = cfg.epsilon_start
        table_shape = (n_inv_bins, n_pipe_bins, self.n_actions)
        self.Q = np.zeros(table_shape, dtype=np.float32)

    def choose_action(self, state, greedy=False):
        """Return an action index for ``state``.

        Explores with probability ``epsilon`` unless ``greedy`` is set;
        otherwise picks a highest-valued action, breaking ties at random.
        """
        inv_bin, pipe_bin = state

        explore = (not greedy) and (random.random() < self.epsilon)
        if explore:
            return random.randrange(self.n_actions)

        action_values = self.Q[inv_bin, pipe_bin]
        # every action sharing the top value is a candidate
        candidates = np.flatnonzero(action_values == action_values.max())
        return int(np.random.choice(candidates))

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning step:

        Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]

        On a terminal transition (``done``) the bootstrap term is omitted.
        """
        inv, pipe = state
        n_inv, n_pipe = next_state

        best_next = np.max(self.Q[n_inv, n_pipe])
        if done:
            target = reward
        else:
            target = reward + self.cfg.gamma * best_next

        td_error = target - self.Q[inv, pipe, action]
        self.Q[inv, pipe, action] += self.cfg.alpha * td_error

    def decay_epsilon(self):
        """Shrink epsilon by the decay factor, never below ``epsilon_end``."""
        decayed = self.epsilon * self.cfg.epsilon_decay
        self.epsilon = max(self.cfg.epsilon_end, decayed)


# -----------------------------
# 4) Baseline Policies
# -----------------------------
def policy_random(env: InventoryEnv):
    """Baseline policy: draw an action index uniformly from the action space."""
    n_actions = env.action_space_size()
    return random.randrange(n_actions)

def policy_order_up_to(env: InventoryEnv, target_level=80):
    """
    Simple heuristic:
      Order enough so (on_hand + pipeline_sum) reaches target_level.

    The shortfall is rounded UP to the next multiple of the order step and
    capped at the largest quantity available in ``env.actions``.

    Returns:
        Index into ``env.actions`` of the chosen order quantity.
    """
    outstanding = sum(env.pipeline) if env.cfg.lead_time > 0 else 0
    shortfall = max(0, target_level - (env.on_hand + outstanding))

    # round up to a whole number of order steps
    step = env.cfg.order_step
    qty = int(math.ceil(shortfall / step) * step)

    # Cap at the largest real action rather than cfg.max_order: when max_order
    # is not a multiple of order_step, max_order itself is not in env.actions
    # and the lookup below would raise ValueError.
    qty = min(env.actions[-1], qty)

    # map qty to action index
    return env.actions.index(qty)


# -----------------------------
# 5) Training / Evaluation
# -----------------------------
def run_episode(env: InventoryEnv, agent: QLearningAgent = None, policy_fn=None, greedy=False):
    """Play one episode and return its totals.

    Actions come from ``agent`` when one is given, otherwise from
    ``policy_fn(env)``. The agent learns from each transition only when it is
    not acting greedily.

    Returns:
        dict with the summed ``reward``, ``cost`` (negated reward) and ``unmet`` demand.
    """
    use_agent = agent is not None
    learn = use_agent and not greedy
    totals = {"reward": 0.0, "cost": 0.0, "unmet": 0}

    state = env.reset()
    while True:
        if use_agent:
            action = agent.choose_action(state, greedy=greedy)
        else:
            action = policy_fn(env)

        successor, reward, done, info = env.step(action)

        totals["reward"] += reward
        totals["cost"] += (-reward)
        totals["unmet"] += info["unmet"]

        if learn:
            agent.update(state, action, reward, successor, done)

        state = successor
        if done:
            return totals

def train_qlearning(cfg: Config):
    """Train a fresh Q-learning agent on a fresh environment.

    Runs ``cfg.train_episodes`` learning episodes, decaying epsilon after each
    one, and prints the mean cost of the last 100 episodes every 1000 episodes.

    Returns:
        (env, agent, history) where ``history`` maps ``episode_cost``,
        ``episode_unmet`` and ``epsilon`` to per-episode lists.
    """
    env = InventoryEnv(cfg)
    agent = QLearningAgent(env, cfg)

    costs, unmet, epsilons = [], [], []
    history = {
        "episode_cost": costs,
        "episode_unmet": unmet,
        "epsilon": epsilons,
    }

    for episode_no in range(1, cfg.train_episodes + 1):
        outcome = run_episode(env, agent=agent, greedy=False)
        agent.decay_epsilon()

        costs.append(outcome["cost"])
        unmet.append(outcome["unmet"])
        epsilons.append(agent.epsilon)  # value after this episode's decay

        if episode_no % 1000 == 0:
            recent_avg = np.mean(costs[-100:])
            print(f"  Episode {episode_no}/{cfg.train_episodes} | Avg Cost: {recent_avg:.2f}")

    return env, agent, history

def evaluate(env: InventoryEnv, agent: QLearningAgent, cfg: Config):
    """Compare the greedy RL agent with the random and order-up-to baselines.

    Each policy is run for ``cfg.eval_episodes`` episodes on ``env``, in the
    order RL, Random, OrderUpTo.

    Returns:
        dict mapping policy name to ``(mean cost, mean unmet demand)``.
    """
    def _averages(**episode_kwargs):
        # run the episodes first, then reduce to the two means
        outcomes = [run_episode(env, **episode_kwargs) for _ in range(cfg.eval_episodes)]
        mean_cost = np.mean([o["cost"] for o in outcomes])
        mean_unmet = np.mean([o["unmet"] for o in outcomes])
        return (mean_cost, mean_unmet)

    # Q-learning agent acting greedily (no exploration, no learning)
    rl_summary = _averages(agent=agent, greedy=True)

    # baselines
    random_summary = _averages(agent=None, policy_fn=policy_random)
    heuristic_summary = _averages(
        agent=None,
        policy_fn=lambda e: policy_order_up_to(e, target_level=80),
    )

    return {
        "RL": rl_summary,
        "Random": random_summary,
        "OrderUpTo": heuristic_summary,
    }

# -----------------------------
# Plotting helpers
# -----------------------------

def plot_training(history):
    """Show a line plot of the per-episode training cost in ``history["episode_cost"]``."""
    episode_costs = history["episode_cost"]

    _, axis = plt.subplots()
    axis.set_title("Training: Episode Cost")
    axis.set_xlabel("Episode")
    axis.set_ylabel("Cost")
    axis.plot(episode_costs)
    plt.show()

def plot_one_episode(env: InventoryEnv, agent: QLearningAgent):
    """Run one greedy episode and plot the logged on-hand and pipeline inventory per day."""
    # the episode refills env.log_on_hand / env.log_pipeline
    run_episode(env, agent=agent, greedy=True)
    n_days = len(env.log_on_hand)
    days = list(range(1, n_days + 1))

    _, axis = plt.subplots()
    curves = (
        (env.log_on_hand, "On-hand Inventory"),
        (env.log_pipeline, "Pipeline (Sum)"),
    )
    for series, label in curves:
        axis.plot(days, series, label=label)

    axis.set_title("Inventory & Pipeline Over Time (RL Episode)")
    axis.set_xlabel("Day")
    axis.set_ylabel("Units")
    axis.legend()
    plt.show()

# -----------------------------
# 6) Main
# -----------------------------
def main():
    """Train the agent, print the policy comparison table, then show the plots."""
    cfg = Config()

    # seed the global RNGs used by the agent and the random baseline
    random.seed(cfg.seed)
    np.random.seed(cfg.seed)

    print("Training Q-Learning agent...")
    env, agent, history = train_qlearning(cfg)

    print("\nEvaluating policies...")
    results = evaluate(env, agent, cfg)

    print(f"\nAverage results over {cfg.eval_episodes} episodes:")
    header = f"{'Policy':12s} | {'Avg Cost':12s} | {'Avg Unmet':10s}"
    print(header)
    print("-" * 40)
    for name, summary in results.items():
        avg_cost, avg_unmet = summary
        print(f"  {name:10s} | {avg_cost:12.2f} | {avg_unmet:10.2f}")

    plot_training(history)
    plot_one_episode(env, agent)

# Run the train / evaluate / plot pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()

# (end of exported notebook cell)