In [3]:
#!/usr/bin/env python3
"""
Minimal DQN for Milling Environment with Reward Normalization, 
Enhanced TensorBoard Logging, and GPU Support

This script implements a minimal DQN applied to our 3D milling environment
(MillingEnvironment). The environment simulates a voxel grid, stored as two
boolean arrays, where:
  - Stock cells (`stock` is True) hold material that is meant to be removed.
  - Forbidden cells (`shape` is True) form the target region; leaving the grid
    bounds is also a violation.
  - The router starts near the (1, 1, 1) corner of the grid.
  
The DQN learns to select a target coordinate (discrete action) for the router
to move to (removing stock along the straight-line path). Each non-target stock
cell removed earns +1 and every successful move costs 0.1. Cutting into the
target region or leaving the grid is penalized and ends the episode (a violation).
Otherwise episodes run for a fixed number of steps (max_steps); removing all
non-target stock earns a fixed completion bonus and also ends the episode.

The rewards are normalized using an exponential moving average to reduce variance.
Training metrics (loss, episode reward, steps, stock removed, fraction removed, termination reasons) 
are logged to TensorBoard.

Usage:
  python dqn_tensorboard_gpu.py
Then launch TensorBoard with:
  tensorboard --logdir=runs
"""

import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from torch.utils.tensorboard import SummaryWriter
import datetime
import os

# -------------------------------------------------------------------------
# 1) Environment: MillingEnvironment
# -------------------------------------------------------------------------

class MillingEnvironment:
    """Voxel-grid milling task with a discrete "move the router to (x, y, z)" action space.

    State is held in two boolean ``(n, n, n)`` arrays:

    * ``stock`` -- True where material is still present.
    * ``shape`` -- True inside the target half-sphere, which must never be cut.

    An episode ends on a violation (cutting the shape or leaving the grid), when all
    stock outside the shape has been removed, or after ``max_steps`` moves.

    Args:
        grid_size: Edge length ``n`` of the cubic voxel grid.
        min_radius: Smallest half-sphere radius that ``reset`` may draw (inclusive).
        max_radius: Largest half-sphere radius that ``reset`` may draw (inclusive).
        max_steps: Number of moves after which the episode is cut off.
    """

    def __init__(self, grid_size=8, min_radius=2, max_radius=3, max_steps=50):
        self.grid_size = grid_size
        self.min_radius = min_radius
        self.max_radius = max_radius
        self.max_steps = max_steps
        self.reset()

    def reset(self):
        """Start a new episode with full stock and a freshly sampled target shape.

        Randomness comes from the global ``np.random`` state, drawn in the order
        radius, centre x, centre y.

        Returns:
            The initial observation (see ``_get_observation``).
        """
        n = self.grid_size
        # Every voxel starts filled with stock.
        self.stock = np.ones((n, n, n), dtype=bool)

        r = np.random.randint(self.min_radius, self.max_radius + 1)
        # Choose random center in x,y ensuring shape fits; z is anchored at 0.
        cx = np.random.randint(r, n - r)
        cy = np.random.randint(r, n - r)
        cz = 0

        # Half-sphere of radius r resting on the z=0 plane, built by broadcasting
        # open coordinate grids instead of a triple Python loop.
        xs, ys, zs = np.ogrid[:n, :n, :n]
        self.shape = (xs - cx) ** 2 + (ys - cy) ** 2 + (zs - cz) ** 2 <= r * r

        # The nominal start (1, 1, 1) can fall inside the sampled half-sphere, which
        # would make the very first move a guaranteed "shape" violation because the
        # path always contains the start voxel. Lift the router straight up until it
        # is clear of the shape.
        start = np.array([1, 1, 1], dtype=np.int32)
        while np.all(start < n) and start[2] < n - 1 and self.shape[tuple(start)]:
            start[2] += 1
        self.router_pos = start

        self.steps_taken = 0
        self.done = False
        self.total_removed = 0.0
        self.termination_reason = None
        return self._get_observation()

    def line_voxels(self, start, end):
        """List the voxels visited on the straight segment from ``start`` to ``end``.

        The segment is sampled at ``2 * floor(length) + 1`` evenly spaced points, each
        rounded to the nearest integer coordinate; duplicates are dropped while the
        visiting order is preserved.

        Args:
            start: Integer array of shape (3,).
            end: Integer array of shape (3,).

        Returns:
            List of coordinate tuples, beginning with the start voxel.
        """
        s = start.astype(float)
        e = end.astype(float)
        diff = e - s
        length = int(np.linalg.norm(diff))
        if length == 0:
            return [tuple(start)]
        steps = max(1, length * 2)
        visited = set()
        out = []
        for i in range(steps + 1):
            t = i / steps
            coords = np.round(s + diff * t).astype(int)
            c_tuple = tuple(coords)
            if c_tuple not in visited:
                visited.add(c_tuple)
                out.append(c_tuple)
        return out

    def do_move(self, target):
        """Move the router in a straight line to ``target``, milling stock on the way.

        Rewards: +1 per stock voxel removed, -0.1 for a move that completes without a
        violation, -20 for the first out-of-bounds or shape voxel hit (the move stops
        there and the router stays put), +50 when no stock is left outside the shape.
        Stock removed before a violation stays removed and its reward is kept.

        Args:
            target: Integer array of shape (3,) with the destination voxel.

        Returns:
            Tuple ``(observation, reward, done)``. Once the episode is done, further
            calls return reward 0.0 without changing state.
        """
        if self.done:
            return self._get_observation(), 0.0, True

        path_vox = self.line_voxels(self.router_pos, target)
        failed = False
        reward = 0.0
        reason = None

        for vx, vy, vz in path_vox:
            if not (0 <= vx < self.grid_size and 0 <= vy < self.grid_size and 0 <= vz < self.grid_size):
                failed = True
                reward -= 20.0  # moderate penalty for out-of-bounds
                reason = "out_of_bounds"
                break
            if self.shape[vx, vy, vz]:
                failed = True
                reward -= 20.0  # moderate penalty for cutting target
                reason = "shape"
                break
            if self.stock[vx, vy, vz]:
                self.stock[vx, vy, vz] = False
                reward += 1.0  # reward for removing stock
                self.total_removed += 1.0

        if not failed:
            reward -= 0.1  # small step cost
            self.router_pos = target.copy()

        self.steps_taken += 1

        if not failed:
            # Check completion before the step limit so that finishing the job on the
            # final allowed move still earns the bonus and is reported as "complete".
            if not np.any(self.stock[~self.shape]):
                reward += 50.0  # bonus for complete removal
                failed = True
                reason = "complete"
            elif self.steps_taken >= self.max_steps:
                failed = True
                reason = "max_steps"

        if failed:
            self.done = True
            self.termination_reason = reason

        return self._get_observation(), reward, self.done

    def _get_observation(self):
        """Return a flat float32 vector: stock mask, shape mask, then router (x, y, z)."""
        stock_f = self.stock.flatten().astype(np.float32)
        shape_f = self.shape.flatten().astype(np.float32)
        router_f = self.router_pos.astype(np.float32)
        return np.concatenate([stock_f, shape_f, router_f], axis=0)

    def step(self, action):
        """Gym-style step: decode a flat action index into a target voxel and move there.

        The index is interpreted as ``x * n*n + y * n + z``.

        Returns:
            Tuple ``(observation, reward, done, info)`` with an empty ``info`` dict.
        """
        n = self.grid_size
        z = action % n
        y = (action // n) % n
        x = (action // (n * n)) % n
        next_obs, reward, done = self.do_move(np.array([x, y, z], dtype=np.int32))
        return next_obs, reward, done, {}

    def reset_gym(self):
        """Alias of ``reset`` kept for the Gym-style call sites."""
        return self.reset()

    @property
    def n_actions(self):
        """Number of discrete actions: one per voxel in the grid."""
        return self.grid_size**3

    def fraction_outside_removed(self):
        """Return removed stock divided by the number of voxels outside the shape."""
        outside_mask = (self.shape == 0)
        total_outside = np.sum(outside_mask)
        # The small epsilon avoids division by zero if the shape fills the grid.
        return self.total_removed / (total_outside + 1e-8)

# -------------------------------------------------------------------------
# 2) Reward Normalizer
# -------------------------------------------------------------------------

class RewardNormalizer:
    """Standardizes rewards using exponentially weighted running mean and variance.

    Args:
        momentum: Weight kept on the previous estimate at each update.
        eps: Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, momentum=0.99, eps=1e-8):
        self.momentum, self.eps = momentum, eps
        # Start from a zero-mean, unit-variance estimate.
        self.mean, self.var = 0.0, 1.0

    def update(self, r):
        """Fold reward ``r`` into the running statistics.

        The variance update uses the deviation from the already-updated mean.
        """
        keep = self.momentum
        blend = 1 - keep
        self.mean = keep * self.mean + blend * r
        deviation = r - self.mean
        self.var = keep * self.var + blend * (deviation ** 2)

    def normalize(self, r):
        """Return ``r`` centred by the running mean and scaled by the running std."""
        scale = np.sqrt(self.var) + self.eps
        return (r - self.mean) / scale

# -------------------------------------------------------------------------
# 3) Simple DQN and Replay Buffer
# -------------------------------------------------------------------------

class SimpleDQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear output layer.

    Args:
        input_dim: Length of the flat observation vector.
        output_dim: Number of discrete actions (one Q-value each).
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        widths = [input_dim, hidden_dim, hidden_dim, output_dim]
        layers = []
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            # A ReLU sits between consecutive linear layers, none after the last.
            if idx > 0:
                layers.append(nn.ReLU())
            layers.append(nn.Linear(fan_in, fan_out))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values for every action given a batch of observations."""
        return self.net(x)

class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(s, a, r, s_next, done)`` transitions.

    Once ``capacity`` transitions are held, each push discards the oldest one.
    """

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, s, a, r, s_next, done):
        """Append one transition."""
        transition = (s, a, r, s_next, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five NumPy arrays (states, actions, rewards, next states, done flags),
            each with the batch as its leading dimension.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return tuple(map(np.array, (states, actions, rewards, next_states, dones)))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# -------------------------------------------------------------------------
# 4) DQN Training Loop with TensorBoard Logging and Reward Normalization
# -------------------------------------------------------------------------

def train_dqn(env, num_episodes=200, batch_size=32, gamma=0.99, lr=1e-5,
              epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99995, replay_capacity=10000):
    """Train a SimpleDQN on ``env`` with epsilon-greedy exploration, logging to TensorBoard.

    Rewards are standardized by a running ``RewardNormalizer`` before being stored in
    the replay buffer; the logged/printed episode reward is the raw (unnormalized) sum.
    The bootstrap target is computed with the same network that is being trained
    (there is no separate target network). Epsilon is decayed once per episode.

    Args:
        env: Environment exposing ``reset_gym()``, ``step(action)``, ``n_actions``,
            ``total_removed``, ``steps_taken``, ``termination_reason`` and
            ``fraction_outside_removed()``.
        num_episodes: Number of training episodes.
        batch_size: Replay minibatch size; updates start once the buffer holds this many.
        gamma: Discount factor.
        lr: Adam learning rate.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound on the exploration probability.
        epsilon_decay: Multiplicative per-episode decay applied to epsilon.
        replay_capacity: Maximum number of transitions kept in the replay buffer.

    Returns:
        The trained ``SimpleDQN`` (left on the training device).
    """
    # Use GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    obs_dim = len(env.reset_gym())
    n_actions = env.n_actions

    dqn = SimpleDQN(obs_dim, n_actions).to(device)
    optimizer = optim.Adam(dqn.parameters(), lr=lr)
    loss_fn = nn.MSELoss()  # stateless, so build it once rather than on every update
    replay = ReplayBuffer(capacity=replay_capacity)

    reward_normalizer = RewardNormalizer(momentum=0.99)

    log_dir = os.path.join("runs", f"dqn_milling_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
    writer = SummaryWriter(log_dir=log_dir)

    epsilon = epsilon_start
    update_count = 0

    term_counters = {"shape": 0, "out_of_bounds": 0, "max_steps": 0, "complete": 0}

    # try/finally guarantees the event file is flushed and closed even when a long
    # run is interrupted or fails part-way through.
    try:
        for ep in range(num_episodes):
            obs = env.reset_gym()
            done = False
            total_reward = 0.0
            steps_this_ep = 0

            while not done:
                # Epsilon-greedy action selection
                if random.random() < epsilon:
                    action = random.randint(0, n_actions - 1)
                else:
                    obs_t = torch.as_tensor(obs, dtype=torch.float32, device=device)
                    with torch.no_grad():
                        action = dqn(obs_t.unsqueeze(0)).argmax(dim=1).item()

                next_obs, reward, done, _ = env.step(action)
                reward_normalizer.update(reward)
                norm_reward = reward_normalizer.normalize(reward)
                total_reward += reward

                replay.push(obs, action, norm_reward, next_obs, done)

                obs = next_obs
                steps_this_ep += 1

                if len(replay) >= batch_size:
                    s_arr, a_arr, r_arr, s_next_arr, d_arr = replay.sample(batch_size)
                    s_ten = torch.as_tensor(s_arr, dtype=torch.float32, device=device)
                    a_ten = torch.as_tensor(a_arr, dtype=torch.long, device=device)
                    r_ten = torch.as_tensor(r_arr, dtype=torch.float32, device=device)
                    s_next_ten = torch.as_tensor(s_next_arr, dtype=torch.float32, device=device)
                    d_ten = torch.as_tensor(d_arr, dtype=torch.bool, device=device)

                    # Q(s, a) for the actions that were actually taken.
                    q_vals = dqn(s_ten)
                    q_s_a = q_vals.gather(1, a_ten.unsqueeze(1)).squeeze(1)

                    with torch.no_grad():
                        q_next = dqn(s_next_ten)
                        max_q_next, _ = torch.max(q_next, dim=1)
                        # Terminal transitions have no future value.
                        max_q_next[d_ten] = 0.0
                    target = r_ten + gamma * max_q_next

                    loss = loss_fn(q_s_a, target)
                    optimizer.zero_grad()
                    loss.backward()
                    # Optional: Gradient clipping can be applied here if necessary:
                    # torch.nn.utils.clip_grad_norm_(dqn.parameters(), max_norm=1.0)
                    optimizer.step()

                    update_count += 1
                    writer.add_scalar("Loss", loss.item(), update_count)

            epsilon = max(epsilon_end, epsilon * epsilon_decay)
            term_reason = env.termination_reason if env.termination_reason is not None else "max_steps"
            if term_reason in term_counters:
                term_counters[term_reason] += 1

            # Same metric the evaluation routine reports, taken from the environment helper.
            frac_removed = env.fraction_outside_removed()

            writer.add_scalar("EpisodeReward", total_reward, ep)
            writer.add_scalar("StepsPerEpisode", steps_this_ep, ep)
            writer.add_scalar("StockRemoved", env.total_removed, ep)
            writer.add_scalar("FractionRemoved", frac_removed, ep)
            writer.add_scalar("WorkerMoves", env.steps_taken, ep)
            writer.add_scalar("Epsilon", epsilon, ep)

            print(f"Episode {ep+1}/{num_episodes}: Reward={total_reward:.2f}, Steps={steps_this_ep}, "
                  f"Epsilon={epsilon:.2f}, StockRemoved={env.total_removed:.0f}, "
                  f"FractionRemoved={frac_removed:.3f}, Termination={term_reason}")

        # Final per-reason termination totals, each written as a single point at step 0.
        for reason, count in term_counters.items():
            writer.add_scalar(f"Terminations/{reason}", count, 0)
    finally:
        writer.close()

    print("Training complete.")
    return dqn

# -------------------------------------------------------------------------
# 5) Testing Routine: Evaluate over 100 Episodes
# -------------------------------------------------------------------------

def test_dqn(dqn, env, num_tests=100):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    total_frac = 0.0
    total_steps = 0

    for _ in range(num_tests):
        obs = env.reset_gym()
        done = False
        steps = 0

        while not done:
            obs_t = torch.FloatTensor(obs).to(device)
            with torch.no_grad():
                q_vals = dqn(obs_t.unsqueeze(0))
                action = q_vals.argmax(dim=1).item()
            obs, reward, done, _ = env.step(action)
            steps += 1

        frac_removed = env.fraction_outside_removed()  # fraction from 0 to 1
        total_frac += frac_removed
        total_steps += steps

    avg_frac = total_frac / num_tests
    avg_steps = total_steps / num_tests
    print(f"Average fraction of outside stock removed: {avg_frac*100:.2f}%")
    print(f"Average number of steps per episode: {avg_steps:.2f}")
    return avg_frac, avg_steps


In [None]:
# -------------------------------------------------------------------------
# 6) Main Script
# -------------------------------------------------------------------------

def main():
    """Train a DQN on an 8x8x8 milling task, then evaluate its greedy policy."""
    milling_env = MillingEnvironment(grid_size=8, min_radius=1, max_radius=3, max_steps=30)
    policy_net = train_dqn(milling_env, num_episodes=30000)
    print("Training finished. Running tests...")
    # The same environment instance is reused for evaluation; every episode resets it.
    test_dqn(policy_net, milling_env, num_tests=100)
    print("Done.")

# Run the train-then-evaluate pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

In [2]:
# now run tensorboard
!tensorboard --logdir=runs

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.19.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
