In [None]:
!pip install torch==2.1.0 torchvision==0.16.0
!pip install gym==0.26.2 numpy==1.23.5 matplotlib==3.7.1
!pip install fastapi==0.105.0 uvicorn==0.24.0.post1


In [101]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt
from collections import defaultdict
import time


In [102]:
class TILAIEnv:
    """Simplified single-agent grid world used for offline training.

    The grid is ``grid_size`` x ``grid_size`` with a fixed set of obstacles drawn
    once at construction. Every episode draws a random role (scout or guard), a
    random heading, a random obstacle-free start cell, and fresh recon/mission
    targets. Observations are dicts with the keys ``viewcone``, ``direction``,
    ``location``, ``scout`` and ``step``.

    NOTE(review): whether this matches the real testing environment is unverified.
    """

    def __init__(self, seed=None):
        """Build the obstacle layout and start the first episode.

        Args:
            seed: Optional seed for ``random`` and ``numpy.random``. It is applied
                before the obstacles are drawn so that the whole environment,
                including the obstacle layout, is reproducible.
        """
        self.grid_size = 16
        self.max_steps = 100
        # Seed first: previously the obstacles were drawn before reset() applied
        # the seed, so two environments built with the same seed differed.
        self._seed(seed)
        self.obstacles = {self._random_cell() for _ in range(30)}
        # The stream is already seeded; continue from it rather than re-seeding.
        self.reset()

    def _seed(self, seed):
        """Seed both RNGs used by the notebook; a ``None`` seed is a no-op."""
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

    def _random_cell(self):
        """Return a uniformly random ``(x, y)`` cell inside the grid."""
        hi = self.grid_size - 1
        return (random.randint(0, hi), random.randint(0, hi))

    def reset(self, seed=None):
        """Start a new episode and return its first observation.

        Args:
            seed: Optional seed applied before any episode state is drawn.

        Returns:
            The observation dict produced by ``_get_obs``.
        """
        self._seed(seed)

        self.step_count = 0
        self.is_scout = bool(random.randint(0, 1))  # True -> scout (1), False -> guard (0)
        self.direction = random.randint(0, 3)  # 0 right, 1 down, 2 left, 3 up

        # Rejection-sample a start cell that is not an obstacle.
        while True:
            self.location = list(self._random_cell())
            if tuple(self.location) not in self.obstacles:
                break

        # Targets are drawn independently; duplicates collapse in the sets and
        # targets may coincide with obstacles or with each other.
        self.recon_points = {self._random_cell() for _ in range(100)}
        self.missions = {self._random_cell() for _ in range(20)}
        self.visited = set()
        self.done = False
        return self._get_obs()

    def _get_obs(self):
        """Pack the current state into an observation dict.

        The viewcone is a 7x5 ``uint8`` array of zeros; this class does not
        populate it.
        """
        viewcone = np.zeros((7, 5), dtype=np.uint8)
        return {
            "viewcone": viewcone,
            "direction": self.direction,
            "location": self.location,
            "scout": int(self.is_scout),
            "step": self.step_count
        }

    def _min_manhattan_distance(self):
        """Return the Manhattan distance to the nearest role-relevant target.

        Scouts measure against recon points, guards against missions. Returns 0
        when the relevant target set is empty.
        """
        targets = self.recon_points if self.is_scout else self.missions
        if not targets:
            return 0
        return min(abs(self.location[0] - tx) + abs(self.location[1] - ty) for tx, ty in targets)

    def step(self, action):
        """Apply one action and return ``(observation, reward, done)``.

        Actions: 0 forward, 1 backward, 2 turn left, 3 turn right; any other
        value leaves position and heading unchanged. Moves are clamped to the
        grid and cancelled when the destination is an obstacle. Calling ``step``
        on a finished episode returns a zero reward and ``done=True``.
        """
        if self.done:
            return self._get_obs(), 0, True

        self.step_count += 1
        dx, dy = 0, 0
        if action == 0:
            dx, dy = self._move_vector(self.direction)
        elif action == 1:
            dx, dy = self._move_vector((self.direction + 2) % 4)
        elif action == 2:
            self.direction = (self.direction - 1) % 4
        elif action == 3:
            self.direction = (self.direction + 1) % 4

        # Clamp with plain ints so the location never carries NumPy scalars.
        hi = self.grid_size - 1
        new_location = [
            max(0, min(hi, int(self.location[0] + dx))),
            max(0, min(hi, int(self.location[1] + dy))),
        ]
        if tuple(new_location) not in self.obstacles:
            self.location = new_location

        loc_tuple = tuple(self.location)
        reward = -0.01  # per-step cost

        if loc_tuple in self.visited:
            reward -= 0.01  # extra cost for revisiting a cell
        else:
            self.visited.add(loc_tuple)

        if self.is_scout:
            if loc_tuple in self.recon_points:
                reward += 5
                self.recon_points.remove(loc_tuple)
            elif loc_tuple in self.missions:
                reward += 20
                self.missions.remove(loc_tuple)
            # 1% chance per step that the episode ends with a penalty.
            if random.random() < 0.01:
                reward -= 10
                self.done = True
        else:
            # 1% chance per step that the episode ends with a bonus.
            if random.random() < 0.01:
                reward += 30
                self.done = True

        # Small bonus that grows as the nearest relevant target gets closer.
        reward += 0.01 * (1 / (1 + self._min_manhattan_distance()))
        if self.step_count >= self.max_steps:
            self.done = True

        return self._get_obs(), reward, self.done

    def _move_vector(self, direction):
        """Map a heading (0 right, 1 down, 2 left, 3 up) to a ``(dx, dy)`` step."""
        return [(1, 0), (0, 1), (-1, 0), (0, -1)][direction]

    def render(self):
        """Return the grid as text ('X' obstacle, 'S' scout, 'G' guard, '.' other).

        Nothing is printed, so calling this inside the training loop stays
        silent; ``print(env.render())`` shows the grid on demand.
        """
        grid = [['.' for _ in range(self.grid_size)] for _ in range(self.grid_size)]
        for ox, oy in self.obstacles:
            grid[oy][ox] = 'X'
        x, y = self.location
        grid[y][x] = 'S' if self.is_scout else 'G'
        return "\n".join(" ".join(row) for row in grid)


In [103]:
class DQN(nn.Module):
    """Dueling Q-network: a shared trunk feeding a value head and an advantage head."""

    def __init__(self, input_dim, output_dim):
        """Create the trunk and both heads.

        Args:
            input_dim: Width of the flattened state vector.
            output_dim: Number of actions (width of the advantage head).
        """
        super().__init__()
        trunk_width, head_width = 256, 128
        self.feature = nn.Sequential(
            nn.Linear(input_dim, trunk_width),
            nn.ReLU(),
        )
        self.value = nn.Sequential(
            nn.Linear(trunk_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )
        self.advantage = nn.Sequential(
            nn.Linear(trunk_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, output_dim),
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, output_dim)`` for a ``(batch, input_dim)`` input."""
        shared = self.feature(x)
        state_value = self.value(shared)
        advantages = self.advantage(shared)
        # Subtract the per-row mean advantage so the two heads are identifiable.
        mean_advantage = advantages.mean(dim=1, keepdim=True)
        return state_value + advantages - mean_advantage


In [104]:
class PrioritizedReplayBuffer:
    """Fixed-capacity ring buffer that samples transitions in proportion to priority.

    Each stored item is the tuple ``(s, a, r, ns, d, role)``.
    """

    def __init__(self, capacity=100000, alpha=0.6, eps=1e-5):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept; the oldest is overwritten.
            alpha: Exponent applied to priorities when building sampling probabilities.
            eps: Small constant added to every updated priority so that a
                transition with zero TD error keeps a non-zero sampling
                probability and the probability vector never degenerates to NaN.
        """
        self.capacity = capacity
        self.buffer, self.priorities = [], []
        self.pos, self.alpha = 0, alpha
        self.eps = eps

    def push(self, s, a, r, ns, d, role):
        """Store one transition with the current maximum priority."""
        # New samples get the highest priority seen so they are replayed soon.
        max_prio = max(self.priorities, default=1.0)
        data = (s, a, r, ns, d, role)
        if len(self.buffer) < self.capacity:
            self.buffer.append(data)
            self.priorities.append(max_prio)
        else:
            # Buffer is full: overwrite the slot at the ring position.
            self.buffer[self.pos] = data
            self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions (with replacement) by priority.

        Args:
            batch_size: Number of transitions to draw.
            beta: Exponent of the importance-sampling correction.

        Returns:
            ``(s, a, r, ns, d, roles, weights, indices)`` as NumPy arrays, where
            ``weights`` are importance-sampling weights scaled so the largest is 1.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        scaled = np.asarray(self.priorities, dtype=np.float64) ** self.alpha
        total = scaled.sum()
        if total > 0:
            probs = scaled / total
        else:
            # Every priority is zero (only possible with eps=0): fall back to uniform.
            probs = np.full(len(scaled), 1.0 / len(scaled))

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[i] for i in indices]
        weights = ((len(self.buffer) * probs[indices]) ** -beta).astype(np.float32)
        weights /= weights.max()
        s, a, r, ns, d, roles = zip(*samples)
        return (np.array(s), np.array(a), np.array(r), np.array(ns),
                np.array(d), np.array(roles), weights, indices)

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities at ``indices`` with ``|priority| + eps``."""
        for i, p in zip(indices, priorities):
            self.priorities[i] = abs(float(p)) + self.eps

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)


In [105]:
class DQNAgent:
    """Two independent Double-DQN learners, one per role id (0 and 1).

    ``self.agents[role]`` is a dict with the keys ``model``, ``target`` and
    ``optimizer``.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, device=None):
        """Build both role-specific learners.

        Args:
            state_dim: Width of the flattened state vector.
            action_dim: Number of discrete actions.
            gamma: Discount factor.
            device: Optional torch device (or device string). Defaults to CUDA
                when available and CPU otherwise, so the agent also runs on
                machines without a GPU.
        """
        self.gamma = gamma
        self.action_dim = action_dim
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.agents = {
            0: self._build_agent(state_dim, action_dim),
            1: self._build_agent(state_dim, action_dim)
        }

    def _build_agent(self, state_dim, action_dim):
        """Create an online network, a target copy of it, and an Adam optimizer."""
        model = DQN(state_dim, action_dim).to(self.device)
        target = DQN(state_dim, action_dim).to(self.device)
        target.load_state_dict(model.state_dict())
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        return {"model": model, "target": target, "optimizer": optimizer}

    def act(self, state, role, epsilon=0.1):
        """Pick an action epsilon-greedily using the network for ``role``.

        With probability ``epsilon`` a uniformly random action in
        ``[0, action_dim)`` is returned; otherwise the greedy action.
        """
        if random.random() < epsilon:
            return random.randint(0, self.action_dim - 1)
        state_t = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            return self.agents[role]["model"](state_t).argmax().item()

    def update(self, buffer, batch_size=64, beta=0.4):
        """Run one Double-DQN step per role on a prioritized batch.

        The batch is sampled once and split by role; each role's network is
        trained on its own share, and the buffer priorities of those samples
        are overwritten with the absolute TD errors. Does nothing while the
        buffer holds fewer than ``batch_size`` transitions.
        """
        if len(buffer) < batch_size:
            return
        s, a, r, ns, d, roles, w, idx = buffer.sample(batch_size, beta)
        roles = np.asarray(roles)
        for role in (0, 1):
            ri = np.flatnonzero(roles == role)  # positions in the batch owned by this role
            if ri.size == 0:
                continue

            agent = self.agents[role]
            dev = self.device
            # Everything is float32 so rewards do not promote the loss to float64.
            s_t = torch.as_tensor(s[ri], dtype=torch.float32, device=dev)
            ns_t = torch.as_tensor(ns[ri], dtype=torch.float32, device=dev)
            a_t = torch.as_tensor(a[ri], dtype=torch.long, device=dev).unsqueeze(1)
            r_t = torch.as_tensor(r[ri], dtype=torch.float32, device=dev).unsqueeze(1)
            d_t = torch.as_tensor(d[ri], dtype=torch.float32, device=dev).unsqueeze(1)
            w_t = torch.as_tensor(w[ri], dtype=torch.float32, device=dev).unsqueeze(1)

            q_vals = agent["model"](s_t).gather(1, a_t)
            with torch.no_grad():
                # Double DQN: the online net picks the action, the target net scores it.
                next_actions = agent["model"](ns_t).argmax(1, keepdim=True)
                next_q = agent["target"](ns_t).gather(1, next_actions)
                expected = r_t + self.gamma * next_q * (1 - d_t)

            td = q_vals - expected
            loss = (w_t * td.pow(2)).mean()

            td_err = td.detach().abs().cpu().numpy().flatten()
            buffer.update_priorities(np.asarray(idx)[ri].tolist(), td_err)

            agent["optimizer"].zero_grad()
            loss.backward()
            agent["optimizer"].step()

            # Target network is re-synchronised with 1% probability per update.
            if random.random() < 0.01:
                agent["target"].load_state_dict(agent["model"].state_dict())

In [None]:
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Per-episode visit counter keyed by grid location; incremented in
# compute_shaped_reward and cleared by the training loop at each episode start.
visited_counts = defaultdict(int)
# Running minimum / maximum of every shaped reward passed to normalize_reward.
# They start at +inf / -inf and are never reset between episodes.
reward_min, reward_max = float('inf'), float('-inf')
# Last observed location per role id (0 and 1); drives the idle penalty.
last_position = {0: None, 1: None}

def flatten_obs(obs):
    """Turn an observation's viewcone into a 1-D float vector scaled by 1/255.

    Only ``obs['viewcone']`` is used; the other observation fields are ignored.
    """
    viewcone = np.asarray(obs['viewcone'])
    return viewcone.reshape(-1) / 255.0

def get_state_key(obs):
    """Return the observation's location as a hashable tuple."""
    coords = obs['location']
    return tuple(coords)

def normalize_reward(r):
    """Rescale ``r`` to [-1, 1] using the running min/max of all rewards seen.

    Updates the module-level ``reward_min`` / ``reward_max`` as a side effect.
    Returns 0.0 while the running minimum and maximum are still equal.
    """
    global reward_min, reward_max
    if r < reward_min:
        reward_min = r
    if r > reward_max:
        reward_max = r

    if reward_max == reward_min:
        # No spread observed yet, so there is nothing to scale against.
        return 0.0

    unit_position = (r - reward_min) / (reward_max - reward_min)
    return unit_position * 2 - 1

def compute_shaped_reward(obs, reward, role):
    """Add repetition, idle and proximity terms to ``reward`` and normalise it.

    Reads the module-level ``env`` for target positions and updates
    ``visited_counts`` and ``last_position`` as side effects.

    Args:
        obs: Observation dict; only ``obs['location']`` is used.
        reward: Raw reward returned by the environment for this step.
        role: Role id of the acting agent (1 is treated as scout).

    Returns:
        The shaped reward after ``normalize_reward``.
    """
    position = obs['location']

    # Repetition penalty grows with each repeat visit to the same cell.
    key = get_state_key(obs)
    visited_counts[key] += 1
    repeat_visits = visited_counts[key] - 1
    rep_penalty = -0.01 * repeat_visits

    # Idle penalty applies when the location did not change since the last call.
    idle_penalty = 0.0
    if last_position[role] == position:
        idle_penalty = -0.02
    last_position[role] = position

    # Scouts chase recon points and fall back to missions once those run out;
    # any other role measures against missions only.
    targets = (env.recon_points or env.missions) if role == 1 else env.missions

    dist_bonus = 0.0
    if targets:
        nearest = min(abs(position[0] - tx) + abs(position[1] - ty) for tx, ty in targets)
        dist_bonus = 0.01 * (1 / (1 + nearest))

    shaped = reward + rep_penalty + idle_penalty + dist_bonus
    return normalize_reward(shaped)

env = TILAIEnv()
# Size the network from a real observation. The previous hard-coded 40 did not
# match the 35 features (7x5 viewcone) that flatten_obs produces, which made the
# first network forward pass fail with a shape mismatch.
state_dim = flatten_obs(env.reset()).shape[0]
action_dim = 5
agent = DQNAgent(state_dim, action_dim)
buffer = PrioritizedReplayBuffer()

epsilon = 1.0
final_epsilon = 0.05
decay_steps = 10000
decay_rate = (epsilon - final_epsilon) / decay_steps
episodes = 5000
all_rewards = []
# NOTE(review): epsilon decays once per episode, so with decay_steps=10000 and
# episodes=5000 it only reaches about 0.525 and never final_epsilon. Confirm
# whether the decay was meant to be per environment step.

for ep in range(episodes):
    obs = env.reset()
    role = obs['scout']
    state = flatten_obs(obs)
    total_reward = 0
    done = False

    # Shaping state is per episode.
    visited_counts.clear()
    last_position[0] = None
    last_position[1] = None

    while not done:
        action = agent.act(state, role, epsilon)
        next_obs, reward, done = env.step(action)
        next_state = flatten_obs(next_obs)

        shaped_reward = compute_shaped_reward(next_obs, reward, role)

        buffer.push(state, action, shaped_reward, next_state, done, role)
        agent.update(buffer)

        if ep % 500 == 0:
            # Log the location reached by this step (the old code always
            # printed the episode's starting location).
            print(f"[Ep {ep}] Role: {role} | Loc: {next_obs['location']} | A: {action} | R: {reward:.2f} -> SR: {shaped_reward:.2f}")
            env.render()

        obs = next_obs
        state = next_state
        total_reward += reward

    epsilon = max(final_epsilon, epsilon - decay_rate)
    all_rewards.append(float(total_reward))

    if ep % 1000 == 0:
        avg_r = np.mean(all_rewards[-1000:])
        print(f"Ep {ep} | Avg Reward (last 1k): {avg_r:.2f} | Epsilon: {epsilon:.3f}")

plt.plot(all_rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Training Reward Curve")
plt.grid(True)
plt.show()


In [None]:
!pip install safetensors

import os
import json
import torch
from safetensors.torch import save_file

save_dir = "./model"
os.makedirs(save_dir, exist_ok=True)

# Role ids follow the environment and training loop: obs['scout'] is 1 for the
# scout and 0 for the guard. The previous mapping was inverted, so the guard's
# weights were saved under the "scout_" prefix and vice versa.
role_ids = {"scout": 1, "guard": 0}

flattened = {}
for role, role_id in role_ids.items():
    # Only the networks are exported. An optimizer state_dict has no tensors at
    # its top level ('state' and 'param_groups'), so it never contributed any
    # entries to this file.
    for part in ["model", "target"]:
        sd = agent.agents[role_id][part].state_dict()
        for key, tensor in sd.items():
            if isinstance(tensor, torch.Tensor):
                # Store contiguous CPU copies so the file does not depend on the training device.
                flattened[f"{role}_{part}.{key}"] = tensor.detach().cpu().contiguous()

save_file(flattened, os.path.join(save_dir, "model.safetensors"))

with open(os.path.join(save_dir, "config.json"), "w") as f:
    json.dump({
        "state_dim": state_dim,
        "action_dim": action_dim,
        "gamma": agent.gamma,
        "episodes": episodes,
        "final_epsilon": epsilon,  # epsilon value reached when training stopped
        "all_rewards": all_rewards  # optional: full list of rewards per episode
    }, f, indent=2)


In [110]:
class TILAIEnv:
    """Simplified single-agent TIL-AI grid world with a populated viewcone.

    The grid is ``grid_size`` x ``grid_size``. Every episode draws fresh
    obstacles, a random role (scout or guard), a random heading, an
    obstacle-free start cell, and recon/mission targets that avoid obstacles
    and the start cell. Observations are dicts with the keys ``viewcone``,
    ``direction``, ``location``, ``scout`` and ``step``.
    """

    def __init__(self, seed=None):
        """Initialize the TIL-AI environment and start the first episode.

        Args:
            seed: Optional seed forwarded to ``reset``.
        """
        self.grid_size = 16
        self.max_steps = 100
        self.obstacles = set()  # populated in reset()
        self.reset(seed=seed)

    def reset(self, seed=None):
        """Reset the environment for a new episode and return the first observation.

        Args:
            seed: Optional seed for ``random`` and ``numpy.random``, applied
                before any episode state is drawn.
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        self.step_count = 0
        self.is_scout = bool(random.randint(0, 1))  # scout is 1, guard is 0
        self.direction = random.randint(0, 3)  # 0: right, 1: down, 2: left, 3: up

        # Up to 30 obstacle cells (duplicates collapse in the set).
        self.obstacles = set()
        for _ in range(30):
            self.obstacles.add((random.randint(0, 15), random.randint(0, 15)))

        # Rejection-sample a start cell that is not an obstacle.
        while True:
            self.location = [random.randint(0, 15), random.randint(0, 15)]
            if tuple(self.location) not in self.obstacles:
                break

        # Recon points avoid obstacles and the start cell.
        self.recon_points = set()
        self.missions = set()

        for _ in range(100):
            while True:
                point = (random.randint(0, 15), random.randint(0, 15))
                if point not in self.obstacles and point != tuple(self.location):
                    self.recon_points.add(point)
                    break

        # Missions additionally avoid recon points.
        for _ in range(20):
            while True:
                point = (random.randint(0, 15), random.randint(0, 15))
                if point not in self.obstacles and point != tuple(self.location) and point not in self.recon_points:
                    self.missions.add(point)
                    break

        self.visited = set()
        self.done = False
        return self._get_obs()

    def _get_obs(self):
        """Build the observation dict, including a heading-aligned 7x5 viewcone.

        ``viewcone[i][j]`` describes the cell ``i - 2`` tiles along the current
        heading and ``j - 2`` tiles along the direction the agent would face
        after a right turn. The agent therefore sits at ``viewcone[2][2]`` and
        sees 2 tiles behind, 4 tiles ahead and 2 tiles to either side. The
        heading vectors come from ``_move_vector``, so the viewcone always
        agrees with the direction ``step`` moves in.

        Cell values: 0 outside the grid, 1 empty, 2 recon point, 3 mission,
        129 obstacle.
        """
        viewcone = np.zeros((7, 5), dtype=np.uint8)

        x, y = self.location
        fwd_x, fwd_y = self._move_vector(self.direction)
        # Lateral axis: the heading rotated by one right turn.
        lat_x, lat_y = self._move_vector((self.direction + 1) % 4)

        for i in range(7):
            forward = i - 2  # -2 (behind) .. 4 (ahead)
            for j in range(5):
                lateral = j - 2  # -2 .. 2
                nx = x + forward * fwd_x + lateral * lat_x
                ny = y + forward * fwd_y + lateral * lat_y

                if 0 <= nx < self.grid_size and 0 <= ny < self.grid_size:
                    if (nx, ny) in self.obstacles:
                        viewcone[i, j] = 129  # empty tile (1) with walls (128)
                    elif (nx, ny) in self.recon_points:
                        viewcone[i, j] = 2  # recon point
                    elif (nx, ny) in self.missions:
                        viewcone[i, j] = 3  # mission
                    else:
                        viewcone[i, j] = 1  # empty tile
                else:
                    viewcone[i, j] = 0  # no vision (out of bounds)

        return {
            "viewcone": viewcone.tolist(),
            "direction": self.direction,
            "location": self.location,
            "scout": int(self.is_scout),
            "step": self.step_count
        }

    def _min_manhattan_distance(self):
        """Calculate the minimum Manhattan distance to the role-relevant targets.

        Scouts measure against recon points, guards against missions. Returns 0
        when the relevant target set is empty.
        """
        targets = self.recon_points if self.is_scout else self.missions
        if not targets:
            return 0
        return min(abs(self.location[0] - tx) + abs(self.location[1] - ty) for tx, ty in targets)

    def step(self, action):
        """Apply one action and return ``(observation, reward, done)``.

        Actions: 0 forward, 1 backward, 2 turn left, 3 turn right, 4 stay.
        Moves are clamped to the grid and cancelled when the destination is an
        obstacle. Calling ``step`` on a finished episode returns a zero reward
        and ``done=True``.
        """
        if self.done:
            return self._get_obs(), 0, True

        self.step_count += 1

        dx, dy = 0, 0
        if action == 0:  # move forward
            dx, dy = self._move_vector(self.direction)
        elif action == 1:  # move backward
            dx, dy = self._move_vector((self.direction + 2) % 4)
        elif action == 2:  # turn left
            self.direction = (self.direction - 1) % 4
        elif action == 3:  # turn right
            self.direction = (self.direction + 1) % 4
        # action 4 is stay (do nothing)

        if action in [0, 1]:  # only forward/backward change the position
            # Plain ints keep the location free of NumPy scalars, so the
            # observation stays JSON-serialisable like the viewcone list.
            new_x = int(np.clip(self.location[0] + dx, 0, 15))
            new_y = int(np.clip(self.location[1] + dy, 0, 15))

            if (new_x, new_y) not in self.obstacles:
                self.location = [new_x, new_y]

        loc_tuple = tuple(self.location)

        # Base reward slightly negative to encourage efficient paths.
        reward = -0.01

        # Penalty for revisiting locations.
        if loc_tuple in self.visited:
            reward -= 0.01
        else:
            self.visited.add(loc_tuple)

        if self.is_scout:
            # Scout collects recon points.
            if loc_tuple in self.recon_points:
                reward += 1
                self.recon_points.remove(loc_tuple)

            # Scout completes missions.
            if loc_tuple in self.missions:
                reward += 5
                self.missions.remove(loc_tuple)

            # 1% chance per step that the scout is captured, ending the episode.
            if random.random() < 0.01:
                reward -= 50
                self.done = True

        else:
            # 1% chance per step that the guard captures the scout (simulated).
            if random.random() < 0.01:
                reward += 50
                self.done = True

        # Small bonus that grows as the nearest relevant target gets closer.
        reward += 0.01 * (1 / (1 + self._min_manhattan_distance()))

        if self.step_count >= self.max_steps:
            self.done = True

        return self._get_obs(), reward, self.done

    def _move_vector(self, direction):
        """Get the ``(dx, dy)`` movement vector for a heading (0 right, 1 down, 2 left, 3 up)."""
        return [(1, 0), (0, 1), (-1, 0), (0, -1)][direction]

    def render(self):
        """Return a text rendering of the current state.

        Legend: 'X' obstacle, 'R' recon point, 'M' mission, 'S' scout,
        'G' guard, '.' empty. Nothing is printed, so calling this inside the
        training loop stays silent; ``print(env.render())`` shows it on demand.
        """
        grid = [['.' for _ in range(self.grid_size)] for _ in range(self.grid_size)]

        for ox, oy in self.obstacles:
            grid[oy][ox] = 'X'

        for rx, ry in self.recon_points:
            grid[ry][rx] = 'R'

        for mx, my in self.missions:
            grid[my][mx] = 'M'

        x, y = self.location
        grid[y][x] = 'S' if self.is_scout else 'G'

        rows = "\n".join(" ".join(row) for row in grid)
        heading = ['Right', 'Down', 'Left', 'Up'][self.direction]
        return f"{rows}\nDirection: {heading}\nStep: {self.step_count}/{self.max_steps}"

In [125]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import os
import time
import math
import copy

# Dueling DQN architecture with LayerNorm feature layers and orthogonal weight initialisation
class ImprovedDQN(nn.Module):
    """Dueling Q-network with a two-layer LayerNorm trunk and orthogonal init."""

    def __init__(self, state_dim, action_dim):
        """Create the trunk, the value head and the advantage head.

        Args:
            state_dim: Width of the flattened state vector.
            action_dim: Number of actions (width of the advantage head).
        """
        super().__init__()
        trunk_width, head_width = 256, 64

        # LayerNorm normalises per sample, so it also works for a batch of one.
        self.feature = nn.Sequential(
            nn.Linear(state_dim, trunk_width),
            nn.LayerNorm(trunk_width),
            nn.ReLU(),
            nn.Linear(trunk_width, trunk_width),
            nn.LayerNorm(trunk_width),
            nn.ReLU(),
        )

        # Scalar state-value head.
        self.value = nn.Sequential(
            nn.Linear(trunk_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )

        # Per-action advantage head.
        self.advantage = nn.Sequential(
            nn.Linear(trunk_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, action_dim),
        )

        self._init_weights()

    def _init_weights(self):
        """Give every Linear layer orthogonal weights (gain 1) and zero biases."""
        linear_layers = [module for module in self.modules() if isinstance(module, nn.Linear)]
        for layer in linear_layers:
            nn.init.orthogonal_(layer.weight, gain=1)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_dim)`` for a ``(batch, state_dim)`` input."""
        shared = self.feature(x)
        state_value = self.value(shared)
        advantages = self.advantage(shared)
        # Dueling combination: centre the advantages on their per-row mean.
        mean_advantage = advantages.mean(1, keepdim=True)
        return state_value + advantages - mean_advantage

# Enhanced Prioritized Replay Buffer with separate buffers and better sampling
class StabilizedReplayBuffer:
    """Prioritized replay with one ring buffer per role and an annealed beta.

    Role ids are 0 (guard) and 1 (scout). Each stored item is the tuple
    ``(s, a, r, ns, d)``.
    """

    def __init__(self, capacity=100000, alpha=0.6, beta_start=0.4, beta_frames=100000):
        """Create empty per-role buffers.

        Args:
            capacity: Maximum number of transitions kept per role.
            alpha: Exponent applied to priorities when building sampling probabilities.
            beta_start: Initial importance-sampling exponent.
            beta_frames: Number of pushes over which beta grows linearly to 1.0.
        """
        self.capacity = capacity
        self.buffers = {
            0: {'data': [], 'priorities': []},  # Guard
            1: {'data': [], 'priorities': []}   # Scout
        }
        self.pos = {0: 0, 1: 0}
        self.alpha = alpha
        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame_idx = 0  # total number of pushes across both roles

    def get_beta(self):
        """Return the current importance-sampling exponent, capped at 1.0."""
        fraction = min(self.frame_idx / self.beta_frames, 1.0)
        return min(self.beta_start + fraction * (1.0 - self.beta_start), 1.0)

    def push(self, s, a, r, ns, d, role):
        """Store one transition for ``role`` with that role's current maximum priority."""
        self.frame_idx += 1
        buffer = self.buffers[role]

        # New samples get the highest priority seen (1.0 when the buffer is empty).
        max_prio = max(buffer['priorities'], default=1.0)
        data = (s, a, r, ns, d)

        if len(buffer['data']) < self.capacity:
            buffer['data'].append(data)
            buffer['priorities'].append(max_prio)
        else:
            # Buffer is full: overwrite the slot at the ring position.
            buffer['data'][self.pos[role]] = data
            buffer['priorities'][self.pos[role]] = max_prio

        self.pos[role] = (self.pos[role] + 1) % self.capacity

    def sample(self, batch_size, role):
        """Draw a prioritized batch for ``role``.

        Returns:
            ``None`` when the role's buffer holds fewer than ``batch_size``
            transitions; otherwise ``(s, a, r, ns, d, weights, indices)`` as
            NumPy arrays, with ``weights`` scaled so the largest is 1.
        """
        buffer = self.buffers[role]
        if len(buffer['data']) < batch_size:
            return None

        beta = self.get_beta()

        probs = np.array(buffer['priorities']) ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(buffer['data']), batch_size, p=probs)
        samples = [buffer['data'][i] for i in indices]

        weights = ((len(buffer['data']) * probs[indices]) ** -beta).astype(np.float32)
        weights /= weights.max()

        s, a, r, ns, d = zip(*samples)
        return (np.array(s), np.array(a), np.array(r), np.array(ns),
                np.array(d), weights, indices)

    def update_priorities(self, indices, priorities, role):
        """Overwrite priorities for ``role``, clipped to the range [0.01, 10.0]."""
        priorities = np.clip(priorities, 0.01, 10.0)
        for i, p in zip(indices, priorities):
            self.buffers[role]['priorities'][i] = p

    def __len__(self, role=None):
        """Return the number of stored transitions.

        With no argument (as used by the built-in ``len``) this is the total
        across both roles; previously ``role`` was mandatory, which made
        ``len(buffer)`` raise ``TypeError``. Passing a role id returns that
        role's count, as before.
        """
        if role is None:
            return sum(len(entry['data']) for entry in self.buffers.values())
        return len(self.buffers[role]['data'])

class RobustDQNAgent:
    def __init__(self, state_dim, action_dim, gamma=0.99):
        self.gamma = gamma
        
        # Role-specific configurations
        self.configs = {
            0: {  # Guard
                "lr": 2e-4,  # Increased from original
                "update_freq": 250,
                "grad_steps": 1
            },
            1: {  # Scout
                "lr": 5e-5,  # Lower for more stability
                "update_freq": 200,
                "grad_steps": 2
            }
        }
        
        # Build separate agents for scout and guard
        self.agents = {
            0: self._build_agent(state_dim, action_dim, self.configs[0]),  # Guard
            1: self._build_agent(state_dim, action_dim, self.configs[1])   # Scout
        }
        
        # Exploration parameters with slower decay
        self.epsilon = {
            0: 0.1,  # Guard starts with lower exploration
            1: 0.3   # Scout needs more exploration
        }
        self.epsilon_decay = {
            0: 0.9995,  # Slower decay
            1: 0.9998   # Even slower decay for scout
        }
        self.epsilon_final = {
            0: 0.05,
            1: 0.1   # Higher final epsilon for scout
        }
        
        # Training counters
        self.update_counter = {0: 0, 1: 0}
        
        # Historical models for self-play (new)
        self.historical_models = {
            0: [],  # Guard history
            1: []   # Scout history
        }
        self.historical_model_episodes = []
        
    def _build_agent(self, state_dim, action_dim, config):
        """Build agent components with improved configuration"""
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = ImprovedDQN(state_dim, action_dim).to(device)
        target = ImprovedDQN(state_dim, action_dim).to(device)
        target.load_state_dict(model.state_dict())
        
        # Freeze target network parameters
        for param in target.parameters():
            param.requires_grad = False
            
        # Use Adam with smaller epsilon for numerical stability
        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"], eps=1e-5)
        
        # Add learning rate scheduler for better convergence
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5000, gamma=0.5)
        
        return {
            "model": model, 
            "target": target, 
            "optimizer": optimizer,
            "scheduler": scheduler,
            "device": device,
            "config": config
        }
    
    # Store historical models for self-play (new method)
    def store_historical_model(self, episode):
        """Store a snapshot of current models for later self-play"""
        guard_model = copy.deepcopy(self.agents[0]["model"].state_dict())
        scout_model = copy.deepcopy(self.agents[1]["model"].state_dict())
        
        self.historical_models[0].append(guard_model)
        self.historical_models[1].append(scout_model)
        self.historical_model_episodes.append(episode)
        
        # Keep only last 5 historical models to save memory
        if len(self.historical_models[0]) > 5:
            self.historical_models[0].pop(0)
            self.historical_models[1].pop(0)
            self.historical_model_episodes.pop(0)
        
        print(f"Stored historical models at episode {episode}")
    
    # Act using either current or historical model (modified method)
    def act(self, state, role, evaluation=False, use_historical=False, historical_idx=None):
        """Select action using epsilon-greedy with optional historical model"""
        epsilon = 0.01 if evaluation else self.epsilon[role]
        device = self.agents[role]["device"]
        
        # Epsilon-greedy exploration
        if random.random() < epsilon:
            return random.randint(0, 4)
        
        # Convert state to tensor
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        
        # Use historical model if specified
        if use_historical and historical_idx is not None and len(self.historical_models[role]) > historical_idx:
            # Create temporary model with historical weights
            temp_model = ImprovedDQN(self.agents[role]["model"].feature[0].in_features, 
                                     self.agents[role]["model"].advantage[-1].out_features).to(device)
            temp_model.load_state_dict(self.historical_models[role][historical_idx])
            temp_model.eval()
            
            with torch.no_grad():
                q_values = temp_model(state)
                action = q_values.argmax().item()
                
            return action
        
        # Otherwise use current model
        self.agents[role]["model"].eval()
        with torch.no_grad():
            q_values = self.agents[role]["model"](state)
            action = q_values.argmax().item()
        self.agents[role]["model"].train()
        
        return action
        
    def decay_epsilon(self, role):
        """Decay epsilon according to role-specific schedule"""
        self.epsilon[role] = max(
            self.epsilon_final[role],
            self.epsilon[role] * self.epsilon_decay[role]
        )
            
    def update(self, buffer, role, batch_size=64):
        """Run one prioritized Double-DQN update for ``role``.

        A batch is drawn from ``buffer``; ``config["grad_steps"]`` optimizer
        steps are taken on that same batch; the batch's priorities are then
        refreshed from the last step's TD errors, the LR scheduler is stepped
        and the target network is synced every ``config["update_freq"]`` calls.

        Args:
            buffer: Replay buffer exposing ``sample(batch_size, role)`` (returns
                ``None`` or ``(s, a, r, ns, d, w, idx)``) and
                ``update_priorities(idx, td_err, role)``.
            role: Key into ``self.agents`` / ``self.update_counter``.
            batch_size: Number of transitions requested from the buffer.

        Returns:
            None. Returns early, changing nothing, when the buffer cannot
            provide a batch or when no gradient step is configured.
        """
        result = buffer.sample(batch_size, role)
        if result is None:
            return

        s, a, r, ns, d, w, idx = result
        agent = self.agents[role]
        device = agent["device"]
        config = agent["config"]

        # Pin every dtype explicitly. NumPy defaults to float64, and a float64
        # reward or weight would otherwise promote the TD target to double
        # while the network predicts float32.
        s_t = torch.tensor(s, dtype=torch.float32).to(device)
        ns_t = torch.tensor(ns, dtype=torch.float32).to(device)
        a_t = torch.tensor(a).long().unsqueeze(1).to(device)
        r_t = torch.tensor(r, dtype=torch.float32).unsqueeze(1).to(device)
        d_t = torch.tensor(d).unsqueeze(1).float().to(device)
        w_t = torch.tensor(w, dtype=torch.float32).unsqueeze(1).to(device)

        # Online network learns, target network only provides bootstrap values.
        agent["model"].train()
        agent["target"].eval()

        td_err = None
        for _ in range(config["grad_steps"]):
            # Q(s, a) for the actions that were actually taken.
            q_vals = agent["model"](s_t).gather(1, a_t)

            # Double DQN: the online net chooses the next action, the target
            # net scores it.
            with torch.no_grad():
                next_actions = agent["model"](ns_t).argmax(1, keepdim=True)
                next_q = agent["target"](ns_t).gather(1, next_actions)
                # Terminal transitions (d == 1) do not bootstrap.
                expected = r_t + self.gamma * next_q * (1 - d_t)

            # Per-sample Huber loss, scaled by the buffer's importance weights.
            loss = F.smooth_l1_loss(q_vals, expected, reduction='none')
            weighted_loss = (w_t * loss).mean()

            # Absolute TD error of this step; the last step's values become the
            # new priorities.
            with torch.no_grad():
                td_err = (q_vals - expected).abs().detach().cpu().numpy().flatten()

            agent["optimizer"].zero_grad()
            weighted_loss.backward()

            # Clip the global gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(agent["model"].parameters(), max_norm=1.0)

            agent["optimizer"].step()

        if td_err is None:
            # grad_steps <= 0: no optimizer step happened, so there is nothing
            # to re-prioritise, schedule or sync.
            return

        buffer.update_priorities(idx, td_err, role)

        agent["scheduler"].step()

        # Hard target sync on a fixed cadence.
        self.update_counter[role] += 1
        if self.update_counter[role] % config["update_freq"] == 0:
            agent["target"].load_state_dict(agent["model"].state_dict())
            
    def save(self, path_prefix="./modelnew/"):
        """Write the guard and scout networks to one safetensors file.

        The online ("model") and target networks of both roles are flattened
        into a single mapping with keys of the form ``"<role>_<part>.<param>"``
        and stored as ``<path_prefix>/model.safetensors``. The directory is
        created if needed.

        Saving is best effort: any failure (including a missing ``safetensors``
        package) is reported on stdout instead of being raised.

        Args:
            path_prefix: Directory that receives ``model.safetensors``.
        """
        try:
            from safetensors.torch import save_file
            os.makedirs(path_prefix, exist_ok=True)
            # Build the path once so the file written and the path reported
            # always agree, with or without a trailing slash on path_prefix.
            out_path = os.path.join(path_prefix, "model.safetensors")

            flattened = {}
            for role_name, role_id in [("guard", 0), ("scout", 1)]:
                for part in ["model", "target"]:
                    state_dict = self.agents[role_id][part].state_dict()
                    for key, tensor in state_dict.items():
                        if isinstance(tensor, torch.Tensor):
                            # Make the tensor contiguous before handing it to the serialiser.
                            flattened[f"{role_name}_{part}.{key}"] = tensor.contiguous()

            save_file(flattened, out_path)
            print(f"Safetensors model saved to {out_path}")
        except Exception as e:
            print(f"Error saving model: {e}")

    def load(self, path):
        """Restore guard and scout networks from a safetensors file.

        Expects keys of the form ``"<role>_<part>.<param>"`` with role in
        ``guard``/``scout`` and part in ``model``/``target``. Parameters that
        are absent from the file keep their current values; a warning reports
        how many were missing so a partial load is not mistaken for a full one.

        Loading is best effort: any failure is printed rather than raised.

        Args:
            path: Path to the ``.safetensors`` checkpoint.
        """
        try:
            from safetensors.torch import load_file
            loaded = load_file(path)

            missing_keys = []
            for role_name, role_id in [("guard", 0), ("scout", 1)]:
                for part in ["model", "target"]:
                    model = self.agents[role_id][part]
                    state_dict = model.state_dict()

                    # Overlay the stored tensors on the network's current state.
                    for key in state_dict:
                        loaded_key = f"{role_name}_{part}.{key}"
                        if loaded_key in loaded:
                            state_dict[key] = loaded[loaded_key]
                        else:
                            missing_keys.append(loaded_key)

                    model.load_state_dict(state_dict)

            if missing_keys:
                print(f"Warning: {len(missing_keys)} parameter(s) not found in {path} "
                      f"(first: {missing_keys[0]}); current values kept for those.")
            print(f"Model loaded from {path}")
        except Exception as e:
            print(f"Error loading model: {e}")
            
# Reward shaping: adjusts the raw environment reward with role-specific bonuses and penalties
def compute_improved_reward(obs, reward, role, env, current_phase=0):
    """Turn the raw environment reward into a shaped training reward.

    Scout (``role == 1``):
      * a reward of -40 or lower is replaced by -10,
      * a proximity bonus ``1.2 * exp(-0.15 * d)`` is added, where ``d`` is the
        Manhattan distance to the nearest recon point or mission,
      * +0.5 is added when ``d`` is smaller than it was from the previously
        stored scout position,
      * the total is multiplied by 1.5 when the raw reward is >= 1.
    Guard (any other role):
      * +0.1 flat, plus ``0.2 * exp(-0.2 * d)`` for the nearest mission,
      * the total is multiplied by 1.5 when the raw reward is > 5.
    Both roles then lose 10% of the absolute change against the previous shaped
    reward stored on ``env`` for that role, and from ``current_phase >= 1`` on
    receive a uniform random bonus in ``[0, 0.1)``.

    Side effects: writes ``env.last_scout_pos`` (scout only) and
    ``env.last_reward_<role>``.

    Args:
        obs: Observation dict with a ``'location'`` entry (x, y), or ``None``.
        reward: Raw reward returned by the environment.
        role: 1 for scout, anything else is treated as guard.
        env: Environment object; ``recon_points`` / ``missions`` are used when
            present.
        current_phase: Zero-based curriculum phase.

    Returns:
        The shaped reward, or ``reward`` unchanged when ``obs`` is ``None``.
    """
    if obs is None:
        print("Warning: Received None observation in compute_improved_reward. Returning original reward.")
        return reward

    def nearest_distance(pos, targets):
        # Manhattan distance from pos to the closest target (targets must be non-empty).
        return min(abs(pos[0] - tx) + abs(pos[1] - ty) for tx, ty in targets)

    agent_pos = obs['location']
    shaped_reward = reward

    if role == 1:  # scout
        # Soften the capture penalty.
        if reward <= -40:
            shaped_reward = -10

        if hasattr(env, 'recon_points') and hasattr(env, 'missions'):
            targets = env.recon_points.union(env.missions)
        else:
            targets = set()

        if targets:
            min_dist = nearest_distance(agent_pos, targets)

            # Smoothly decaying bonus for being close to something collectable.
            proximity_reward = 1.2 * math.exp(-0.15 * min_dist)
            shaped_reward += proximity_reward

            # Progress bonus when this step brought the scout closer than before.
            previous_pos = getattr(env, 'last_scout_pos', None)
            if previous_pos is not None and min_dist < nearest_distance(previous_pos, targets):
                shaped_reward += 0.5

        # Snapshot the position for the next call. list() copies lists, tuples
        # and arrays alike, whereas .copy() does not exist on tuples.
        env.last_scout_pos = list(agent_pos)

        # Amplify steps on which something was collected.
        if reward >= 1:
            shaped_reward *= 1.5

    else:  # guard
        # Small constant bonus to offset the guard's otherwise negative bias.
        shaped_reward += 0.1

        targets = env.missions if hasattr(env, 'missions') else set()
        if targets:
            min_dist = nearest_distance(agent_pos, targets)
            shaped_reward += 0.2 * math.exp(-0.2 * min_dist)

        # A large positive raw reward is assumed to be a capture; amplify it.
        if reward > 5:
            shaped_reward *= 1.5

    # Stability penalty: damp large swings relative to the previous shaped reward.
    role_key = f'last_reward_{role}'
    previous_reward = getattr(env, role_key, None)
    if previous_reward is not None:
        reward_delta = abs(shaped_reward - previous_reward)
        shaped_reward -= 0.1 * reward_delta

    # Stored before the random bonus below, so the noise is not carried over.
    setattr(env, role_key, shaped_reward)

    # Later curriculum phases get a small random exploration bonus.
    if current_phase >= 1:
        shaped_reward += 0.1 * random.random()

    return shaped_reward

# Enhanced flatten observation function with normalization and safety checks
def preprocess_observation(obs, viewcone_size=35):
    """Flatten an observation dict into one fixed-layout feature vector.

    Layout, in order: flattened viewcone scaled by 1/255, 4-way one-hot
    direction, scout flag, location scaled by 1/15, step scaled by 1/100.
    With a 7x5 viewcone this gives 35 + 4 + 1 + 2 + 1 = 43 features.

    A missing key is replaced by zeros of the matching width and a warning is
    printed.

    Args:
        obs: Observation dict, or ``None``.
        viewcone_size: Number of zeros substituted when ``'viewcone'`` is
            missing. Defaults to 35 (7 x 5) so the fallback vector has the
            same length as a real one.

    Returns:
        1-D ``numpy.ndarray`` of features, or ``None`` when ``obs`` is ``None``
        or any field cannot be converted (the error is printed).
    """
    if obs is None:
        print("Warning: Received None observation in preprocess_observation. Returning None.")
        return None

    try:
        # Viewcone cells, normalized to [0, 1].
        if 'viewcone' not in obs:
            print("Warning: 'viewcone' not found in observation. Using zeros.")
            flat_view = np.zeros(viewcone_size)
        else:
            flat_view = np.array(obs['viewcone']).flatten() / 255.0

        # Facing direction as a one-hot vector (stays all-zero when missing).
        direction_onehot = np.zeros(4)
        if 'direction' not in obs:
            print("Warning: 'direction' not found in observation. Using zeros.")
        else:
            direction_onehot[obs['direction']] = 1

        # Role flag: 1 for scout, 0 for guard.
        if 'scout' not in obs:
            print("Warning: 'scout' not found in observation. Using default False.")
            is_scout = np.array([0])
        else:
            is_scout = np.array([obs['scout']])

        # Grid position divided by the largest index (15).
        if 'location' not in obs:
            print("Warning: 'location' not found in observation. Using zeros.")
            location = np.zeros(2)
        else:
            location = np.array(obs['location']) / 15.0

        # Step counter divided by 100.
        if 'step' not in obs:
            print("Warning: 'step' not found in observation. Using zero.")
            step = np.array([0])
        else:
            step = np.array([obs['step'] / 100.0])

        return np.concatenate([flat_view, direction_onehot, is_scout, location, step])

    except Exception as e:
        # Malformed fields (e.g. an out-of-range direction) are reported, not raised.
        print(f"Error in preprocess_observation: {e}")
        return None

# Main training function with improved curriculum and historical self-play
def _reset_env_for_role(env, role):
    """Reset ``env`` and pin the new episode to ``role`` (1 = scout, 0 = guard).

    The role is applied both before and after ``env.reset()``: the reset may
    assign a role of its own, so setting it only beforehand is not enough. The
    returned observation's ``'scout'`` flag is patched to match.
    """
    env.is_scout = (role == 1)
    obs = env.reset()
    env.is_scout = (role == 1)
    if isinstance(obs, dict) and 'scout' in obs:
        obs['scout'] = int(role == 1)
    return obs


def _run_eval_episode(agent, env, role):
    """Play one evaluation-mode episode as ``role``; return the summed raw reward."""
    state = preprocess_observation(_reset_env_for_role(env, role))
    total_reward = 0
    done = False
    while not done:
        action = agent.act(state, role, evaluation=True)
        next_obs, step_reward, done = env.step(action)
        state = preprocess_observation(next_obs)
        total_reward += step_reward
    return total_reward


def train_with_curriculum(env_class, episodes=5000, save_interval=100):
    """Train the two-role DQN agent through a three-phase curriculum.

    Each episode is played as either scout (role 1) or guard (role 0). The
    curriculum phase sets the target scout ratio and the per-role learning
    rates; role selection forces the under-served role when the guard count
    falls behind or when one role has not been played for more than 15
    episodes. Transitions are stored with shaped rewards, while the logged
    episode reward is the raw environment reward.

    Every 250 episodes a snapshot is stored via
    ``agent.store_historical_model`` and an evaluation is run; the best
    evaluation score triggers a save to ``./model/best_model/``. The agent is
    also saved every ``save_interval`` episodes and once at the end.

    Args:
        env_class: Zero-argument callable returning an environment with
            ``reset()``, ``step(action) -> (obs, reward, done)`` and an
            ``is_scout`` attribute.
        episodes: Number of training episodes.
        save_interval: Save the agent every this many episodes.

    Returns:
        The trained agent.
    """
    env = env_class()
    # 35 viewcone cells + 4 direction + 1 role + 2 location + 1 step,
    # i.e. the length produced by preprocess_observation.
    state_dim = 43
    action_dim = 5

    agent = RobustDQNAgent(state_dim=state_dim, action_dim=action_dim)
    buffer = StabilizedReplayBuffer(capacity=100000)

    # Sliding windows (last 50 episodes) of raw episode rewards.
    window_size = 50
    rewards_window = {
        'all': deque(maxlen=window_size),
        'scout': deque(maxlen=window_size),
        'guard': deque(maxlen=window_size),
    }
    scout_captures = 0
    scout_collections = 0
    guard_captures = 0

    start_time = time.time()

    # Phase lengths, target scout share and per-role learning rates.
    curriculum = [
        {'episodes': 1000, 'scout_ratio': 0.6, 'scout_lr': 5e-5, 'guard_lr': 2e-4},
        {'episodes': 2000, 'scout_ratio': 0.5, 'scout_lr': 4e-5, 'guard_lr': 1.5e-4},
        {'episodes': 2000, 'scout_ratio': 0.4, 'scout_lr': 3e-5, 'guard_lr': 1e-4}
    ]

    current_phase = 0
    phase_progress = 0
    phase_role_counts = {0: 0, 1: 0}   # episodes per role within the current phase
    total_role_counts = {0: 0, 1: 0}   # episodes per role over the whole run

    episodes_since_guard = 0
    episodes_since_scout = 0

    historical_snapshot_interval = 250
    evaluation_interval = 250
    best_eval_reward = -float('inf')

    for ep in range(1, episodes + 1):
        # Advance to the next curriculum phase once the current one is used up.
        if (current_phase < len(curriculum) - 1
                and phase_progress >= curriculum[current_phase]['episodes']):
            current_phase += 1
            phase_progress = 0
            phase_role_counts = {0: 0, 1: 0}
            print(f"Moving to curriculum phase {current_phase+1}")

            # Apply the new phase's learning rates to both optimizers.
            for role in [0, 1]:
                lr_key = 'scout_lr' if role == 1 else 'guard_lr'
                new_lr = curriculum[current_phase][lr_key]
                for param_group in agent.agents[role]["optimizer"].param_groups:
                    param_group['lr'] = new_lr

        if ep % historical_snapshot_interval == 0:
            agent.store_historical_model(ep)

        scout_ratio = curriculum[current_phase]['scout_ratio']
        expected_guard_count = phase_progress * (1 - scout_ratio)

        if ep % 10 == 0:
            print(f"Phase {current_phase+1}, Progress: {phase_progress}/{curriculum[current_phase]['episodes']}")
            print(f"Scout count: {phase_role_counts[1]}, Guard count: {phase_role_counts[0]}")
            print(f"Scout ratio: {scout_ratio}, Expected guard count: {expected_guard_count:.1f}")

        # Role selection: force a guard episode when guards are behind schedule
        # or overdue, force a scout episode when scouts are overdue, otherwise
        # draw according to the phase's scout ratio.
        if phase_role_counts[0] < expected_guard_count - 5 or episodes_since_guard > 15:
            role = 0
            episodes_since_guard = 0
            episodes_since_scout += 1
        elif episodes_since_scout > 15:
            role = 1
            episodes_since_scout = 0
            episodes_since_guard += 1
        elif random.random() < scout_ratio:
            role = 1
            episodes_since_guard += 1
            episodes_since_scout = 0
        else:
            role = 0
            episodes_since_scout += 1
            episodes_since_guard = 0

        phase_progress += 1
        phase_role_counts[role] += 1
        total_role_counts[role] += 1

        # NOTE(review): a historical snapshot is sampled and logged here, but
        # the index is never passed to agent.act below, so the episode is still
        # played by the current model. Confirm the intended self-play wiring.
        use_historical_opponent = (len(agent.historical_models[0]) > 0 and
                                  random.random() < 0.2)  # 20% chance
        historical_idx = random.randint(0, len(agent.historical_models[0])-1) if use_historical_opponent else None

        if use_historical_opponent:
            print(f"Using historical opponent from episode {agent.historical_model_episodes[historical_idx]}")

        # Start the episode in the chosen role.
        obs = _reset_env_for_role(env, role)
        # Reward-shaping state must not leak across episodes: the previous
        # episode's last position and last shaped reward are unrelated to
        # this one.
        env.last_scout_pos = None
        env.last_reward_0 = None
        env.last_reward_1 = None

        state = preprocess_observation(obs)
        episode_reward = 0

        initial_recon_count = len(env.recon_points) if hasattr(env, 'recon_points') else 0
        initial_mission_count = len(env.missions) if hasattr(env, 'missions') else 0

        done = False
        steps = 0

        while not done:
            action = agent.act(state, role)
            next_obs, reward, done = env.step(action)
            next_state = preprocess_observation(next_obs)

            # The buffer learns from the shaped reward ...
            shaped_reward = compute_improved_reward(next_obs, reward, role, env, current_phase)
            buffer.push(state, action, shaped_reward, next_state, done, role)

            # Update every step early on and for guards; every other step for
            # scouts from episode 1000 onwards.
            update_freq = 1 if ep < 1000 or role == 0 else 2
            if steps % update_freq == 0:
                agent.update(buffer, role)

            if ep % 100 == 0 and steps == 0:
                print(f"[Ep {ep}] Role: {role} | Loc: {obs['location']} | Dir: {obs['direction']}")

            state = next_state
            # ... while the reported episode reward stays the raw one.
            episode_reward += reward
            steps += 1

        # Outcome counters, judged from the final step's raw reward.
        if role == 1:
            if reward <= -40:  # captured
                scout_captures += 1

            if hasattr(env, 'recon_points') and hasattr(env, 'missions'):
                recon_collected = initial_recon_count - len(env.recon_points)
                missions_completed = initial_mission_count - len(env.missions)
                if recon_collected > 0 or missions_completed > 0:
                    scout_collections += 1
        else:
            # A large positive final reward is assumed to be a capture.
            if reward > 5:
                guard_captures += 1

        agent.decay_epsilon(role)

        rewards_window['all'].append(episode_reward)
        rewards_window['scout' if role == 1 else 'guard'].append(episode_reward)

        if ep % 100 == 0:
            elapsed = time.time() - start_time

            avg_r = np.mean(rewards_window['all']) if rewards_window['all'] else 0
            avg_scout = np.mean(rewards_window['scout']) if rewards_window['scout'] else 0
            avg_guard = np.mean(rewards_window['guard']) if rewards_window['guard'] else 0

            print(f"Ep {ep}/{episodes} | "
                  f"Time: {elapsed:.1f}s | "
                  f"Avg Reward: {avg_r:.2f} | "
                  f"Scout: {avg_scout:.2f} | "
                  f"Guard: {avg_guard:.2f} | "
                  f"Scout Captures: {scout_captures} | "
                  f"Scout Collections: {scout_collections} | "
                  f"Guard Captures: {guard_captures} | "
                  f"Epsilon (S/G): {agent.epsilon[1]:.3f}/{agent.epsilon[0]:.3f}")

        if ep % evaluation_interval == 0:
            # Five scout episodes followed by five guard episodes.
            print("Evaluating against current opponent models...")
            eval_scout_rewards = [_run_eval_episode(agent, env, 1) for _ in range(5)]
            eval_guard_rewards = [_run_eval_episode(agent, env, 0) for _ in range(5)]
            eval_rewards = eval_scout_rewards + eval_guard_rewards

            avg_eval_reward = np.mean(eval_rewards)
            avg_eval_scout = np.mean(eval_scout_rewards) if eval_scout_rewards else 0
            avg_eval_guard = np.mean(eval_guard_rewards) if eval_guard_rewards else 0

            print(f"Current Opponent Evaluation: Overall: {avg_eval_reward:.2f}, Scout: {avg_eval_scout:.2f}, Guard: {avg_eval_guard:.2f}")

            if len(agent.historical_models[0]) > 0:
                # NOTE(review): these episodes also act with the current model;
                # no historical snapshot is selected in agent.act. Confirm
                # whether a snapshot should be used here.
                print("Evaluating against historical opponent models...")
                historical_eval_rewards = [
                    _run_eval_episode(agent, env, 1 if i < 3 else 0) for i in range(6)
                ]

                avg_historical_eval = np.mean(historical_eval_rewards)
                print(f"Historical Opponent Evaluation: {avg_historical_eval:.2f}")

                # Combined score: 80% current, 20% historical.
                combined_eval = 0.8 * avg_eval_reward + 0.2 * avg_historical_eval
                print(f"Combined Evaluation Score: {combined_eval:.2f}")

                if combined_eval > best_eval_reward:
                    best_eval_reward = combined_eval
                    agent.save("./model/best_model/")
                    print(f"New best model saved! Combined Reward: {best_eval_reward:.2f}")
            else:
                if avg_eval_reward > best_eval_reward:
                    best_eval_reward = avg_eval_reward
                    agent.save("./model/best_model/")
                    print(f"New best model saved! Reward: {best_eval_reward:.2f}")

        if ep % save_interval == 0:
            agent.save()

    agent.save()
    # Whole-run totals (the per-phase counters restart at every phase change).
    print(f"Training complete! Scout episodes: {total_role_counts[1]}, "
          f"Guard episodes: {total_role_counts[0]}")
    return agent

if __name__ == "__main__":
    # Seed every random number generator in use (PyTorch, NumPy, stdlib) so runs are repeatable.
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)

    # Launch the curriculum training run and keep the trained agent around.
    agent = train_with_curriculum(env_class=TILAIEnv, episodes=5000, save_interval=100)

Phase 1, Progress: 9/1000
Scout count: 7, Guard count: 2
Scout ratio: 0.6, Expected guard count: 3.6
Phase 1, Progress: 19/1000
Scout count: 12, Guard count: 7
Scout ratio: 0.6, Expected guard count: 7.6
Phase 1, Progress: 29/1000
Scout count: 19, Guard count: 10
Scout ratio: 0.6, Expected guard count: 11.6
Phase 1, Progress: 39/1000
Scout count: 25, Guard count: 14
Scout ratio: 0.6, Expected guard count: 15.6
Phase 1, Progress: 49/1000
Scout count: 31, Guard count: 18
Scout ratio: 0.6, Expected guard count: 19.6
Phase 1, Progress: 59/1000
Scout count: 39, Guard count: 20
Scout ratio: 0.6, Expected guard count: 23.6
Phase 1, Progress: 69/1000
Scout count: 46, Guard count: 23
Scout ratio: 0.6, Expected guard count: 27.6
Phase 1, Progress: 79/1000
Scout count: 50, Guard count: 29
Scout ratio: 0.6, Expected guard count: 31.6
Phase 1, Progress: 89/1000
Scout count: 56, Guard count: 33
Scout ratio: 0.6, Expected guard count: 35.6
Phase 1, Progress: 99/1000
Scout count: 62, Guard count: 37
S