In [5]:
# Add this to the first cell of your notebook
%load_ext autoreload
%autoreload 2  # Reload all modules (except those excluded) before executing code

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = ''   # comment this line if you want GPU again

In [4]:
# Mount Google Drive at /content/drive so the project folder used below is
# reachable; force_remount=True is passed on every run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
!pip install tqdm psutil plotly kaleido --quiet
import os
import sys
import random
import time
import threading
import IPython
from google.colab import output
from datetime import datetime

In [8]:
# Ensure version compatibility with local setup
print("Installing compatible package versions...")

# Install specific versions to match local setup
!pip install torch==2.5.1 numpy==2.0.1 --quiet
import numpy as np


# Verify PyTorch and NumPy versions after installation
!python -c "import torch; print(f'PyTorch post-install: {torch.__version__}')"
!python -c "import numpy; print(f'NumPy post-install: {numpy.__version__}')"

# Force CUDA setup for PyTorch
import torch
print(f"CUDA setup: available={torch.cuda.is_available()}, device count={torch.cuda.device_count() if torch.cuda.is_available() else 0}")
if torch.cuda.is_available():
    print(f"Current CUDA device: {torch.cuda.current_device()}, name: {torch.cuda.get_device_name()}")

Installing compatible package versions...
PyTorch post-install: 2.5.1+cu124
NumPy post-install: 2.0.1
CUDA setup: available=True, device count=1
Current CUDA device: 0, name: NVIDIA L4


In [9]:
# Set path to your project on Google Drive
DRIVE_PATH = '/content/drive/MyDrive/CatanRL'

# Change to the project directory. IPython substitutes {DRIVE_PATH} with the
# Python variable above. Relative paths used later in the notebook
# (models/, logs/, plots/) therefore resolve inside this folder.
%cd {DRIVE_PATH}



/content/drive/MyDrive/CatanRL


In [10]:
import time, threading
from google.colab import output

# 1. Define a dummy no‑op Python callback.
def _noop() -> str:
    """Do nothing and return the constant string "ok"."""
    return "ok"

# 2. Register it once – gives us a handle "keep_alive"
#    (the same name the keep-alive loop passes to invokeFunction).
output.register_callback('keep_alive', _noop)

def keep_colab_alive(interval_sec: int = 60, stop_event: "threading.Event | None" = None):
    """Ping the Colab front‑end every ``interval_sec`` seconds.

    Each ping asks the page (through ``output.eval_js``) to invoke the Python
    callback registered under the name "keep_alive". A "♥" is printed after
    every ping that completes without raising.

    The original author notes this worked in Colab as of 2025‑04; that is not
    something this function can verify.

    Args:
        interval_sec: Seconds to wait between two pings.
        stop_event: Optional ``threading.Event``. When provided, the loop ends
            as soon as the event is set, which lets another cell shut the
            pinger down. When omitted, the loop never returns (previous
            behaviour).

    Returns:
        None. The function only returns once ``stop_event`` has been set.
    """
    # A private, never-set event stands in when the caller supplies none, so a
    # single wait() call acts both as the sleep and as the stop check.
    stopper = stop_event if stop_event is not None else threading.Event()
    while not stopper.is_set():
        try:
            # JS in the page calls the Python no‑op; the round‑trip is what matters
            output.eval_js('google.colab.kernel.invokeFunction("keep_alive", [], {})')
            print("♥", end="", flush=True)
        except Exception:
            # Deliberate best effort: if the socket was momentarily closed,
            # skip this ping and retry after the next interval.
            pass
        stopper.wait(interval_sec)

# Start the pinger only once: re-running this cell must not stack additional
# background threads. The thread is identified by its name.
KEEP_ALIVE_THREAD_NAME = "colab-keep-alive"
if any(t.name == KEEP_ALIVE_THREAD_NAME for t in threading.enumerate()):
    print("Keep‑alive thread already running …")
else:
    print("Starting keep‑alive thread …")
    threading.Thread(target=keep_colab_alive, name=KEEP_ALIVE_THREAD_NAME, daemon=True).start()


Starting keep‑alive thread …


In [None]:
# Identifier of this run; it names the timestamped result folders created at
# the end of the cell.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# This notebook keeps training on the CPU.
device = torch.device("cpu")


def set_random_seeds(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with one common value.

    Args:
        seed: Integer handed to each of the three generators (default 42).
    """
    # CUDA/cuDNN determinism flags are not configured here; the run uses the
    # CPU device selected above.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


# Seed everything before any stochastic work starts.
set_random_seeds()

# Step 5: Set up training parameters
# You can customize these parameters
import argparse

# Presets applied by the mode flags. This table is the single source of truth:
# both the --help text and the values written into `args` are derived from it,
# so the two cannot drift apart. Order matters -- when several mode flags are
# given, the first one listed here wins.
MODE_PRESETS = {
    "quick":     {"iterations": 1,   "games": 2,  "sims": 10,  "eval_games": 2},
    "medium":    {"iterations": 10,  "games": 5,  "sims": 50,  "eval_games": 5},
    "full":      {"iterations": 50,  "games": 20, "sims": 100, "eval_games": 10},
    "overnight": {"iterations": 300, "games": 22, "sims": 150, "eval_games": 10},
}

# argparse serves as a typed settings container here: parse_args() below is
# given an explicit list, so the kernel's own command line is never read.
parser = argparse.ArgumentParser(description="AlphaZero Catan Training")
parser.add_argument("--iterations", type=int, default=50, help="Number of training iterations")
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint to resume from")
parser.add_argument("--games", type=int, default=20, help="Number of self-play games per iteration")
parser.add_argument("--sims", type=int, default=100, help="Number of MCTS simulations per move")
parser.add_argument("--eval-games", type=int, default=10, help="Number of evaluation games")
for mode_name, mode_preset in MODE_PRESETS.items():
    iteration_word = "iteration" if mode_preset["iterations"] == 1 else "iterations"
    parser.add_argument(
        f"--{mode_name}",
        action="store_true",
        help=(
            f"{mode_name.capitalize()} training "
            f"({mode_preset['iterations']} {iteration_word}, {mode_preset['games']} games)"
        ),
    )

# Settings for this run: overnight preset, resuming from a saved checkpoint.
args = parser.parse_args(['--overnight', '--resume', 'models/model_iter_280.pt'])
# Alternative: medium run started from scratch (no resume).
# args = parser.parse_args(['--medium'])

# Configure training mode. A preset overwrites iterations/games/sims/eval_games,
# including values passed explicitly alongside the mode flag.
active_mode = next((name for name in MODE_PRESETS if getattr(args, name)), None)
if active_mode is not None:
    print(f"Running in {active_mode.upper()} mode")
    for option_name, option_value in MODE_PRESETS[active_mode].items():
        setattr(args, option_name, option_value)

print("\n=== AlphaZero Catan Training ===")
print(f"Iterations: {args.iterations}")
print(f"Self-play games per iteration: {args.games}")
print(f"MCTS simulations per move: {args.sims}")
print(f"Resume from: {args.resume if args.resume else 'Starting fresh'}")

# Step 6: Get configuration and apply this run's settings (training runs on CPU)
from AlphaZero.utils.config import get_config
config = get_config()

# Customize config with command line arguments
config['num_iterations'] = args.iterations
config['self_play_games'] = args.games
config['num_simulations'] = args.sims
config['eval_games'] = args.eval_games
config['device'] = 'cpu'

# The first iteration to run is 1 for a fresh start. When resuming, it is taken
# from the checkpoint file name: the text between the last "_" and the next "."
# is read as the last saved iteration N (e.g. "model_iter_280.pt" -> 280) and
# training continues at N + 1.
if args.resume is None:
    config['starting_iter'] = 1
else:
    try:
        last_saved_iter = int(args.resume.split('_')[-1].split('.')[0])
    except ValueError as exc:
        # Same exception type as before, but with a message that names the
        # offending path instead of a bare int() parsing error.
        raise ValueError(
            f"Cannot infer the iteration number from checkpoint path {args.resume!r}; "
            "expected a file name ending in '_<N>.<ext>', e.g. 'model_iter_280.pt'"
        ) from exc
    config['starting_iter'] = last_saved_iter + 1

# Step 7: Create logs, models and plots directories (relative to the current
# working directory). Directories that already exist are left untouched.
for output_dir in ("logs", "models", "plots"):
    os.makedirs(output_dir, exist_ok=True)

# Step 8: Start the training
from AlphaZero.training.training_pipeline import TrainingPipeline

# Reset before entering the try block: the interrupt handler below must be able
# to tell whether a pipeline exists, and must never save a stale `pipeline`
# object left over from an earlier run of this cell.
pipeline = None

try:
    # Start time tracking
    start_time = time.time()

    # Create the training pipeline
    pipeline = TrainingPipeline(config)

    # Train for the specified iterations
    pipeline.train(args.iterations, resume_from=args.resume)

    # Break the elapsed wall-clock time down into whole hours/minutes/seconds
    total_time = time.time() - start_time
    minutes, seconds = divmod(int(total_time), 60)
    hours, minutes = divmod(minutes, 60)

    print(f"\nTraining completed in {hours}h {minutes}m {seconds}s")

except KeyboardInterrupt:
    if pipeline is None:
        # Interrupted while the pipeline was still being constructed.
        print("\nTraining interrupted before the pipeline was created; nothing to save.")
    else:
        print("\nTraining interrupted! Saving checkpoint...")
        pipeline.save_model(pipeline.current_iteration)
        print("Checkpoint saved. You can resume with this checkpoint later.")
except Exception as e:
    # Report the failure but let the cell continue, so the result snapshot
    # that follows still runs.
    print(f"Error during training: {e}")
    import traceback
    traceback.print_exc()

# Step 9: Copy results back to Google Drive
!mkdir -p {DRIVE_PATH}/models_{timestamp}
!mkdir -p {DRIVE_PATH}/logs_{timestamp}
!mkdir -p {DRIVE_PATH}/plots_{timestamp}

!cp -r models/* {DRIVE_PATH}/models_{timestamp}/
!cp -r logs/* {DRIVE_PATH}/logs_{timestamp}/
!cp -r plots/* {DRIVE_PATH}/plots_{timestamp}/
""
print(f"\nTraining results saved to Google Drive in folders with timestamp {timestamp}")

Running in OVERNIGHT mode

=== AlphaZero Catan Training ===
Iterations: 300
Self-play games per iteration: 22
MCTS simulations per move: 150
Resume from: models/model_iter_280.pt
[2025-04-22 16:42:53] AlphaZero Catan Training started at 20250422_164253
[2025-04-22 16:42:53] Configuration: {'state_dim': 992, 'action_dim': 200, 'hidden_dim': 256, 'learning_rate': 0.001, 'num_iterations': 300, 'self_play_games': 22, 'eval_games': 10, 'epochs': 10, 'batch_size': 256, 'buffer_size': 50000, 'num_simulations': 150, 'c_puct': 1.5, 'mcts_batch_size': 12, 'noise_eps': 0.25, 'noise_alpha': 0.3, 'max_moves': 200, 'device': 'cpu', 'placement_epochs': 10, 'placement_batch_size': 32, 'placement_lr': 0.001, 'placement_hidden_dim': 128, 'placement_train_frequency': 5, 'train_placement_network': True, 'use_placement_network': False, 'model_dir': 'models', 'starting_iter': 281}
[2025-04-22 16:42:53] Loading default placement network from models/placement_network.pt


  checkpoint = torch.load(placement_path, map_location=torch.device('cpu'))


[2025-04-22 16:42:54] Network dimensions: input_dim=260, hidden_dim=128, output_dim=54
[2025-04-22 16:42:54] Successfully loaded default placement network
[2025-04-22 16:42:55] Placement network components initialized successfully
[2025-04-22 16:42:55] Initial placement network training is ENABLED
[2025-04-22 16:42:55] Placement network settings:
[2025-04-22 16:42:55]   - Training epochs: 10
[2025-04-22 16:42:55]   - Batch size: 32
[2025-04-22 16:42:55]   - Learning rate: 0.001
[2025-04-22 16:42:55]   - Hidden dimensions: 128
[2025-04-22 16:42:55]   - Training frequency: Every 5 iterations


  checkpoint = torch.load(path)


[2025-04-22 16:42:59] Backup buffer loaded: 50000 examples, 201.6 MB
[2025-04-22 16:42:59] Checkpoint loaded from models/model_iter_280.pt, resuming from iteration 280
[2025-04-22 16:42:59] Resuming training from iteration 280
[2025-04-22 16:42:59] 
=== Iteration 281/300 ===
[2025-04-22 16:42:59] Starting self-play...


  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')
  state_dict = torch.load(checkpoint_path, map_location='cpu')


[Worker 0] Network parameter sum: 22125.208540
Win reward: base=1.00, time_bonus=0.11 (moves: 157/200)


Self-play games:  55%|█████▍    | 12/22 [03:46<03:36, 21.66s/it]

Win reward: base=1.00, time_bonus=0.13 (moves: 149/200)


Self-play games:  59%|█████▉    | 13/22 [04:15<03:35, 23.99s/it]

Win reward: base=1.00, time_bonus=0.16 (moves: 137/200)


Self-play games:  73%|███████▎  | 16/22 [04:22<00:59,  9.85s/it]

Win reward: base=1.00, time_bonus=0.02 (moves: 193/200)


Self-play games:  95%|█████████▌| 21/22 [04:55<00:07,  7.93s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [05:00<00:00, 13.66s/it]


[2025-04-22 16:47:59] Self-play completed in 300.48s, generated 5181 examples (17.2 games/s)
[2025-04-22 16:47:59] Processing 4 games for initial placement data
[2025-04-22 16:47:59] Extracted 720 initial placement examples
[2025-04-22 16:47:59] Skipping placement network training for iteration 281 (training every 5 iterations)
[2025-04-22 16:47:59] Training network...
Epoch 1/10: Loss 1.8781 (Value 0.0381, Policy 1.8400)
Epoch 2/10: Loss 1.8460 (Value 0.0252, Policy 1.8208)
Epoch 3/10: Loss 1.8234 (Value 0.0222, Policy 1.8012)
Epoch 4/10: Loss 1.8258 (Value 0.0170, Policy 1.8088)
Epoch 5/10: Loss 1.8045 (Value 0.0133, Policy 1.7912)
Epoch 6/10: Loss 1.7876 (Value 0.0115, Policy 1.7761)
Epoch 7/10: Loss 1.7918 (Value 0.0101, Policy 1.7817)
Epoch 8/10: Loss 1.7737 (Value 0.0097, Policy 1.7640)
Epoch 9/10: Loss 1.7527 (Value 0.0092, Policy 1.7435)
Epoch 10/10: Loss 1.7631 (Value 0.0084, Policy 1.7546)
[2025-04-22 16:48:17] Training completed in 18.17s
[2025-04-22 16:48:17] Iteration 281 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22220.954881
Win reward: base=1.00, time_bonus=0.24 (moves: 105/200)


Self-play games:   5%|▍         | 1/22 [01:29<31:16, 89.35s/it]

Win reward: base=1.00, time_bonus=0.27 (moves: 93/200)


Self-play games:  14%|█▎        | 3/22 [01:47<08:24, 26.57s/it]

Win reward: base=1.00, time_bonus=0.21 (moves: 117/200)


Self-play games:  18%|█▊        | 4/22 [01:49<05:03, 16.85s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  41%|████      | 9/22 [02:33<02:00,  9.31s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:47<01:27,  7.97s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:  55%|█████▍    | 12/22 [03:27<02:57, 17.78s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  59%|█████▉    | 13/22 [04:10<03:49, 25.54s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  73%|███████▎  | 16/22 [04:23<01:10, 11.70s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  77%|███████▋  | 17/22 [04:23<00:41,  8.26s/it]

Win reward: base=1.00, time_bonus=0.16 (moves: 137/200)


Self-play games:  82%|████████▏ | 18/22 [04:25<00:25,  6.47s/it]

Win reward: base=1.00, time_bonus=0.14 (moves: 145/200)


Self-play games:  91%|█████████ | 20/22 [04:35<00:11,  5.59s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  95%|█████████▌| 21/22 [04:35<00:03,  3.95s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:47<00:00, 13.06s/it]


[2025-04-22 16:53:05] Self-play completed in 287.29s, generated 5023 examples (17.5 games/s)
[2025-04-22 16:53:05] Processing 4 games for initial placement data
[2025-04-22 16:53:05] Extracted 585 initial placement examples
[2025-04-22 16:53:05] Skipping placement network training for iteration 282 (training every 5 iterations)
[2025-04-22 16:53:05] Training network...
Epoch 1/10: Loss 1.8723 (Value 0.0373, Policy 1.8350)
Epoch 2/10: Loss 1.8428 (Value 0.0265, Policy 1.8163)
Epoch 3/10: Loss 1.8281 (Value 0.0206, Policy 1.8075)
Epoch 4/10: Loss 1.8098 (Value 0.0184, Policy 1.7914)
Epoch 5/10: Loss 1.8002 (Value 0.0139, Policy 1.7863)
Epoch 6/10: Loss 1.7862 (Value 0.0135, Policy 1.7726)
Epoch 7/10: Loss 1.7956 (Value 0.0110, Policy 1.7846)
Epoch 8/10: Loss 1.7674 (Value 0.0109, Policy 1.7564)
Epoch 9/10: Loss 1.7571 (Value 0.0084, Policy 1.7488)
Epoch 10/10: Loss 1.7855 (Value 0.0089, Policy 1.7766)
[2025-04-22 16:53:23] Training completed in 18.18s
[2025-04-22 16:53:23] Iteration 282 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22396.449641
Win reward: base=1.00, time_bonus=0.32 (moves: 73/200)


Self-play games:   5%|▍         | 1/22 [01:07<23:34, 67.37s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:  14%|█▎        | 3/22 [01:59<12:11, 38.49s/it]

Win reward: base=1.00, time_bonus=0.33 (moves: 69/200)


Self-play games:  45%|████▌     | 10/22 [02:35<01:25,  7.08s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  64%|██████▎   | 14/22 [04:03<02:44, 20.52s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  82%|████████▏ | 18/22 [04:25<00:35,  8.85s/it]

Win reward: base=1.00, time_bonus=0.08 (moves: 169/200)


Self-play games: 100%|██████████| 22/22 [04:53<00:00, 13.32s/it]


[2025-04-22 16:58:16] Self-play completed in 293.02s, generated 5040 examples (17.2 games/s)
[2025-04-22 16:58:16] Processing 4 games for initial placement data
[2025-04-22 16:58:16] Extracted 782 initial placement examples
[2025-04-22 16:58:16] Skipping placement network training for iteration 283 (training every 5 iterations)
[2025-04-22 16:58:16] Training network...
Epoch 1/10: Loss 1.8622 (Value 0.0358, Policy 1.8263)
Epoch 2/10: Loss 1.8535 (Value 0.0259, Policy 1.8277)
Epoch 3/10: Loss 1.8164 (Value 0.0205, Policy 1.7959)
Epoch 4/10: Loss 1.8315 (Value 0.0173, Policy 1.8141)
Epoch 5/10: Loss 1.8145 (Value 0.0166, Policy 1.7979)
Epoch 6/10: Loss 1.8164 (Value 0.0154, Policy 1.8010)
Epoch 7/10: Loss 1.7844 (Value 0.0115, Policy 1.7730)
Epoch 8/10: Loss 1.7686 (Value 0.0077, Policy 1.7609)
Epoch 9/10: Loss 1.7818 (Value 0.0072, Policy 1.7746)
Epoch 10/10: Loss 1.7678 (Value 0.0070, Policy 1.7608)
[2025-04-22 16:58:35] Training completed in 18.39s
[2025-04-22 16:58:35] Iteration 283 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22318.444604
Win reward: base=1.00, time_bonus=0.28 (moves: 89/200)


Self-play games:   5%|▍         | 1/22 [01:11<25:00, 71.43s/it]

Win reward: base=1.00, time_bonus=0.22 (moves: 113/200)


Self-play games:   9%|▉         | 2/22 [01:13<10:13, 30.65s/it]

Win reward: base=1.00, time_bonus=0.12 (moves: 153/200)


Self-play games:  18%|█▊        | 4/22 [02:03<07:51, 26.20s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  27%|██▋       | 6/22 [02:11<03:35, 13.46s/it]

Win reward: base=1.00, time_bonus=0.06 (moves: 177/200)


Self-play games:  41%|████      | 9/22 [02:39<02:25, 11.18s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  59%|█████▉    | 13/22 [04:07<03:35, 23.95s/it]

Win reward: base=1.00, time_bonus=0.19 (moves: 125/200)


Self-play games:  77%|███████▋  | 17/22 [04:32<00:49,  9.99s/it]

Win reward: base=1.00, time_bonus=0.15 (moves: 141/200)


Self-play games:  82%|████████▏ | 18/22 [04:34<00:30,  7.52s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:55<00:00, 13.42s/it]


[2025-04-22 17:03:30] Self-play completed in 295.32s, generated 5119 examples (17.3 games/s)
[2025-04-22 17:03:30] Processing 4 games for initial placement data
[2025-04-22 17:03:30] Extracted 466 initial placement examples
[2025-04-22 17:03:30] Skipping placement network training for iteration 284 (training every 5 iterations)
[2025-04-22 17:03:30] Training network...
Epoch 1/10: Loss 1.8594 (Value 0.0384, Policy 1.8210)
Epoch 2/10: Loss 1.8605 (Value 0.0284, Policy 1.8321)
Epoch 3/10: Loss 1.8312 (Value 0.0233, Policy 1.8079)
Epoch 4/10: Loss 1.8072 (Value 0.0166, Policy 1.7906)
Epoch 5/10: Loss 1.8091 (Value 0.0120, Policy 1.7971)
Epoch 6/10: Loss 1.8039 (Value 0.0107, Policy 1.7932)
Epoch 7/10: Loss 1.7760 (Value 0.0097, Policy 1.7663)
Epoch 8/10: Loss 1.7809 (Value 0.0082, Policy 1.7728)
Epoch 9/10: Loss 1.7578 (Value 0.0075, Policy 1.7503)
Epoch 10/10: Loss 1.7681 (Value 0.0080, Policy 1.7600)
[2025-04-22 17:03:48] Training completed in 18.21s
[2025-04-22 17:03:48] Iteration 284 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22377.403046
Win reward: base=1.00, time_bonus=0.34 (moves: 65/200)


Self-play games:   5%|▍         | 1/22 [00:52<18:27, 52.75s/it]

Win reward: base=1.00, time_bonus=0.24 (moves: 105/200)


Self-play games:  14%|█▎        | 3/22 [02:04<12:45, 40.27s/it]

Win reward: base=1.00, time_bonus=0.05 (moves: 181/200)


Self-play games:  18%|█▊        | 4/22 [02:05<07:24, 24.67s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  27%|██▋       | 6/22 [02:24<04:21, 16.33s/it]

Win reward: base=1.00, time_bonus=0.20 (moves: 121/200)


Self-play games:  50%|█████     | 11/22 [02:39<00:56,  5.10s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [02:53<01:16,  7.62s/it]

Win reward: base=1.00, time_bonus=0.23 (moves: 109/200)


Self-play games:  59%|█████▉    | 13/22 [03:30<02:31, 16.79s/it]

Win reward: base=1.00, time_bonus=0.17 (moves: 133/200)


Self-play games:  73%|███████▎  | 16/22 [04:20<01:47, 17.92s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  91%|█████████ | 20/22 [04:34<00:13,  6.68s/it]

Win reward: base=1.00, time_bonus=0.07 (moves: 173/200)


Self-play games: 100%|██████████| 22/22 [04:53<00:00, 13.36s/it]


[2025-04-22 17:08:42] Self-play completed in 293.99s, generated 5060 examples (17.2 games/s)
[2025-04-22 17:08:42] Processing 4 games for initial placement data
[2025-04-22 17:08:43] Extracted 700 initial placement examples
[2025-04-22 17:08:43] Training initial placement network on 700 examples
Epoch 1/10: Loss = 1.4020, Accuracy = 0.0193
Epoch 2/10: Loss = 1.3867, Accuracy = 0.0432
Epoch 3/10: Loss = 1.3684, Accuracy = 0.0789
Epoch 4/10: Loss = 1.3402, Accuracy = 0.0848
Epoch 5/10: Loss = 1.3327, Accuracy = 0.0818
Epoch 6/10: Loss = 1.3100, Accuracy = 0.0833
Epoch 7/10: Loss = 1.2934, Accuracy = 0.0938
Epoch 8/10: Loss = 1.2844, Accuracy = 0.0893
Epoch 9/10: Loss = 1.2552, Accuracy = 0.0908
Epoch 10/10: Loss = 1.2643, Accuracy = 0.0893
[2025-04-22 17:08:43] Placement network training completed in 0.86s
[2025-04-22 17:08:43] Metrics: Loss = 1.3237, Accuracy = 0.0754
[2025-04-22 17:08:43] Saved placement network to models/placement_network.pt
[2025-04-22 17:08:43] Training network...
E

Evaluation games:  10%|█         | 1/10 [00:15<02:22, 15.84s/it]

[2025-04-22 17:09:18] Game 1: duration=15.84s, moves=200, our_VP=3, winner=1 VP=5


Evaluation games:  20%|██        | 2/10 [00:29<01:58, 14.78s/it]

[2025-04-22 17:09:32] Game 2: duration=14.04s, moves=200, our_VP=4, winner=2 VP=6


Evaluation games:  30%|███       | 3/10 [00:45<01:47, 15.38s/it]

[2025-04-22 17:09:48] Game 3: duration=16.10s, moves=200, our_VP=9, winner=0 VP=9


Evaluation games:  40%|████      | 4/10 [01:00<01:29, 14.94s/it]

[2025-04-22 17:10:02] Game 4: duration=14.26s, moves=200, our_VP=4, winner=3 VP=5


Evaluation games:  50%|█████     | 5/10 [01:11<01:08, 13.61s/it]

[2025-04-22 17:10:14] Game 5: duration=11.26s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games:  60%|██████    | 6/10 [01:26<00:55, 13.94s/it]

[2025-04-22 17:10:28] Game 6: duration=14.56s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games:  70%|███████   | 7/10 [01:39<00:40, 13.64s/it]

[2025-04-22 17:10:41] Game 7: duration=13.03s, moves=200, our_VP=2, winner=3 VP=5


Evaluation games:  80%|████████  | 8/10 [01:56<00:29, 14.69s/it]

[2025-04-22 17:10:58] Game 8: duration=16.93s, moves=142, our_VP=11, winner=0 VP=11


Evaluation games:  90%|█████████ | 9/10 [02:07<00:13, 13.62s/it]

[2025-04-22 17:11:09] Game 9: duration=11.26s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games: 100%|██████████| 10/10 [02:21<00:00, 14.12s/it]

[2025-04-22 17:11:23] Game 10: duration=13.93s, moves=200, our_VP=4, winner=0 VP=4
[2025-04-22 17:11:23] Evaluated 10 games in 141.22s (0.07 games/s)
[2025-04-22 17:11:23] Evaluation results: win_rate=0.60, avg_vp=5.50, avg_length=194.20, total_moves=1942
[2025-04-22 17:11:23] Eval resource usage: CPU 58.2%, RAM 11.8%, GPU peak memory 0.00 GB
[2025-04-22 17:11:23] Evaluation completed in 141.23s





[2025-04-22 17:11:25] Plotly metrics visualization saved to plots/training_metrics.html
[2025-04-22 17:11:25] Checkpoint saved: models/model_iter_285.pt
[2025-04-22 17:11:25] Saved placement network to models/placement_network.pt
[2025-04-22 17:11:26] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.7 MB)
[2025-04-22 17:11:26] Iteration 285 done in 457.64s
[2025-04-22 17:11:26] Resource usage: CPU 9.7%, RAM 12.0%, GPU peak memory 0.00 GB
[2025-04-22 17:11:26] Network parameter sum after training: 22579.423485
[2025-04-22 17:11:26] 
=== Iteration 286/300 ===
[2025-04-22 17:11:26] Starting self-play...


Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22579.423046
Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  18%|█▊        | 4/22 [02:16<06:09, 20.52s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  23%|██▎       | 5/22 [02:21<04:14, 14.99s/it]

Win reward: base=1.00, time_bonus=0.01 (moves: 197/200)


Self-play games:  45%|████▌     | 10/22 [02:49<01:27,  7.29s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:58<01:24,  7.71s/it]

Win reward: base=1.00, time_bonus=0.29 (moves: 85/200)


Self-play games:  55%|█████▍    | 12/22 [03:59<03:59, 23.98s/it]

Win reward: base=1.00, time_bonus=0.15 (moves: 141/200)


Self-play games:  95%|█████████▌| 21/22 [04:42<00:02,  2.87s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:44<00:00, 12.94s/it]


[2025-04-22 17:16:11] Self-play completed in 284.60s, generated 5121 examples (18.0 games/s)
[2025-04-22 17:16:11] Processing 4 games for initial placement data
[2025-04-22 17:16:11] Extracted 765 initial placement examples
[2025-04-22 17:16:11] Skipping placement network training for iteration 286 (training every 5 iterations)
[2025-04-22 17:16:11] Training network...
Epoch 1/10: Loss 1.8632 (Value 0.0317, Policy 1.8314)
Epoch 2/10: Loss 1.8755 (Value 0.0248, Policy 1.8507)
Epoch 3/10: Loss 1.8224 (Value 0.0177, Policy 1.8047)
Epoch 4/10: Loss 1.8171 (Value 0.0128, Policy 1.8043)
Epoch 5/10: Loss 1.8070 (Value 0.0118, Policy 1.7952)
Epoch 6/10: Loss 1.7993 (Value 0.0101, Policy 1.7892)
Epoch 7/10: Loss 1.7879 (Value 0.0074, Policy 1.7804)
Epoch 8/10: Loss 1.7790 (Value 0.0074, Policy 1.7716)
Epoch 9/10: Loss 1.7861 (Value 0.0076, Policy 1.7785)
Epoch 10/10: Loss 1.7853 (Value 0.0069, Policy 1.7785)
[2025-04-22 17:16:29] Training completed in 18.50s
[2025-04-22 17:16:29] Iteration 286 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22625.207949


Self-play games:  14%|█▎        | 3/22 [02:07<09:53, 31.23s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  27%|██▋       | 6/22 [02:17<02:46, 10.42s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  36%|███▋      | 8/22 [02:23<01:26,  6.19s/it]

Win reward: base=1.00, time_bonus=0.06 (moves: 177/200)


Self-play games:  45%|████▌     | 10/22 [02:35<01:13,  6.13s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [04:02<04:47, 28.74s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:  77%|███████▋  | 17/22 [04:34<00:42,  8.54s/it]

Win reward: base=1.00, time_bonus=0.11 (moves: 157/200)


Self-play games: 100%|██████████| 22/22 [04:47<00:00, 13.05s/it]


[2025-04-22 17:21:17] Self-play completed in 287.09s, generated 5117 examples (17.8 games/s)
[2025-04-22 17:21:17] Processing 4 games for initial placement data
[2025-04-22 17:21:17] Extracted 675 initial placement examples
[2025-04-22 17:21:17] Skipping placement network training for iteration 287 (training every 5 iterations)
[2025-04-22 17:21:17] Training network...
Epoch 1/10: Loss 1.8548 (Value 0.0366, Policy 1.8182)
Epoch 2/10: Loss 1.8569 (Value 0.0275, Policy 1.8294)
Epoch 3/10: Loss 1.8372 (Value 0.0222, Policy 1.8150)
Epoch 4/10: Loss 1.8144 (Value 0.0186, Policy 1.7957)
Epoch 5/10: Loss 1.7976 (Value 0.0137, Policy 1.7839)
Epoch 6/10: Loss 1.7769 (Value 0.0086, Policy 1.7682)
Epoch 7/10: Loss 1.7836 (Value 0.0084, Policy 1.7752)
Epoch 8/10: Loss 1.7686 (Value 0.0068, Policy 1.7618)
Epoch 9/10: Loss 1.7953 (Value 0.0068, Policy 1.7884)
Epoch 10/10: Loss 1.7603 (Value 0.0058, Policy 1.7545)
[2025-04-22 17:21:35] Training completed in 18.59s
[2025-04-22 17:21:35] Iteration 287 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22598.876624


Self-play games:  14%|█▎        | 3/22 [02:14<09:55, 31.35s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  18%|█▊        | 4/22 [02:26<07:06, 23.71s/it]

Win reward: base=1.00, time_bonus=0.04 (moves: 185/200)


Self-play games:  36%|███▋      | 8/22 [02:34<01:37,  6.96s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [03:02<01:43,  9.40s/it]

Win reward: base=1.00, time_bonus=0.35 (moves: 61/200)


Self-play games:  55%|█████▍    | 12/22 [03:14<01:41, 10.10s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:  64%|██████▎   | 14/22 [04:00<02:08, 16.12s/it]

Win reward: base=1.00, time_bonus=0.15 (moves: 141/200)


Self-play games:  82%|████████▏ | 18/22 [04:34<00:40, 10.21s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  86%|████████▋ | 19/22 [04:34<00:21,  7.29s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [05:03<00:00, 13.80s/it]


[2025-04-22 17:26:39] Self-play completed in 303.72s, generated 5118 examples (16.9 games/s)
[2025-04-22 17:26:39] Processing 4 games for initial placement data
[2025-04-22 17:26:39] Extracted 690 initial placement examples
[2025-04-22 17:26:39] Skipping placement network training for iteration 288 (training every 5 iterations)
[2025-04-22 17:26:39] Training network...
Epoch 1/10: Loss 1.8656 (Value 0.0338, Policy 1.8318)
Epoch 2/10: Loss 1.8434 (Value 0.0264, Policy 1.8170)
Epoch 3/10: Loss 1.8464 (Value 0.0235, Policy 1.8229)
Epoch 4/10: Loss 1.8266 (Value 0.0203, Policy 1.8063)
Epoch 5/10: Loss 1.8199 (Value 0.0170, Policy 1.8029)
Epoch 6/10: Loss 1.8159 (Value 0.0169, Policy 1.7990)
Epoch 7/10: Loss 1.7928 (Value 0.0125, Policy 1.7803)
Epoch 8/10: Loss 1.7641 (Value 0.0109, Policy 1.7533)
Epoch 9/10: Loss 1.7817 (Value 0.0096, Policy 1.7721)
Epoch 10/10: Loss 1.7685 (Value 0.0092, Policy 1.7593)
[2025-04-22 17:26:58] Training completed in 18.55s
[2025-04-22 17:26:58] Iteration 288 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22730.980294
Win reward: base=1.00, time_bonus=0.24 (moves: 105/200)


Self-play games:  18%|█▊        | 4/22 [02:14<07:22, 24.58s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  23%|██▎       | 5/22 [02:22<05:16, 18.59s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  45%|████▌     | 10/22 [02:45<01:21,  6.81s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  59%|█████▉    | 13/22 [04:01<03:01, 20.11s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  64%|██████▎   | 14/22 [04:25<02:49, 21.20s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  86%|████████▋ | 19/22 [04:48<00:19,  6.36s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [05:03<00:00, 13.81s/it]


[2025-04-22 17:32:02] Self-play completed in 303.94s, generated 5357 examples (17.6 games/s)
[2025-04-22 17:32:02] Processing 4 games for initial placement data
[2025-04-22 17:32:02] Extracted 773 initial placement examples
[2025-04-22 17:32:02] Skipping placement network training for iteration 289 (training every 5 iterations)
[2025-04-22 17:32:02] Training network...
Epoch 1/10: Loss 1.8827 (Value 0.0446, Policy 1.8381)
Epoch 2/10: Loss 1.8350 (Value 0.0334, Policy 1.8016)
Epoch 3/10: Loss 1.8565 (Value 0.0258, Policy 1.8307)
Epoch 4/10: Loss 1.8007 (Value 0.0183, Policy 1.7824)
Epoch 5/10: Loss 1.8102 (Value 0.0162, Policy 1.7940)
Epoch 6/10: Loss 1.8161 (Value 0.0123, Policy 1.8038)
Epoch 7/10: Loss 1.7898 (Value 0.0103, Policy 1.7795)
Epoch 8/10: Loss 1.7762 (Value 0.0091, Policy 1.7671)
Epoch 9/10: Loss 1.7836 (Value 0.0079, Policy 1.7757)
Epoch 10/10: Loss 1.7757 (Value 0.0064, Policy 1.7693)
[2025-04-22 17:32:21] Training completed in 18.79s
[2025-04-22 17:32:21] Iteration 289 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22797.813437
Win reward: base=1.00, time_bonus=0.29 (moves: 85/200)


Self-play games:  18%|█▊        | 4/22 [02:02<06:48, 22.68s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  23%|██▎       | 5/22 [02:11<04:58, 17.57s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  32%|███▏      | 7/22 [02:29<03:23, 13.60s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  59%|█████▉    | 13/22 [03:57<02:42, 18.07s/it]

Win reward: base=1.00, time_bonus=0.11 (moves: 157/200)


Self-play games:  86%|████████▋ | 19/22 [04:37<00:19,  6.65s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [05:08<00:00, 14.03s/it]


[2025-04-22 17:37:29] Self-play completed in 308.65s, generated 5226 examples (16.9 games/s)
[2025-04-22 17:37:29] Processing 4 games for initial placement data
[2025-04-22 17:37:30] Extracted 768 initial placement examples
[2025-04-22 17:37:30] Training initial placement network on 1468 examples
Epoch 1/10: Loss = 1.3614, Accuracy = 0.0604
Epoch 2/10: Loss = 1.3203, Accuracy = 0.0764
Epoch 3/10: Loss = 1.2925, Accuracy = 0.0757
Epoch 4/10: Loss = 1.2734, Accuracy = 0.0819
Epoch 5/10: Loss = 1.2617, Accuracy = 0.0882
Epoch 6/10: Loss = 1.2550, Accuracy = 0.0938
Epoch 7/10: Loss = 1.2467, Accuracy = 0.0951
Epoch 8/10: Loss = 1.2456, Accuracy = 0.0944
Epoch 9/10: Loss = 1.2427, Accuracy = 0.0944
Epoch 10/10: Loss = 1.2363, Accuracy = 0.0972
[2025-04-22 17:37:31] Placement network training completed in 1.77s
[2025-04-22 17:37:31] Metrics: Loss = 1.2736, Accuracy = 0.0858
[2025-04-22 17:37:31] Saved placement network to models/placement_network.pt
[2025-04-22 17:37:31] Training network...


Evaluation games:  10%|█         | 1/10 [00:16<02:28, 16.46s/it]

[2025-04-22 17:38:07] Game 1: duration=16.46s, moves=200, our_VP=8, winner=0 VP=8


Evaluation games:  20%|██        | 2/10 [00:26<01:40, 12.62s/it]

[2025-04-22 17:38:17] Game 2: duration=9.93s, moves=200, our_VP=2, winner=1 VP=4


Evaluation games:  30%|███       | 3/10 [00:43<01:44, 14.88s/it]

[2025-04-22 17:38:34] Game 3: duration=17.56s, moves=200, our_VP=7, winner=0 VP=7


Evaluation games:  40%|████      | 4/10 [00:59<01:30, 15.02s/it]

[2025-04-22 17:38:50] Game 4: duration=15.25s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games:  50%|█████     | 5/10 [01:16<01:19, 15.90s/it]

[2025-04-22 17:39:07] Game 5: duration=17.46s, moves=178, our_VP=11, winner=0 VP=11


Evaluation games:  60%|██████    | 6/10 [01:32<01:03, 15.85s/it]

[2025-04-22 17:39:23] Game 6: duration=15.76s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games:  70%|███████   | 7/10 [01:44<00:44, 14.76s/it]

[2025-04-22 17:39:35] Game 7: duration=12.51s, moves=200, our_VP=5, winner=0 VP=5


Evaluation games:  80%|████████  | 8/10 [02:05<00:33, 16.60s/it]

[2025-04-22 17:39:56] Game 8: duration=20.55s, moves=193, our_VP=10, winner=0 VP=10


Evaluation games:  90%|█████████ | 9/10 [02:20<00:16, 16.05s/it]

[2025-04-22 17:40:11] Game 9: duration=14.82s, moves=200, our_VP=3, winner=1 VP=5


Evaluation games: 100%|██████████| 10/10 [02:36<00:00, 15.69s/it]

[2025-04-22 17:40:27] Game 10: duration=16.61s, moves=200, our_VP=4, winner=3 VP=8
[2025-04-22 17:40:27] Evaluated 10 games in 156.91s (0.06 games/s)
[2025-04-22 17:40:27] Evaluation results: win_rate=0.70, avg_vp=6.20, avg_length=197.10, total_moves=1971
[2025-04-22 17:40:27] Eval resource usage: CPU 58.8%, RAM 12.2%, GPU peak memory 0.00 GB
[2025-04-22 17:40:27] Evaluation completed in 156.91s





[2025-04-22 17:40:28] Plotly metrics visualization saved to plots/training_metrics.html
[2025-04-22 17:40:28] Checkpoint saved: models/model_iter_290.pt
[2025-04-22 17:40:28] Saved placement network to models/placement_network.pt
[2025-04-22 17:40:29] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.7 MB)
[2025-04-22 17:40:29] Iteration 290 done in 488.62s
[2025-04-22 17:40:29] Resource usage: CPU 9.4%, RAM 12.2%, GPU peak memory 0.00 GB
[2025-04-22 17:40:29] Network parameter sum after training: 22895.792985
[2025-04-22 17:40:29] 
=== Iteration 291/300 ===
[2025-04-22 17:40:29] Starting self-play...


Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22895.794266
Win reward: base=1.00, time_bonus=0.34 (moves: 65/200)


Self-play games:   9%|▉         | 2/22 [02:15<22:59, 68.98s/it]

Win reward: base=1.00, time_bonus=0.07 (moves: 173/200)


Self-play games:  14%|█▎        | 3/22 [02:15<11:53, 37.55s/it]

Win reward: base=1.00, time_bonus=0.05 (moves: 181/200)


Self-play games:  23%|██▎       | 5/22 [02:20<04:34, 16.12s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  41%|████      | 9/22 [02:27<01:02,  4.80s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:39<00:58,  5.30s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  68%|██████▊   | 15/22 [04:30<02:10, 18.58s/it]

Win reward: base=1.00, time_bonus=0.04 (moves: 185/200)


Self-play games: 100%|██████████| 22/22 [04:49<00:00, 13.16s/it]


[2025-04-22 17:45:19] Self-play completed in 289.43s, generated 5102 examples (17.6 games/s)
[2025-04-22 17:45:19] Processing 4 games for initial placement data
[2025-04-22 17:45:19] Extracted 830 initial placement examples
[2025-04-22 17:45:19] Skipping placement network training for iteration 291 (training every 5 iterations)
[2025-04-22 17:45:19] Training network...
Epoch 1/10: Loss 1.8867 (Value 0.0401, Policy 1.8465)
Epoch 2/10: Loss 1.8743 (Value 0.0261, Policy 1.8482)
Epoch 3/10: Loss 1.8588 (Value 0.0203, Policy 1.8384)
Epoch 4/10: Loss 1.8112 (Value 0.0133, Policy 1.7979)
Epoch 5/10: Loss 1.8073 (Value 0.0109, Policy 1.7964)
Epoch 6/10: Loss 1.8009 (Value 0.0112, Policy 1.7897)
Epoch 7/10: Loss 1.8004 (Value 0.0085, Policy 1.7919)
Epoch 8/10: Loss 1.7769 (Value 0.0087, Policy 1.7682)
Epoch 9/10: Loss 1.7638 (Value 0.0063, Policy 1.7575)
Epoch 10/10: Loss 1.7655 (Value 0.0069, Policy 1.7586)
[2025-04-22 17:45:38] Training completed in 19.06s
[2025-04-22 17:45:38] Iteration 291 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22996.534096


Self-play games:   5%|▍         | 1/22 [01:12<25:21, 72.45s/it]

Win reward: base=1.00, time_bonus=0.21 (moves: 117/200)


Self-play games:  14%|█▎        | 3/22 [02:18<13:47, 43.55s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  23%|██▎       | 5/22 [02:21<05:05, 17.96s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  32%|███▏      | 7/22 [02:26<02:20,  9.34s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [03:34<03:11, 19.18s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:  59%|█████▉    | 13/22 [03:40<02:16, 15.15s/it]

Win reward: base=1.00, time_bonus=0.24 (moves: 105/200)


Self-play games:  68%|██████▊   | 15/22 [03:56<01:18, 11.20s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  73%|███████▎  | 16/22 [04:10<01:11, 11.88s/it]

Win reward: base=1.00, time_bonus=0.07 (moves: 173/200)


Self-play games: 100%|██████████| 22/22 [04:44<00:00, 12.92s/it]


[2025-04-22 17:50:22] Self-play completed in 284.17s, generated 4917 examples (17.3 games/s)
[2025-04-22 17:50:22] Processing 4 games for initial placement data
[2025-04-22 17:50:23] Extracted 693 initial placement examples
[2025-04-22 17:50:23] Skipping placement network training for iteration 292 (training every 5 iterations)
[2025-04-22 17:50:23] Training network...
Epoch 1/10: Loss 1.8802 (Value 0.0306, Policy 1.8496)
Epoch 2/10: Loss 1.8602 (Value 0.0194, Policy 1.8409)
Epoch 3/10: Loss 1.8308 (Value 0.0181, Policy 1.8127)
Epoch 4/10: Loss 1.8151 (Value 0.0158, Policy 1.7993)
Epoch 5/10: Loss 1.8170 (Value 0.0156, Policy 1.8014)
Epoch 6/10: Loss 1.8104 (Value 0.0112, Policy 1.7992)
Epoch 7/10: Loss 1.7830 (Value 0.0082, Policy 1.7748)
Epoch 8/10: Loss 1.7804 (Value 0.0078, Policy 1.7726)
Epoch 9/10: Loss 1.7726 (Value 0.0085, Policy 1.7641)
Epoch 10/10: Loss 1.7667 (Value 0.0076, Policy 1.7591)
[2025-04-22 17:50:42] Training completed in 18.94s
[2025-04-22 17:50:42] Iteration 292 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22957.969260
Win reward: base=1.00, time_bonus=0.32 (moves: 73/200)


Self-play games:   5%|▍         | 1/22 [01:06<23:13, 66.35s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:   9%|▉         | 2/22 [01:12<10:13, 30.66s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  18%|█▊        | 4/22 [02:07<07:58, 26.58s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  23%|██▎       | 5/22 [02:09<04:59, 17.63s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  36%|███▋      | 8/22 [02:34<02:49, 12.14s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:53<01:34,  8.55s/it]

Win reward: base=1.00, time_bonus=0.31 (moves: 77/200)


Self-play games:  59%|█████▉    | 13/22 [03:10<01:20,  8.95s/it]

Win reward: base=1.00, time_bonus=0.38 (moves: 49/200)


Self-play games:  68%|██████▊   | 15/22 [03:52<01:53, 16.28s/it]

Win reward: base=1.00, time_bonus=0.05 (moves: 181/200)


Self-play games:  82%|████████▏ | 18/22 [04:18<00:43, 10.82s/it]

Win reward: base=1.00, time_bonus=0.07 (moves: 173/200)


Self-play games: 100%|██████████| 22/22 [04:39<00:00, 12.69s/it]


[2025-04-22 17:55:21] Self-play completed in 279.23s, generated 4848 examples (17.4 games/s)
[2025-04-22 17:55:21] Processing 4 games for initial placement data
[2025-04-22 17:55:21] Extracted 597 initial placement examples
[2025-04-22 17:55:21] Skipping placement network training for iteration 293 (training every 5 iterations)
[2025-04-22 17:55:21] Training network...
Epoch 1/10: Loss 1.9058 (Value 0.0332, Policy 1.8726)
Epoch 2/10: Loss 1.8565 (Value 0.0279, Policy 1.8286)
Epoch 3/10: Loss 1.8090 (Value 0.0174, Policy 1.7916)
Epoch 4/10: Loss 1.8134 (Value 0.0128, Policy 1.8005)
Epoch 5/10: Loss 1.7927 (Value 0.0115, Policy 1.7812)
Epoch 6/10: Loss 1.7907 (Value 0.0086, Policy 1.7822)
Epoch 7/10: Loss 1.7883 (Value 0.0072, Policy 1.7811)
Epoch 8/10: Loss 1.7660 (Value 0.0076, Policy 1.7584)
Epoch 9/10: Loss 1.7625 (Value 0.0061, Policy 1.7563)
Epoch 10/10: Loss 1.7677 (Value 0.0058, Policy 1.7619)
[2025-04-22 17:55:40] Training completed in 19.16s
[2025-04-22 17:55:40] Iteration 293 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 22993.238661


Self-play games:  23%|██▎       | 5/22 [02:16<04:24, 15.57s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  32%|███▏      | 7/22 [02:31<02:58, 11.89s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:42<00:51,  4.65s/it]

Win reward: base=1.00, time_bonus=0.24 (moves: 105/200)


Self-play games:  59%|█████▉    | 13/22 [03:37<02:16, 15.14s/it]

Win reward: base=1.00, time_bonus=0.26 (moves: 97/200)


Self-play games:  95%|█████████▌| 21/22 [04:33<00:04,  4.59s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [05:04<00:00, 13.83s/it]


[2025-04-22 18:00:44] Self-play completed in 304.28s, generated 5026 examples (16.5 games/s)
[2025-04-22 18:00:44] Processing 4 games for initial placement data
[2025-04-22 18:00:45] Extracted 776 initial placement examples
[2025-04-22 18:00:45] Skipping placement network training for iteration 294 (training every 5 iterations)
[2025-04-22 18:00:45] Training network...
Epoch 1/10: Loss 1.8553 (Value 0.0388, Policy 1.8164)
Epoch 2/10: Loss 1.8704 (Value 0.0259, Policy 1.8445)
Epoch 3/10: Loss 1.8200 (Value 0.0213, Policy 1.7988)
Epoch 4/10: Loss 1.8178 (Value 0.0178, Policy 1.8000)
Epoch 5/10: Loss 1.7943 (Value 0.0133, Policy 1.7810)
Epoch 6/10: Loss 1.8126 (Value 0.0113, Policy 1.8013)
Epoch 7/10: Loss 1.7894 (Value 0.0110, Policy 1.7785)
Epoch 8/10: Loss 1.7655 (Value 0.0101, Policy 1.7554)
Epoch 9/10: Loss 1.7632 (Value 0.0094, Policy 1.7538)
Epoch 10/10: Loss 1.7707 (Value 0.0098, Policy 1.7609)
[2025-04-22 18:01:04] Training completed in 18.91s
[2025-04-22 18:01:04] Iteration 294 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23095.258406
Win reward: base=1.00, time_bonus=0.33 (moves: 69/200)


Self-play games:   5%|▍         | 1/22 [00:51<18:08, 51.85s/it]

Win reward: base=1.00, time_bonus=0.14 (moves: 145/200)


Self-play games:   9%|▉         | 2/22 [02:08<22:03, 66.18s/it]

Win reward: base=1.00, time_bonus=0.08 (moves: 169/200)


Self-play games:  18%|█▊        | 4/22 [02:11<06:52, 22.91s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  32%|███▏      | 7/22 [02:22<02:16,  9.11s/it]

Win reward: base=1.00, time_bonus=0.15 (moves: 141/200)


Self-play games:  41%|████      | 9/22 [02:37<01:50,  8.54s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [02:54<01:07,  6.77s/it]

Win reward: base=1.00, time_bonus=0.22 (moves: 113/200)


Self-play games:  68%|██████▊   | 15/22 [04:18<02:05, 17.93s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  82%|████████▏ | 18/22 [04:28<00:31,  7.85s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:54<00:00, 13.38s/it]


[2025-04-22 18:05:58] Self-play completed in 294.44s, generated 5079 examples (17.2 games/s)
[2025-04-22 18:05:58] Processing 4 games for initial placement data
[2025-04-22 18:05:58] Extracted 718 initial placement examples
[2025-04-22 18:05:58] Training initial placement network on 2186 examples
Epoch 1/10: Loss = 1.2671, Accuracy = 0.0749
Epoch 2/10: Loss = 1.2416, Accuracy = 0.0882
Epoch 3/10: Loss = 1.2316, Accuracy = 0.0896
Epoch 4/10: Loss = 1.2250, Accuracy = 0.0864
Epoch 5/10: Loss = 1.2178, Accuracy = 0.0960
Epoch 6/10: Loss = 1.2127, Accuracy = 0.0933
Epoch 7/10: Loss = 1.2061, Accuracy = 0.0933
Epoch 8/10: Loss = 1.2066, Accuracy = 0.0910
Epoch 9/10: Loss = 1.2025, Accuracy = 0.0970
Epoch 10/10: Loss = 1.2018, Accuracy = 0.0983
[2025-04-22 18:06:01] Placement network training completed in 2.77s
[2025-04-22 18:06:01] Metrics: Loss = 1.2213, Accuracy = 0.0908
[2025-04-22 18:06:01] Saved placement network to models/placement_network.pt
[2025-04-22 18:06:01] Training network...


Evaluation games:  10%|█         | 1/10 [00:18<02:44, 18.27s/it]

[2025-04-22 18:06:38] Game 1: duration=18.27s, moves=200, our_VP=7, winner=0 VP=7


Evaluation games:  20%|██        | 2/10 [00:33<02:13, 16.63s/it]

[2025-04-22 18:06:54] Game 2: duration=15.49s, moves=200, our_VP=5, winner=0 VP=5


Evaluation games:  30%|███       | 3/10 [00:44<01:36, 13.79s/it]

[2025-04-22 18:07:04] Game 3: duration=10.39s, moves=200, our_VP=2, winner=1 VP=6


Evaluation games:  40%|████      | 4/10 [00:57<01:21, 13.65s/it]

[2025-04-22 18:07:18] Game 4: duration=13.45s, moves=200, our_VP=5, winner=0 VP=5


Evaluation games:  50%|█████     | 5/10 [01:14<01:14, 14.81s/it]

[2025-04-22 18:07:34] Game 5: duration=16.87s, moves=200, our_VP=9, winner=0 VP=9


Evaluation games:  60%|██████    | 6/10 [01:27<00:56, 14.09s/it]

[2025-04-22 18:07:47] Game 6: duration=12.69s, moves=143, our_VP=11, winner=0 VP=11


Evaluation games:  70%|███████   | 7/10 [01:39<00:40, 13.67s/it]

[2025-04-22 18:08:00] Game 7: duration=12.79s, moves=200, our_VP=4, winner=0 VP=4


Evaluation games:  80%|████████  | 8/10 [01:52<00:26, 13.39s/it]

[2025-04-22 18:08:13] Game 8: duration=12.81s, moves=200, our_VP=4, winner=0 VP=4


Evaluation games:  90%|█████████ | 9/10 [02:08<00:14, 14.01s/it]

[2025-04-22 18:08:28] Game 9: duration=15.37s, moves=200, our_VP=7, winner=0 VP=7


Evaluation games: 100%|██████████| 10/10 [02:20<00:00, 14.01s/it]

[2025-04-22 18:08:40] Game 10: duration=11.94s, moves=200, our_VP=2, winner=1 VP=6
[2025-04-22 18:08:40] Evaluated 10 games in 140.09s (0.07 games/s)
[2025-04-22 18:08:40] Evaluation results: win_rate=0.80, avg_vp=5.60, avg_length=194.30, total_moves=1943
[2025-04-22 18:08:40] Eval resource usage: CPU 60.5%, RAM 12.3%, GPU peak memory 0.00 GB
[2025-04-22 18:08:40] Evaluation completed in 140.09s





[2025-04-22 18:08:40] Plotly metrics visualization saved to plots/training_metrics.html
[2025-04-22 18:08:40] New best model at iteration 295 (win_rate=0.80)
[2025-04-22 18:08:41] Checkpoint saved: models/model_iter_295.pt
[2025-04-22 18:08:41] Best model saved: models/best_model.pt
[2025-04-22 18:08:41] Saved placement network to models/placement_network.pt
[2025-04-22 18:08:41] Copied best placement network to models/placement_network_iter_295.pt
[2025-04-22 18:08:42] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.7 MB)
[2025-04-22 18:08:42] Iteration 295 done in 458.52s
[2025-04-22 18:08:42] Resource usage: CPU 9.2%, RAM 12.3%, GPU peak memory 0.00 GB
[2025-04-22 18:08:42] Network parameter sum after training: 23162.527837
[2025-04-22 18:08:42] 
=== Iteration 296/300 ===
[2025-04-22 18:08:42] Starting self-play...


Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23162.527001


Self-play games:   5%|▍         | 1/22 [02:01<42:36, 121.72s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  18%|█▊        | 4/22 [02:17<06:33, 21.85s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  27%|██▋       | 6/22 [02:26<03:12, 12.00s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  36%|███▋      | 8/22 [02:35<01:55,  8.24s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  41%|████      | 9/22 [02:36<01:16,  5.89s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  45%|████▌     | 10/22 [02:43<01:15,  6.32s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  59%|█████▉    | 13/22 [04:12<03:29, 23.31s/it]

Win reward: base=1.00, time_bonus=0.10 (moves: 161/200)


Self-play games:  77%|███████▋  | 17/22 [04:32<00:49,  9.94s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:48<00:00, 13.13s/it]


[2025-04-22 18:13:31] Self-play completed in 288.97s, generated 5128 examples (17.7 games/s)
[2025-04-22 18:13:31] Processing 4 games for initial placement data
[2025-04-22 18:13:31] Extracted 765 initial placement examples
[2025-04-22 18:13:31] Skipping placement network training for iteration 296 (training every 5 iterations)
[2025-04-22 18:13:31] Training network...
Epoch 1/10: Loss 1.8986 (Value 0.0409, Policy 1.8577)
Epoch 2/10: Loss 1.8545 (Value 0.0363, Policy 1.8182)
Epoch 3/10: Loss 1.8440 (Value 0.0283, Policy 1.8157)
Epoch 4/10: Loss 1.8187 (Value 0.0205, Policy 1.7981)
Epoch 5/10: Loss 1.8434 (Value 0.0168, Policy 1.8266)
Epoch 6/10: Loss 1.8069 (Value 0.0139, Policy 1.7930)
Epoch 7/10: Loss 1.7935 (Value 0.0127, Policy 1.7808)
Epoch 8/10: Loss 1.7891 (Value 0.0126, Policy 1.7764)
Epoch 9/10: Loss 1.7782 (Value 0.0107, Policy 1.7675)
Epoch 10/10: Loss 1.7686 (Value 0.0097, Policy 1.7589)
[2025-04-22 18:13:50] Training completed in 18.59s
[2025-04-22 18:13:50] Iteration 296 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23193.364165


Self-play games:   5%|▍         | 1/22 [02:06<44:15, 126.46s/it]

Win reward: base=1.00, time_bonus=0.11 (moves: 157/200)


Self-play games:   9%|▉         | 2/22 [02:19<19:49, 59.48s/it] 

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  14%|█▎        | 3/22 [02:21<10:33, 33.32s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  36%|███▋      | 8/22 [02:35<01:21,  5.81s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  45%|████▌     | 10/22 [02:41<00:48,  4.03s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:46<00:50,  4.55s/it]

Win reward: base=1.00, time_bonus=0.11 (moves: 157/200)


Self-play games:  59%|█████▉    | 13/22 [04:07<02:58, 19.84s/it]

Win reward: base=1.00, time_bonus=0.12 (moves: 153/200)


Self-play games:  77%|███████▋  | 17/22 [04:46<00:54, 10.94s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  82%|████████▏ | 18/22 [04:49<00:35,  8.85s/it]

Win reward: base=1.00, time_bonus=0.05 (moves: 181/200)


Self-play games: 100%|██████████| 22/22 [05:15<00:00, 14.36s/it]


[2025-04-22 18:19:06] Self-play completed in 315.85s, generated 5460 examples (17.3 games/s)
[2025-04-22 18:19:06] Processing 4 games for initial placement data
[2025-04-22 18:19:06] Extracted 724 initial placement examples
[2025-04-22 18:19:06] Skipping placement network training for iteration 297 (training every 5 iterations)
[2025-04-22 18:19:06] Training network...
Epoch 1/10: Loss 1.8900 (Value 0.0365, Policy 1.8536)
Epoch 2/10: Loss 1.8638 (Value 0.0251, Policy 1.8387)
Epoch 3/10: Loss 1.8529 (Value 0.0211, Policy 1.8317)
Epoch 4/10: Loss 1.8357 (Value 0.0163, Policy 1.8195)
Epoch 5/10: Loss 1.8269 (Value 0.0125, Policy 1.8144)
Epoch 6/10: Loss 1.7899 (Value 0.0108, Policy 1.7791)
Epoch 7/10: Loss 1.7764 (Value 0.0112, Policy 1.7652)
Epoch 8/10: Loss 1.7694 (Value 0.0093, Policy 1.7602)
Epoch 9/10: Loss 1.7780 (Value 0.0078, Policy 1.7702)
Epoch 10/10: Loss 1.7610 (Value 0.0090, Policy 1.7520)
[2025-04-22 18:19:25] Training completed in 18.61s
[2025-04-22 18:19:25] Iteration 297 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23356.677812
Win reward: base=1.00, time_bonus=0.21 (moves: 117/200)


Self-play games:  27%|██▋       | 6/22 [02:26<03:07, 11.69s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:44<00:48,  4.45s/it]

Win reward: base=1.00, time_bonus=0.27 (moves: 93/200)


Self-play games:  59%|█████▉    | 13/22 [03:42<02:01, 13.54s/it]

Win reward: base=1.00, time_bonus=0.08 (moves: 169/200)


Self-play games:  64%|██████▎   | 14/22 [04:17<02:35, 19.47s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  77%|███████▋  | 17/22 [04:37<00:55, 11.04s/it]

Win reward: base=1.00, time_bonus=0.12 (moves: 153/200)


Self-play games: 100%|██████████| 22/22 [04:51<00:00, 13.23s/it]


[2025-04-22 18:24:16] Self-play completed in 291.12s, generated 5223 examples (17.9 games/s)
[2025-04-22 18:24:16] Processing 4 games for initial placement data
[2025-04-22 18:24:16] Extracted 523 initial placement examples
[2025-04-22 18:24:16] Skipping placement network training for iteration 298 (training every 5 iterations)
[2025-04-22 18:24:16] Training network...
Epoch 1/10: Loss 1.8922 (Value 0.0395, Policy 1.8526)
Epoch 2/10: Loss 1.8737 (Value 0.0304, Policy 1.8433)
Epoch 3/10: Loss 1.8740 (Value 0.0206, Policy 1.8534)
Epoch 4/10: Loss 1.8272 (Value 0.0188, Policy 1.8084)
Epoch 5/10: Loss 1.8124 (Value 0.0178, Policy 1.7946)
Epoch 6/10: Loss 1.7867 (Value 0.0168, Policy 1.7699)
Epoch 7/10: Loss 1.8037 (Value 0.0140, Policy 1.7896)
Epoch 8/10: Loss 1.7853 (Value 0.0110, Policy 1.7743)
Epoch 9/10: Loss 1.7788 (Value 0.0112, Policy 1.7676)
Epoch 10/10: Loss 1.7701 (Value 0.0112, Policy 1.7589)
[2025-04-22 18:24:35] Training completed in 19.22s
[2025-04-22 18:24:35] Iteration 298 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23393.091018
Win reward: base=1.00, time_bonus=0.25 (moves: 101/200)


Self-play games:   5%|▍         | 1/22 [01:19<27:41, 79.12s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:   9%|▉         | 2/22 [02:00<18:56, 56.84s/it]

Win reward: base=1.00, time_bonus=0.16 (moves: 137/200)


Self-play games:  14%|█▎        | 3/22 [02:11<11:24, 36.05s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [03:12<02:17, 12.50s/it]

Win reward: base=1.00, time_bonus=0.16 (moves: 137/200)


Self-play games:  68%|██████▊   | 15/22 [04:41<02:11, 18.80s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [05:08<00:00, 14.01s/it]


[2025-04-22 18:29:44] Self-play completed in 308.32s, generated 5436 examples (17.6 games/s)
[2025-04-22 18:29:44] Processing 4 games for initial placement data
[2025-04-22 18:29:44] Extracted 653 initial placement examples
[2025-04-22 18:29:44] Skipping placement network training for iteration 299 (training every 5 iterations)
[2025-04-22 18:29:44] Training network...
Epoch 1/10: Loss 1.8782 (Value 0.0362, Policy 1.8420)
Epoch 2/10: Loss 1.8528 (Value 0.0256, Policy 1.8272)
Epoch 3/10: Loss 1.8591 (Value 0.0214, Policy 1.8377)
Epoch 4/10: Loss 1.8116 (Value 0.0173, Policy 1.7944)
Epoch 5/10: Loss 1.8268 (Value 0.0154, Policy 1.8114)
Epoch 6/10: Loss 1.8200 (Value 0.0140, Policy 1.8060)
Epoch 7/10: Loss 1.7791 (Value 0.0130, Policy 1.7661)
Epoch 8/10: Loss 1.7768 (Value 0.0127, Policy 1.7641)
Epoch 9/10: Loss 1.7812 (Value 0.0118, Policy 1.7694)
Epoch 10/10: Loss 1.7552 (Value 0.0088, Policy 1.7464)
[2025-04-22 18:30:02] Training completed in 18.59s
[2025-04-22 18:30:02] Iteration 299 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23397.159261


Self-play games:   5%|▍         | 1/22 [01:42<35:45, 102.17s/it]

Win reward: base=1.00, time_bonus=0.09 (moves: 165/200)


Self-play games:  14%|█▎        | 3/22 [02:09<11:02, 34.85s/it]

Win reward: base=1.00, time_bonus=0.06 (moves: 177/200)


Self-play games:  23%|██▎       | 5/22 [02:13<04:12, 14.83s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:59<01:36,  8.74s/it]

Win reward: base=1.00, time_bonus=0.19 (moves: 125/200)


Self-play games:  59%|█████▉    | 13/22 [04:06<02:52, 19.19s/it]

Win reward: base=1.00, time_bonus=0.01 (moves: 197/200)


Self-play games:  64%|██████▎   | 14/22 [04:17<02:12, 16.59s/it]

Win reward: base=1.00, time_bonus=0.13 (moves: 149/200)


Self-play games:  68%|██████▊   | 15/22 [04:24<01:35, 13.59s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  73%|███████▎  | 16/22 [04:28<01:03, 10.66s/it]

Win reward: base=1.00, time_bonus=0.20 (moves: 121/200)


Self-play games:  86%|████████▋ | 19/22 [04:42<00:22,  7.47s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:48<00:00, 13.10s/it]


[2025-04-22 18:34:51] Self-play completed in 288.32s, generated 5162 examples (17.9 games/s)
[2025-04-22 18:34:51] Processing 4 games for initial placement data
[2025-04-22 18:34:51] Extracted 697 initial placement examples
[2025-04-22 18:34:51] Training initial placement network on 2883 examples
Epoch 1/10: Loss = 1.3036, Accuracy = 0.0806
Epoch 2/10: Loss = 1.2893, Accuracy = 0.0833
Epoch 3/10: Loss = 1.2783, Accuracy = 0.0840
Epoch 4/10: Loss = 1.2729, Accuracy = 0.0875
Epoch 5/10: Loss = 1.2677, Accuracy = 0.0875
Epoch 6/10: Loss = 1.2615, Accuracy = 0.0858
Epoch 7/10: Loss = 1.2606, Accuracy = 0.0847
Epoch 8/10: Loss = 1.2589, Accuracy = 0.0840
Epoch 9/10: Loss = 1.2561, Accuracy = 0.0875
Epoch 10/10: Loss = 1.2547, Accuracy = 0.0892
[2025-04-22 18:34:55] Placement network training completed in 3.99s
[2025-04-22 18:34:55] Metrics: Loss = 1.2704, Accuracy = 0.0854
[2025-04-22 18:34:55] Saved placement network to models/placement_network.pt
[2025-04-22 18:34:55] Training network...


Evaluation games:  10%|█         | 1/10 [00:15<02:22, 15.80s/it]

[2025-04-22 18:35:30] Game 1: duration=15.80s, moves=200, our_VP=5, winner=0 VP=5


Evaluation games:  20%|██        | 2/10 [00:30<02:01, 15.22s/it]

[2025-04-22 18:35:44] Game 2: duration=14.81s, moves=200, our_VP=8, winner=0 VP=8


Evaluation games:  30%|███       | 3/10 [00:45<01:46, 15.28s/it]

[2025-04-22 18:36:00] Game 3: duration=15.34s, moves=200, our_VP=8, winner=0 VP=8


Evaluation games:  40%|████      | 4/10 [00:58<01:24, 14.03s/it]

[2025-04-22 18:36:12] Game 4: duration=12.12s, moves=200, our_VP=6, winner=2 VP=7


Evaluation games:  50%|█████     | 5/10 [01:14<01:13, 14.73s/it]

[2025-04-22 18:36:28] Game 5: duration=15.96s, moves=200, our_VP=2, winner=1 VP=6


Evaluation games:  60%|██████    | 6/10 [01:26<00:56, 14.03s/it]

[2025-04-22 18:36:40] Game 6: duration=12.67s, moves=200, our_VP=4, winner=0 VP=4


Evaluation games:  70%|███████   | 7/10 [01:39<00:40, 13.56s/it]

[2025-04-22 18:36:53] Game 7: duration=12.61s, moves=200, our_VP=5, winner=0 VP=5


Evaluation games:  80%|████████  | 8/10 [01:51<00:26, 13.19s/it]

[2025-04-22 18:37:05] Game 8: duration=12.39s, moves=200, our_VP=2, winner=1 VP=8


Evaluation games:  90%|█████████ | 9/10 [02:07<00:13, 13.88s/it]

[2025-04-22 18:37:21] Game 9: duration=15.39s, moves=200, our_VP=7, winner=0 VP=7


Evaluation games: 100%|██████████| 10/10 [02:21<00:00, 14.11s/it]

[2025-04-22 18:37:35] Game 10: duration=13.98s, moves=200, our_VP=5, winner=2 VP=6
[2025-04-22 18:37:35] Evaluated 10 games in 141.09s (0.07 games/s)
[2025-04-22 18:37:35] Evaluation results: win_rate=0.60, avg_vp=5.20, avg_length=200.00, total_moves=2000
[2025-04-22 18:37:35] Eval resource usage: CPU 63.4%, RAM 12.3%, GPU peak memory 0.00 GB
[2025-04-22 18:37:35] Evaluation completed in 141.09s





[2025-04-22 18:37:35] Plotly metrics visualization saved to plots/training_metrics.html
[2025-04-22 18:37:35] Checkpoint saved: models/model_iter_300.pt
[2025-04-22 18:37:35] Saved placement network to models/placement_network.pt
[2025-04-22 18:37:37] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.6 MB)
[2025-04-22 18:37:37] Iteration 300 done in 454.45s
[2025-04-22 18:37:37] Resource usage: CPU 9.3%, RAM 12.4%, GPU peak memory 0.00 GB
[2025-04-22 18:37:37] Network parameter sum after training: 23373.752742
[2025-04-22 18:37:37] 
=== Iteration 301/300 ===
[2025-04-22 18:37:37] Starting self-play...


Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23373.750669


Self-play games:   9%|▉         | 2/22 [02:19<20:02, 60.14s/it] 

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  14%|█▎        | 3/22 [02:22<10:47, 34.08s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  23%|██▎       | 5/22 [02:28<04:18, 15.23s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  27%|██▋       | 6/22 [02:34<03:09, 11.85s/it]

Win reward: base=1.00, time_bonus=0.02 (moves: 193/200)


Self-play games:  36%|███▋      | 8/22 [02:38<01:31,  6.56s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  41%|████      | 9/22 [02:41<01:09,  5.35s/it]

Win reward: base=1.00, time_bonus=0.04 (moves: 185/200)


Self-play games:  45%|████▌     | 10/22 [02:44<00:54,  4.55s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:44<00:35,  3.19s/it]

Win reward: base=1.00, time_bonus=0.30 (moves: 81/200)


Self-play games:  59%|█████▉    | 13/22 [03:51<02:24, 16.09s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  73%|███████▎  | 16/22 [04:33<01:15, 12.53s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  86%|████████▋ | 19/22 [04:46<00:21,  7.14s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  95%|█████████▌| 21/22 [04:50<00:04,  4.51s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:54<00:00, 13.40s/it]


[2025-04-22 18:42:32] Self-play completed in 294.81s, generated 5220 examples (17.7 games/s)
[2025-04-22 18:42:32] Processing 4 games for initial placement data
[2025-04-22 18:42:32] Extracted 620 initial placement examples
[2025-04-22 18:42:32] Skipping placement network training for iteration 301 (training every 5 iterations)
[2025-04-22 18:42:32] Training network...
Epoch 1/10: Loss 1.8669 (Value 0.0344, Policy 1.8324)
Epoch 2/10: Loss 1.8446 (Value 0.0235, Policy 1.8211)
Epoch 3/10: Loss 1.8287 (Value 0.0187, Policy 1.8099)
Epoch 4/10: Loss 1.8099 (Value 0.0139, Policy 1.7961)
Epoch 5/10: Loss 1.7970 (Value 0.0103, Policy 1.7867)
Epoch 6/10: Loss 1.8006 (Value 0.0094, Policy 1.7912)
Epoch 7/10: Loss 1.7866 (Value 0.0099, Policy 1.7768)
Epoch 8/10: Loss 1.7815 (Value 0.0078, Policy 1.7737)
Epoch 9/10: Loss 1.7587 (Value 0.0076, Policy 1.7511)
Epoch 10/10: Loss 1.7640 (Value 0.0068, Policy 1.7572)
[2025-04-22 18:42:50] Training completed in 18.44s
[2025-04-22 18:42:50] Iteration 301 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23482.166683
Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [03:01<01:21,  7.43s/it]

Win reward: base=1.00, time_bonus=0.24 (moves: 105/200)


Self-play games:  55%|█████▍    | 12/22 [03:51<03:21, 20.17s/it]

Win reward: base=1.00, time_bonus=0.08 (moves: 169/200)


Self-play games:  59%|█████▉    | 13/22 [04:02<02:37, 17.52s/it]

Win reward: base=1.00, time_bonus=0.14 (moves: 145/200)


Self-play games:  64%|██████▎   | 14/22 [04:17<02:13, 16.70s/it]

Win reward: base=1.00, time_bonus=0.21 (moves: 117/200)


Self-play games:  73%|███████▎  | 16/22 [04:29<01:10, 11.72s/it]

Win reward: base=1.00, time_bonus=0.08 (moves: 169/200)


Self-play games: 100%|██████████| 22/22 [04:49<00:00, 13.17s/it]


[2025-04-22 18:47:40] Self-play completed in 289.81s, generated 5169 examples (17.8 games/s)
[2025-04-22 18:47:40] Processing 4 games for initial placement data
[2025-04-22 18:47:40] Extracted 913 initial placement examples
[2025-04-22 18:47:40] Skipping placement network training for iteration 302 (training every 5 iterations)
[2025-04-22 18:47:40] Training network...
Epoch 1/10: Loss 1.9000 (Value 0.0422, Policy 1.8578)
Epoch 2/10: Loss 1.8768 (Value 0.0332, Policy 1.8436)
Epoch 3/10: Loss 1.8447 (Value 0.0240, Policy 1.8207)
Epoch 4/10: Loss 1.8059 (Value 0.0174, Policy 1.7884)
Epoch 5/10: Loss 1.8331 (Value 0.0189, Policy 1.8142)
Epoch 6/10: Loss 1.8040 (Value 0.0133, Policy 1.7907)
Epoch 7/10: Loss 1.7790 (Value 0.0110, Policy 1.7680)
Epoch 8/10: Loss 1.7654 (Value 0.0104, Policy 1.7550)
Epoch 9/10: Loss 1.7735 (Value 0.0090, Policy 1.7645)
Epoch 10/10: Loss 1.7769 (Value 0.0073, Policy 1.7695)
[2025-04-22 18:47:59] Training completed in 18.73s
[2025-04-22 18:47:59] Iteration 302 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23506.843164
Win reward: base=1.00, time_bonus=0.19 (moves: 125/200)


Self-play games:   9%|▉         | 2/22 [01:54<16:01, 48.09s/it] 

Win reward: base=1.00, time_bonus=0.13 (moves: 149/200)


Self-play games:  14%|█▎        | 3/22 [02:02<09:26, 29.82s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  18%|█▊        | 4/22 [02:06<05:57, 19.84s/it]

Win reward: base=1.00, time_bonus=0.10 (moves: 161/200)


Self-play games:  45%|████▌     | 10/22 [02:56<02:01, 10.09s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [03:38<02:55, 17.55s/it]

Win reward: base=1.00, time_bonus=0.15 (moves: 141/200)


Self-play games:  64%|██████▎   | 14/22 [04:13<02:10, 16.32s/it]

Win reward: base=1.00, time_bonus=0.11 (moves: 157/200)


Self-play games:  68%|██████▊   | 15/22 [04:14<01:23, 11.91s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  77%|███████▋  | 17/22 [04:31<00:51, 10.37s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  82%|████████▏ | 18/22 [04:32<00:30,  7.63s/it]

Win reward: base=1.00, time_bonus=0.19 (moves: 125/200)


Self-play games:  95%|█████████▌| 21/22 [04:43<00:05,  5.09s/it]

Win reward: base=1.00, time_bonus=0.07 (moves: 173/200)


Self-play games: 100%|██████████| 22/22 [04:45<00:00, 12.95s/it]


[2025-04-22 18:52:44] Self-play completed in 285.02s, generated 5092 examples (17.9 games/s)
[2025-04-22 18:52:44] Processing 4 games for initial placement data
[2025-04-22 18:52:44] Extracted 647 initial placement examples
[2025-04-22 18:52:44] Skipping placement network training for iteration 303 (training every 5 iterations)
[2025-04-22 18:52:44] Training network...
Epoch 1/10: Loss 1.8792 (Value 0.0349, Policy 1.8443)
Epoch 2/10: Loss 1.8616 (Value 0.0247, Policy 1.8369)
Epoch 3/10: Loss 1.8529 (Value 0.0206, Policy 1.8323)
Epoch 4/10: Loss 1.8230 (Value 0.0140, Policy 1.8090)
Epoch 5/10: Loss 1.8275 (Value 0.0126, Policy 1.8149)
Epoch 6/10: Loss 1.7975 (Value 0.0117, Policy 1.7858)
Epoch 7/10: Loss 1.8014 (Value 0.0098, Policy 1.7916)
Epoch 8/10: Loss 1.7756 (Value 0.0085, Policy 1.7671)
Epoch 9/10: Loss 1.7777 (Value 0.0078, Policy 1.7699)
Epoch 10/10: Loss 1.7714 (Value 0.0069, Policy 1.7645)
[2025-04-22 18:53:03] Training completed in 18.79s
[2025-04-22 18:53:03] Iteration 303 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23598.665479
Win reward: base=1.00, time_bonus=0.29 (moves: 85/200)


Self-play games:   5%|▍         | 1/22 [01:27<30:33, 87.30s/it]

Win reward: base=1.00, time_bonus=0.07 (moves: 173/200)


Self-play games:  14%|█▎        | 3/22 [02:06<10:33, 33.32s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  18%|█▊        | 4/22 [02:07<06:15, 20.86s/it]

Win reward: base=1.00, time_bonus=0.09 (moves: 165/200)


Self-play games:  41%|████      | 9/22 [02:39<01:46,  8.16s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  45%|████▌     | 10/22 [02:40<01:12,  6.04s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [04:04<04:44, 28.46s/it]

Win reward: base=1.00, time_bonus=0.21 (moves: 117/200)


Self-play games:  59%|█████▉    | 13/22 [04:09<03:12, 21.38s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:53<00:00, 13.34s/it]


[2025-04-22 18:57:57] Self-play completed in 293.44s, generated 5136 examples (17.5 games/s)
[2025-04-22 18:57:57] Processing 4 games for initial placement data
[2025-04-22 18:57:57] Extracted 634 initial placement examples
[2025-04-22 18:57:57] Skipping placement network training for iteration 304 (training every 5 iterations)
[2025-04-22 18:57:57] Training network...
Epoch 1/10: Loss 1.8765 (Value 0.0381, Policy 1.8385)
Epoch 2/10: Loss 1.8345 (Value 0.0289, Policy 1.8057)
Epoch 3/10: Loss 1.8152 (Value 0.0187, Policy 1.7966)
Epoch 4/10: Loss 1.8325 (Value 0.0154, Policy 1.8171)
Epoch 5/10: Loss 1.8021 (Value 0.0101, Policy 1.7920)
Epoch 6/10: Loss 1.7999 (Value 0.0082, Policy 1.7917)
Epoch 7/10: Loss 1.7758 (Value 0.0075, Policy 1.7684)
Epoch 8/10: Loss 1.7623 (Value 0.0059, Policy 1.7565)
Epoch 9/10: Loss 1.7717 (Value 0.0056, Policy 1.7661)
Epoch 10/10: Loss 1.7624 (Value 0.0056, Policy 1.7567)
[2025-04-22 18:58:15] Training completed in 18.61s
[2025-04-22 18:58:15] Iteration 304 

Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23587.953780
Win reward: base=1.00, time_bonus=0.35 (moves: 61/200)


Self-play games:   5%|▍         | 1/22 [00:40<14:12, 40.61s/it]

Win reward: base=1.00, time_bonus=0.05 (moves: 181/200)


Self-play games:  14%|█▎        | 3/22 [02:09<11:57, 37.75s/it]

Win reward: base=1.00, time_bonus=0.15 (moves: 141/200)


Self-play games:  45%|████▌     | 10/22 [02:40<01:11,  5.94s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  50%|█████     | 11/22 [02:48<01:11,  6.53s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  55%|█████▍    | 12/22 [03:01<01:22,  8.24s/it]

Win reward: base=1.00, time_bonus=0.28 (moves: 89/200)


Self-play games:  59%|█████▉    | 13/22 [03:15<01:32, 10.24s/it]

Win reward: base=1.00, time_bonus=0.20 (moves: 121/200)


Self-play games:  64%|██████▎   | 14/22 [03:54<02:31, 18.91s/it]

Win reward: base=1.00, time_bonus=0.22 (moves: 113/200)


Self-play games:  68%|██████▊   | 15/22 [03:58<01:39, 14.21s/it]

Win reward: base=1.00, time_bonus=0.04 (moves: 185/200)


Self-play games:  73%|███████▎  | 16/22 [04:09<01:20, 13.42s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  77%|███████▋  | 17/22 [04:24<01:09, 13.84s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games:  95%|█████████▌| 21/22 [04:37<00:05,  5.42s/it]

Win reward: base=1.00, time_bonus=0.00 (moves: 200/200)


Self-play games: 100%|██████████| 22/22 [04:44<00:00, 12.93s/it]


[2025-04-22 19:03:00] Self-play completed in 284.47s, generated 5009 examples (17.6 games/s)
[2025-04-22 19:03:00] Processing 4 games for initial placement data
[2025-04-22 19:03:00] Extracted 540 initial placement examples
[2025-04-22 19:03:00] Training initial placement network on 3423 examples
Epoch 1/10: Loss = 1.3130, Accuracy = 0.0834
Epoch 2/10: Loss = 1.2991, Accuracy = 0.0834
Epoch 3/10: Loss = 1.2924, Accuracy = 0.0876
Epoch 4/10: Loss = 1.2879, Accuracy = 0.0881
Epoch 5/10: Loss = 1.2864, Accuracy = 0.0876
Epoch 6/10: Loss = 1.2846, Accuracy = 0.0858
Epoch 7/10: Loss = 1.2792, Accuracy = 0.0873
Epoch 8/10: Loss = 1.2792, Accuracy = 0.0855
Epoch 9/10: Loss = 1.2760, Accuracy = 0.0873
Epoch 10/10: Loss = 1.2725, Accuracy = 0.0884
[2025-04-22 19:03:05] Placement network training completed in 4.50s
[2025-04-22 19:03:05] Metrics: Loss = 1.2870, Accuracy = 0.0864
[2025-04-22 19:03:05] Saved placement network to models/placement_network.pt
[2025-04-22 19:03:05] Training network...


Evaluation games:  10%|█         | 1/10 [00:13<02:01, 13.53s/it]

[2025-04-22 19:03:37] Game 1: duration=13.53s, moves=120, our_VP=11, winner=0 VP=11


Evaluation games:  20%|██        | 2/10 [00:26<01:45, 13.19s/it]

[2025-04-22 19:03:50] Game 2: duration=12.95s, moves=200, our_VP=7, winner=0 VP=7


Evaluation games:  30%|███       | 3/10 [00:46<01:53, 16.22s/it]

[2025-04-22 19:04:10] Game 3: duration=19.82s, moves=165, our_VP=10, winner=0 VP=10


Evaluation games:  40%|████      | 4/10 [00:57<01:24, 14.08s/it]

[2025-04-22 19:04:21] Game 4: duration=10.81s, moves=200, our_VP=3, winner=2 VP=6


Evaluation games:  50%|█████     | 5/10 [01:08<01:06, 13.24s/it]

[2025-04-22 19:04:32] Game 5: duration=11.75s, moves=200, our_VP=4, winner=1 VP=7


Evaluation games:  60%|██████    | 6/10 [01:25<00:57, 14.37s/it]

[2025-04-22 19:04:49] Game 6: duration=16.54s, moves=200, our_VP=4, winner=0 VP=4


Evaluation games:  70%|███████   | 7/10 [01:37<00:40, 13.58s/it]

[2025-04-22 19:05:01] Game 7: duration=11.96s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games:  80%|████████  | 8/10 [01:56<00:30, 15.46s/it]

[2025-04-22 19:05:20] Game 8: duration=19.48s, moves=200, our_VP=9, winner=0 VP=9


Evaluation games:  90%|█████████ | 9/10 [02:11<00:15, 15.12s/it]

[2025-04-22 19:05:35] Game 9: duration=14.38s, moves=200, our_VP=6, winner=0 VP=6


Evaluation games: 100%|██████████| 10/10 [02:24<00:00, 14.47s/it]

[2025-04-22 19:05:48] Game 10: duration=13.49s, moves=200, our_VP=2, winner=2 VP=6
[2025-04-22 19:05:48] Evaluated 10 games in 144.73s (0.07 games/s)
[2025-04-22 19:05:48] Evaluation results: win_rate=0.70, avg_vp=6.20, avg_length=188.50, total_moves=1885
[2025-04-22 19:05:48] Eval resource usage: CPU 59.5%, RAM 12.3%, GPU peak memory 0.00 GB
[2025-04-22 19:05:48] Evaluation completed in 144.73s





[2025-04-22 19:05:49] Plotly metrics visualization saved to plots/training_metrics.html
[2025-04-22 19:05:49] Checkpoint saved: models/model_iter_305.pt
[2025-04-22 19:05:49] Saved placement network to models/placement_network.pt
[2025-04-22 19:05:50] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.8 MB)
[2025-04-22 19:05:50] Iteration 305 done in 454.89s
[2025-04-22 19:05:50] Resource usage: CPU 9.4%, RAM 12.3%, GPU peak memory 0.00 GB
[2025-04-22 19:05:50] Network parameter sum after training: 23622.341337
[2025-04-22 19:05:50] 
=== Iteration 306/300 ===
[2025-04-22 19:05:50] Starting self-play...


Self-play games:   0%|          | 0/22 [00:00<?, ?it/s]

[Worker 0] Network parameter sum: 23622.340637


Self-play games:   0%|          | 0/22 [00:44<?, ?it/s]Process ForkPoolWorker-3:
Process ForkPoolWorker-8:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-10:
Process ForkPoolWorker-2:
Process ForkPoolWorker-11:
Process ForkPoolWorker-9:
Process ForkPoolWorker-7:
Process ForkPoolWorker-6:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/content/drive/MyDrive/CatanRL/AlphaZero/training/self_play.py", line 98, in play_one_game_entry
    game.process_ai_turn()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
  

[2025-04-22 19:06:35] Training interrupted by user; saving current model...


  File "/content/drive/MyDrive/CatanRL/AlphaZero/training/self_play.py", line 98, in play_one_game_entry
    game.process_ai_turn()
  File "/content/drive/MyDrive/CatanRL/AlphaZero/agent/alpha_agent.py", line 388, in get_action
    action_probs, value_estimate = self.mcts.search(state)
                                   ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/copy.py", line 172, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/drive/MyDrive/CatanRL/AlphaZero/training/self_play.py", line 98, in play_one_game_entry
    game.process_ai_turn()
  File "/content/drive/MyDrive/CatanRL/AlphaZero/core/mcts.py", line 327, in search
    self._process_evaluation(eval_node, eval_path, policy, value)
  File "/content/drive/MyDrive/CatanRL/AlphaZero/training/self_play.py", line 98, in play_one_game_entry
    game.process_ai_turn()

  File "/usr/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **k

[2025-04-22 19:06:35] Checkpoint saved: models/model_iter_305.pt
[2025-04-22 19:06:35] Saved placement network to models/placement_network.pt


Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/content/drive/MyDrive/CatanRL/AlphaZero/training/self_play.py", line 98, in play_one_game_entry
    game.process_ai_turn()
  File "/content/drive/MyDrive/CatanRL/AlphaZero/trainin

[2025-04-22 19:06:39] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.8 MB)
[2025-04-22 19:06:39] 
=== Training Finished (8620.94s / 2.39h) ===
[2025-04-22 19:06:40] Checkpoint saved: models/model_iter_305.pt
[2025-04-22 19:06:40] Saved placement network to models/placement_network.pt
[2025-04-22 19:06:43] Replay buffer saved: models/latest_buffer.pkl (50000 examples, 201.8 MB)
[2025-04-22 19:06:46] Plotly metrics visualization saved to plots/training_metrics.html


In [None]:
# ===== CPU info, (self-play is cpu based for multiprocessing) =====
# ===== CPU model, core / thread counts, and base turbo freq =====
!lscpu | egrep 'Model name|Socket|Thread|Core|MHz'

# ===== Current clock speed of every logical core (updates once) =====
!grep \"cpu MHz\" /proc/cpuinfo | head

# ===== Simple "how fast is it?" micro-benchmark =====
# Times one large float32 matrix product and reports the achieved GFLOP/s.
import time
import numpy as np

N = 6000
# Fixed seed so every run multiplies the same matrices. Drawing float32 directly
# avoids the temporary float64 arrays that randn(N, N).astype(np.float32) creates.
_bench_rng = np.random.default_rng(0)
a = _bench_rng.standard_normal((N, N), dtype=np.float32)
b = _bench_rng.standard_normal((N, N), dtype=np.float32)

# Small untimed product first, so one-off start-up cost of the matmul backend
# is not billed to the measurement below.
_warmup = a[:256, :256] @ b[:256, :256]

# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
t0 = time.perf_counter()
c = a @ b          # one large matrix product (typically handled by NumPy's BLAS backend)
elapsed = time.perf_counter() - t0
gflops = 2*N**3 / elapsed / 1e9   # an (N x N) @ (N x N) product costs about 2*N^3 flops

print(f"\n{elapsed:.3f} s   ≈ {gflops:.1f} GFLOP/s (single large mat‑mul)")


In [None]:
!grep -m1 'model name' /proc/cpuinfo
!nproc
