In [1]:
import itertools
from datetime import datetime
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from typing import List, Tuple, Optional, Dict
from collections import deque
import pickle
import matplotlib.pyplot as plt

from learners.mo_dqn.mo_dqn_trainer import ReplayBuffer, MODQNTrainer
from learners.mo_dqn.mo_dqn_policy import MODQN
import gymnasium as gym
import mo_gymnasium as mo_gym

  from pkg_resources import resource_stream, resource_exists


In [2]:
def smooth(values, weight=0.9):
    """
    Exponential moving average (EMA) smoothing.

    Args:
        values: Sized, indexable sequence of numbers (e.g. a list or a
            1-D NumPy array).
        weight: Smoothing factor. Each output is computed as
            ``previous * weight + (1 - weight) * current``, so a weight
            close to 1 gives more smoothing.

    Returns:
        A list with one smoothed value per input element. The running
        average is seeded with the first element. An empty input yields
        an empty list.
    """
    # Seeding from values[0] below would raise IndexError on empty input.
    if len(values) == 0:
        return []

    smoothed = []
    last = values[0]
    for v in values:
        last = last * weight + (1 - weight) * v
        smoothed.append(last)
    return smoothed


def _plot_series(raw, smoothed, title, xlabel, ylabel):
    """Draw one figure: the raw series (faded) overlaid with its smoothed version."""
    plt.figure(figsize=(6, 4))
    plt.plot(raw, alpha=0.3)
    plt.plot(smoothed)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


def plot_logs(log_file_path, smooth_factor=0.9):
    """
    Load pickled training logs and plot every series they contain.

    The pickle is expected to hold a dict-like object mapping a log name to
    a sequence of values. Two kinds of entries are plotted:

    - Every key other than "Multi-Objective Returns" is treated as a scalar
      series and gets one figure (raw values plus an EMA-smoothed curve).
      Empty series are skipped with a printed notice.
    - "Multi-Objective Returns", if present and non-empty, is converted to a
      2-D array (one row per entry, one column per objective) and each
      objective gets its own figure.

    Args:
        log_file_path: Path to the ``.pkl`` file holding the logs.
        smooth_factor: EMA weight forwarded to ``smooth``.
    """
    mo_key = "Multi-Objective Returns"

    # SECURITY: pickle.load can execute arbitrary code; only open log files
    # that were produced by a trusted training run.
    with open(log_file_path, "rb") as f:
        logs = pickle.load(f)

    # ----------------------------
    # Plot scalar logs
    # ----------------------------
    for key in logs:
        if key == mo_key:
            continue

        values = logs[key]

        if len(values) == 0:
            print(f"Skipping {key} (empty)")
            continue

        _plot_series(
            values,
            smooth(values, weight=smooth_factor),
            title=key,
            xlabel="Episode / Update Index",
            ylabel=key.replace("_", " "),
        )

    # ----------------------------
    # Plot multi-objective returns
    # ----------------------------
    # .get(): a log file without this entry is treated as "nothing to plot"
    # instead of failing with KeyError after the scalar plots were drawn.
    mo_returns = logs.get(mo_key, [])

    if len(mo_returns) > 0:
        mo_array = np.array(mo_returns)
        num_obj = mo_array.shape[1]

        for obj_idx in range(num_obj):
            raw = mo_array[:, obj_idx]
            _plot_series(
                raw,
                smooth(raw, weight=smooth_factor),
                title=f"MO Return – Objective {obj_idx}",
                xlabel="Episode",
                ylabel=f"Return[{obj_idx}]",
            )

## Environment 1: Resource Gathering


### Learn the Policy

In [None]:
# ----------------------------
# Reproducibility
# ----------------------------
# Seed the global RNGs of every library imported by this notebook so that a
# Restart & Run All starts from the same random state.
# NOTE(review): the environment and the trainer may draw from their own RNGs
# (e.g. one seeded through env.reset(seed=...)); confirm whether they need
# explicit seeding as well.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ----------------------------
# Run length (forwarded to trainer.train below)
# ----------------------------
num_episodes = 10000
ep_len = 100
warmup_ep_len = 50

# ----------------------------
# Hyperparameters
# ----------------------------
hyperparameters = {
    'lr': 0.0005,
    'gamma': 0.99,
    'batch_size': 256,
    # One entry per objective (three entries, matching num_objectives below).
    'utility_fn': [0.4, 0.3, 0.3],
    'layer_sizes': [128, 128],
    'epsilon_decay': 0.9997,
    'updates_per_episode': 50,
    'max_buffer_size': 50000
}

# Dimensions are hard-coded rather than read from the environment.
# NOTE(review): presumably these match resource-gathering-v0's action,
# observation and reward spaces; verify if the environment is swapped.
num_actions, num_obs, num_objectives = 4, 4, 3
env = mo_gym.make("resource-gathering-v0")

# Init classes
policy = MODQN(
    hyperparameters['utility_fn'],
    num_actions, num_obs, num_objectives,
    hyperparameters['layer_sizes']
)

replay_buffer = ReplayBuffer(max_size=hyperparameters['max_buffer_size'])

trainer = MODQNTrainer(
    policy,
    hyperparameters['lr'],
    hyperparameters['gamma'],
    hyperparameters['batch_size'],
    replay_buffer,
    env,
    epsilon_decay=hyperparameters['epsilon_decay'],
    updates_per_episode=hyperparameters['updates_per_episode']
)

# Long-running step: runs the full training loop.
trainer.train(num_episodes, ep_len=ep_len, warmup_ep_len=warmup_ep_len)

# Persist the trainer's logs; the plotting cell reads this file back via
# `filepath`.
filepath = "training_logs_resource_gathering.pkl"
with open(filepath, "wb") as f:
    pickle.dump(trainer.logs, f)
print(f"Logs saved to {filepath}")

### Make Plots

In [None]:
# Plot the training curves stored at `filepath`, using an EMA weight of 0.85.
plot_logs(log_file_path=filepath, smooth_factor=0.85)