In [None]:
from distutils.util import strtobool
import matplotlib.pyplot as plt

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import maze_library
from torch.utils.tensorboard import SummaryWriter


In [None]:
# Search grid: every entry is a list of candidate values. The grid is handed to
# generate_param_combinations further down to produce the individual run configs.
hyperparams = dict(
    learning_rate=[1e-5],
    episodes=[400],
    hidden_layers=[[512, 256, 128]],
    activation=["relu"],
    policy_lr=[1e-5],
    value_lr=[1e-5],
    target_kl_div=[0.02],
    max_policy_train_iters=[80],
    value_train_iters=[80],
)

# Settings merged unchanged into every run configuration.
shared_params = dict(
    steps_per_episode=1000,
    random_seed=42,
    maze_width=7,
    maze_height=7,
    log_every=1,
    plot_every=1100,
    mini_explore_runs_per_episode=3,
    mini_exploit_runs_per_episode=2,
)

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

In [None]:
# Activation names accepted in the configuration, mapped to their torch module classes.
activation_map = {
    "relu": nn.ReLU,
    "elu": nn.ELU,
    "tanh": nn.Tanh,
    "leaky_relu": nn.LeakyReLU,
}


class ActorCriticWithCNN(nn.Module):
    """Actor-critic network with a small CNN front-end for the visual part of an observation.

    Observations are flat vectors. The last ``visual_feature_size`` entries are reshaped
    to ``(visual_channels, grid_size, grid_size)`` and passed through the CNN; the
    remaining leading entries are concatenated with the flattened CNN output and fed to
    a shared MLP trunk, which is followed by separate policy and value heads.

    Args:
        input_size: length of one flat observation vector; must be at least
            ``visual_channels * grid_size * grid_size``.
        action_space_size: number of discrete actions (width of the policy logits).
        hidden_layers: sizes of the shared fully connected layers, in order.
        activation: key of ``activation_map`` selecting the non-linearity.

    Raises:
        ValueError: if ``input_size`` is too small to contain the visual block.
        KeyError: if ``activation`` is not a key of ``activation_map``.
    """

    def __init__(self, input_size, action_space_size, hidden_layers=(128, 128, 128), activation="relu"):
        super().__init__()
        self.visual_channels = 7
        self.grid_size = 5
        # Derived rather than hard-coded so it cannot drift from the two values above (7 * 5 * 5 = 175).
        self.visual_feature_size = self.visual_channels * self.grid_size * self.grid_size
        if input_size < self.visual_feature_size:
            raise ValueError(
                f"input_size={input_size} is smaller than the visual block "
                f"({self.visual_feature_size} features)"
            )
        self.non_visual_input_size = input_size - self.visual_feature_size

        activation_cls = activation_map[activation]

        # CNN for visual features; padding=1 keeps the spatial size, the pooling halves it.
        self.cnn = nn.Sequential(
            nn.Conv2d(self.visual_channels, 32, kernel_size=3, padding=1),
            activation_cls(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            activation_cls(),
            nn.MaxPool2d(2),
        )

        # MaxPool2d(2) floors the spatial size: 5 -> 2, giving 64 * 2 * 2 features.
        pooled_size = self.grid_size // 2
        cnn_output_size = 64 * pooled_size * pooled_size

        # Shared layers for non-visual + visual combined
        self.shared_layers = nn.Sequential()
        prev_size = cnn_output_size + self.non_visual_input_size
        for i, hidden_size in enumerate(hidden_layers):
            self.shared_layers.add_module(f"fc{i}", nn.Linear(prev_size, hidden_size))
            self.shared_layers.add_module(f"act{i}", activation_cls())
            prev_size = hidden_size

        # Policy head
        self.policy_layers = nn.Sequential(
            nn.Linear(prev_size, 128),
            activation_cls(),
            nn.Linear(128, action_space_size),
        )

        # Value head
        self.value_layers = nn.Sequential(
            nn.Linear(prev_size, 128),
            activation_cls(),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Return ``(policy_logits, value)`` for a 2-D batch ``x`` of flat observations.

        ``policy_logits`` has one column per action and ``value`` has a single column.
        """
        visual_flat = x[:, -self.visual_feature_size:]
        non_visual = x[:, :-self.visual_feature_size]

        visual = visual_flat.reshape(-1, self.visual_channels, self.grid_size, self.grid_size)
        visual_out = self.cnn(visual).flatten(start_dim=1)

        x_combined = torch.cat((non_visual, visual_out), dim=1)
        z = self.shared_layers(x_combined)

        return self.policy_layers(z), self.value_layers(z)

    def policy(self, x):
        """Return only the policy logits for ``x`` (a full forward pass is still run)."""
        policy_logits, _ = self.forward(x)
        return policy_logits

    def value(self, x):
        """Return only the value estimates for ``x`` (a full forward pass is still run)."""
        _, value = self.forward(x)
        return value

In [None]:
class PPOTrainer():
    """Runs clipped-PPO policy updates and squared-error value updates on an actor-critic.

    The wrapped model must expose ``shared_layers``, ``policy_layers`` and
    ``value_layers`` sub-modules plus ``policy(obs)`` and ``value(obs)`` methods. If it
    also has a ``cnn`` sub-module, its parameters are trained together with the shared
    layers by both optimizers.

    Args:
        actor_critic: the model to train.
        ppo_clip_val: half-width of the clipping interval around a ratio of 1.
        target_kl_div: policy training stops early once the approximate KL divergence
            reaches this value.
        max_policy_train_iters: maximum number of policy gradient steps per call.
        value_train_iters: number of value gradient steps per call.
        policy_lr: Adam learning rate of the policy optimizer.
        value_lr: Adam learning rate of the value optimizer.
    """

    def __init__(self, actor_critic, ppo_clip_val=0.2, target_kl_div=0.01, 
                 max_policy_train_iters=80, value_train_iters=80,
                 policy_lr=3e-4, value_lr=1e-2):
        
        self.ac = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters

        # Feature extractors used by both heads. The CNN has to be handed to the
        # optimizers as well, otherwise it stays at its random initialisation forever.
        trunk_params = list(self.ac.shared_layers.parameters())
        cnn = getattr(self.ac, "cnn", None)
        if cnn is not None:
            trunk_params = list(cnn.parameters()) + trunk_params

        policy_params = trunk_params + list(self.ac.policy_layers.parameters())
        self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

        value_params = trunk_params + list(self.ac.value_layers.parameters())
        self.value_optim = optim.Adam(value_params, lr=value_lr)

    def train_policy(self, obs, acts, old_log_probs, gaes):
        """Take up to ``max_policy_train_iters`` clipped-PPO steps on one batch.

        Args:
            obs: batch of observations fed to ``ac.policy``.
            acts: actions taken, one per observation.
            old_log_probs: log-probabilities of ``acts`` under the policy that collected them.
            gaes: advantage estimates, one per observation.
        """
        for _ in range(self.max_policy_train_iters):
            self.policy_optim.zero_grad()

            logits = self.ac.policy(obs)
            dist = Categorical(logits=logits)
            new_log_probs = dist.log_prob(acts)

            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(1 - self.ppo_clip_val, 1 + self.ppo_clip_val)

            # Pessimistic (minimum) surrogate objective, negated because the optimizer minimises.
            policy_loss = -torch.min(policy_ratio * gaes, clipped_ratio * gaes).mean()

            policy_loss.backward()
            self.policy_optim.step()

            # Approximate KL from the log-probs evaluated before this step; only used as a
            # stopping signal, so it is detached from the graph.
            kl_div = (old_log_probs - new_log_probs.detach()).mean()
            if kl_div >= self.target_kl_div:
                break

    def train_value(self, obs, returns):
        """Take ``value_train_iters`` mean-squared-error steps fitting ``ac.value(obs)`` to ``returns``."""
        for _ in range(self.value_train_iters):
            self.value_optim.zero_grad()

            # The value head yields one column per sample; align it with ``returns`` so the
            # subtraction is element-wise instead of broadcasting to an N x N matrix.
            values = self.ac.value(obs).reshape(returns.shape)
            value_loss = ((returns - values) ** 2).mean()

            value_loss.backward()
            self.value_optim.step()

In [None]:
def discount_rewards(rewards, gamma=0.99):
    """
    Compute the discounted reward-to-go for every step of a reward sequence.

    Entry ``t`` of the result equals ``rewards[t] + gamma * result[t + 1]``, with the
    last entry being just the final reward. Returns a numpy array of floats with the
    same length as ``rewards``.
    """
    running = float(rewards[-1])
    discounted = [running]
    # Walk backwards from the second-to-last reward, carrying the discounted tail along.
    for reward in rewards[-2::-1]:
        running = float(reward) + gamma * running
        discounted.append(running)
    discounted.reverse()
    return np.array(discounted)

def calculate_gaes(rewards, values, gamma=0.99, decay=0.97):
    """
    Compute Generalized Advantage Estimation (GAE) advantages for one trajectory.

    The value following the final step is taken to be 0. ``gamma`` is the discount
    factor and ``decay`` the GAE smoothing factor. Returns a numpy array with one
    advantage per step.
    Paper: https://arxiv.org/pdf/1506.02438.pdf
    """
    # Value of the following state for every step (0 after the last one).
    following_values = np.concatenate([values[1:], [0]])

    # One-step temporal-difference errors.
    td_errors = []
    for reward, value, following in zip(rewards, values, following_values):
        td_errors.append(reward + gamma * following - value)

    # Accumulate the errors backwards with a combined decay of decay * gamma per step.
    carried = td_errors[-1]
    advantages = [carried]
    for td_error in td_errors[-2::-1]:
        carried = td_error + decay * gamma * carried
        advantages.append(carried)
    advantages.reverse()

    return np.array(advantages)

In [None]:
def rollout(hyperprm, model, env):
    """
    Collect one training batch by playing every mini-episode of a maze episode.

    The environment is regenerated once; ``env.smart_reset`` is then called before each
    mini-episode. The first ``mini_explore_runs_per_episode`` mini-episodes are recorded
    as explore runs and the remaining ones as exploit runs. Every mini-episode lasts at
    most ``steps_per_episode`` steps and ends early when the environment reports
    ``done`` or ``truncated``.

    Args:
        hyperprm: dict with "steps_per_episode" and "mini_explore_runs_per_episode";
            "mini_exploit_runs_per_episode" is optional and defaults to 2.
        model: callable returning ``(policy_logits, value)`` for a batch of observations.
        env: maze environment providing ``reset_and_regenerate``, ``smart_reset`` and
            ``take_action``.

    Returns:
        ``(train_data, explore_rewards, exploit_rewards)``. ``train_data`` is a list of
        five arrays ``[obs, act, reward, gae, act_log_prob]`` with one entry per step;
        the advantages are computed over the concatenation of all mini-episodes. The
        two reward lists hold the per-step rewards of the explore and exploit runs.
    """
    n_explore_runs = hyperprm["mini_explore_runs_per_episode"]
    # .get keeps configurations without this key on the previously hard-coded count.
    n_exploit_runs = hyperprm.get("mini_exploit_runs_per_episode", 2)

    ### Create data storage
    train_data = [[], [], [], [], []]  # obs, act, reward, values, act_log_probs
    obs = env.reset_and_regenerate()

    report_cards = []
    current_episode_explore_rewards = []
    current_episode_exploit_rewards = []
    for mini_episode in range(n_explore_runs + n_exploit_runs):
        # NOTE(review): `obs` is not refreshed after smart_reset, so the first step of a
        # mini-episode acts on the previous observation - confirm this is intended.
        env.smart_reset(mini_episode)
        for _ in range(hyperprm["steps_per_episode"]):
            # Pure inference: no gradients are needed while collecting data.
            with torch.no_grad():
                logits, val = model(torch.tensor([obs], dtype=torch.float32,
                                                 device=DEVICE))
            act_distribution = Categorical(logits=logits)
            act = act_distribution.sample()
            act_log_prob = act_distribution.log_prob(act).item()

            act, val = act.item(), val.item()
            action = maze_library.create_action(act, mini_episode)
            next_obs, reward, done, truncated = env.take_action(action)
            if mini_episode < n_explore_runs:
                current_episode_explore_rewards.append(reward)
            else:
                current_episode_exploit_rewards.append(reward)
            for i, item in enumerate((obs, act, reward, val, act_log_prob)):
                train_data[i].append(item)

            obs = next_obs
            if done or truncated:
                break
    # NOTE(review): the score is collected but never returned; the call is kept in case
    # get_score has side effects - confirm and remove if it is a pure getter.
    report_cards.append(maze_library.get_score(env))
    train_data = [np.asarray(x) for x in train_data]

    ### Replace the raw value estimates with advantage estimates
    train_data[3] = calculate_gaes(train_data[2], train_data[3])

    return train_data, current_episode_explore_rewards, current_episode_exploit_rewards

In [None]:
# Maze environment used by the experiment cells below, sized and configured from shared_params.
env = maze_library.init_environment(
    shared_params["maze_width"],
    shared_params["maze_height"],
    mini_exploit_runs_per_episode=shared_params["mini_exploit_runs_per_episode"],
    mini_explore_runs_per_episode=shared_params["mini_explore_runs_per_episode"],
)


In [None]:
import os


def run_single_experiment(hyperprm, timestamp,run_id):
    """
    Train a fresh ActorCriticWithCNN with PPO on the module-level ``env`` and log progress.

    For each of ``hyperprm["episodes"]`` episodes a rollout is collected, shuffled and
    used for one policy and one value training pass. Every ``log_every`` episodes a
    summary is printed and the environment state is written as JSON into
    ``../mazeLogs/{timestamp}DoubleDQNMaze/Run{run_id}``.

    Args:
        hyperprm: dict of merged shared and per-run hyperparameters. If it contains
            "random_seed", NumPy and torch are seeded with ``random_seed + run_id``.
        timestamp: string used in the log folder name.
        run_id: integer index of this run; used in the folder name, log lines and seed.

    Returns:
        dict with "params" (the given hyperparameters), "explore_rewards" and
        "exploit_rewards" (per-step rewards over all episodes) and "score" (one
        score object per episode).
    """
    seed = hyperprm.get("random_seed")
    if seed is not None:
        # Offsetting by run_id keeps repeated runs of one configuration distinct yet
        # reproducible. NOTE(review): whether maze_library has its own random source
        # that also needs seeding cannot be told from here - confirm.
        np.random.seed(seed + run_id)
        torch.manual_seed(seed + run_id)

    filecount = 0
    # The "DoubleDQN" naming in the paths is kept so existing log locations stay valid.
    folder_name = f"../mazeLogs/{timestamp}DoubleDQNMaze/Run{run_id}"
    os.makedirs(folder_name, exist_ok=True) 
    model = ActorCriticWithCNN(env.input_shape(), env.output_shape(), hidden_layers=hyperprm["hidden_layers"], activation=hyperprm["activation"])
    model = model.to(DEVICE)
    all_episode_explore_rewards = []
    all_episode_exploit_rewards = []
    report_cards = []
    ppo = PPOTrainer(
        model,
        policy_lr = hyperprm["policy_lr"],
        value_lr = hyperprm["value_lr"],
        target_kl_div = hyperprm["target_kl_div"],
        max_policy_train_iters = hyperprm["max_policy_train_iters"],
        value_train_iters = hyperprm["value_train_iters"])
    for episode_idx in range(hyperprm["episodes"]):
        # Perform rollout
        train_data, explore, exploit = rollout(hyperprm, model, env)
        all_episode_explore_rewards.extend(explore)
        all_episode_exploit_rewards.extend(exploit)
        # Shuffle: one permutation shared by all arrays so the samples stay aligned
        permute_idxs = np.random.permutation(len(train_data[0]))

        # Policy data
        obs = torch.tensor(train_data[0][permute_idxs],
                            dtype=torch.float32, device=DEVICE)
        acts = torch.tensor(train_data[1][permute_idxs],
                            dtype=torch.int32, device=DEVICE)
        gaes = torch.tensor(train_data[3][permute_idxs],
                            dtype=torch.float32, device=DEVICE)
        act_log_probs = torch.tensor(train_data[4][permute_idxs],
                                    dtype=torch.float32, device=DEVICE)

        # Value data: discounted returns are computed in time order, then shuffled
        returns = discount_rewards(train_data[2])[permute_idxs]
        returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE)

        # Train model
        ppo.train_policy(obs, acts, act_log_probs, gaes)
        ppo.train_value(obs, returns)
        score = maze_library.get_score(env)
        report_cards.append(score)
        if (episode_idx + 1) % hyperprm["log_every"] == 0:
            print(f"""Run {run_id}, Total steps {score.total_steps},Average Solve Score {float(f"{score.average_run_score/ score.dijkstra_shortest_path_score:.2g}") } filecount: {filecount} 
                  Explore Rewards sum: {float(f"{np.sum(np.array(explore)):.2g}")} Exploit Rewards sum: {float(f"{np.sum(np.array(exploit)):.2g}")} 
                  Success Rate in solving {score.success_rate_in_exploitation} Episode: {episode_idx}""")
            with open(f'{folder_name}/doubledqn{filecount}.json', 'w') as file:
                file.write(env.to_json_python())
            filecount+=1
            
            
    return {
        "params": hyperprm,
        "explore_rewards": all_episode_explore_rewards,
        "exploit_rewards": all_episode_exploit_rewards,
        "score" : report_cards,
    }

In [None]:
import datetime

from utils import generate_param_combinations, plot_episode_returns


# Expand the grid once, then train every combination twice, collecting all outcomes.
results = []
combo_params = list(generate_param_combinations(hyperparams))
timestamp = datetime.datetime.now().strftime("%d-%m_%H-%M")
run_count = 0
print(f"Totals combinations = {len(combo_params)}")
for i, combo in enumerate(combo_params):
    for j in range(2):
        # Per-combination values override the shared defaults on key clashes.
        merged_params = {**shared_params, **combo}
        outcome = run_single_experiment(merged_params, timestamp, run_id=run_count)
        results.append(outcome)
        run_count += 1
        plot_episode_returns(outcome["explore_rewards"], outcome["exploit_rewards"], 10)