 # DQN Training in Maze Environment (Hyperparameterized)

In [None]:
import maze_library
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import datetime
import random
from collections import defaultdict
import os
from utils import PrioritizedReplayBuffer, generate_param_combinations, save_experiment_results_with_pickle, load_experiment_results_with_pickle




 ## Hyperparameters

In [None]:


# Search space: each key maps to a list of candidate values. The list form is
# what generate_param_combinations receives (presumably one dict is produced
# per combination of values -- confirm against utils).
hyperparams = dict(
    learning_rate=[5e-4],
    gamma=[0.97],
    # Used as a fraction of the training episodes over which epsilon decays.
    epsilon_decay_episodes=[0.9],
    batch_size=[64],
    replay_buffer_size=[10000],
    episodes=[6000],
    hidden_layers=[
        [512, 256, 128],
    ],
    activation=["relu"],
    # Number of episodes between target-network synchronisations.
    target_dqn_update=[30],
    gen_algorithm=["BinaryTree"],
    maze_size=[7],
)

# Settings shared by every run. They are merged with each combination, and a
# key that also exists in `hyperparams` (replay_buffer_size) is overridden by
# the search-space value.
shared_params = dict(
    test_episodes=200,
    epsilon_start=1.0,
    epsilon_min=0.01,
    replay_buffer_size=10000,
    steps_per_episode=1000,
    random_seed=42,
    log_every=400,
    q_log_every=2000,
    plot_every=1100,
    # Fraction of the training episodes played before gradient updates begin.
    train_after=0.2,
    mini_explore_runs_per_episode=3,
    mini_exploit_runs_per_episode=2,
)

# Available Maze Generation Algorithms

In [None]:
# Show what maze_library.maze_generation_algorithms() returns; the same call
# supplies the collection iterated during the test phase of
# run_single_experiment.
print(maze_library.maze_generation_algorithms())

 ## Seeding and Device Setup

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook draws from.

    Seeding NumPy alone is not enough for reproducible runs: the
    epsilon-greedy policy uses the stdlib ``random`` module and the network
    weights are initialised from PyTorch's generator.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


# Same value as shared_params["random_seed"]; keep the two in sync.
seed_everything(42)

In [None]:
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")


 ## Model Definition

In [None]:
# Maps the activation names accepted in the hyperparameters to their
# torch.nn module classes.
activation_map = {
    "relu": nn.ReLU,
    "elu": nn.ELU,
    "tanh": nn.Tanh,
    "sigmoid": nn.Sigmoid,
    # Add more if needed
}


class DQNWithCNN(nn.Module):
    """Q-network with a small CNN branch for the trailing visual features.

    The last ``visual_feature_size`` entries of each flat observation are
    reshaped to ``[batch, 7, 5, 5]`` and passed through two convolutions and
    a max-pool. The result is concatenated with the remaining (non-visual)
    entries and fed to a stack of fully connected layers that outputs one
    Q-value per action.

    Args:
        input_size: Length of the flat observation vector. Must be at least
            ``7 * 5 * 5 = 175``.
        output_size: Number of actions (width of the output layer).
        device: Stored on the instance as ``self.device``; the module is not
            moved here, callers do that with ``.to(...)``.
        hidden_layers: Sizes of the fully connected hidden layers.
        activation: Key into ``activation_map``.

    Raises:
        ValueError: If ``input_size`` is smaller than the visual block.
        KeyError: If ``activation`` is not a key of ``activation_map``.
    """

    def __init__(self, input_size, output_size, device, hidden_layers=(64, 64, 64), activation="elu"):
        super().__init__()
        self.device = device

        # Layout of the visual block at the end of the observation vector.
        self.visual_channels = 7
        self.grid_size = 5
        self.visual_feature_size = self.visual_channels * self.grid_size * self.grid_size  # 5x5x7 = 175

        if input_size < self.visual_feature_size:
            raise ValueError(
                f"input_size must be at least {self.visual_feature_size} "
                f"(the visual block), got {input_size}"
            )

        # Everything in front of the visual block is treated as non-visual.
        self.non_visual_input_size = input_size - self.visual_feature_size

        make_activation = activation_map[activation]

        # CNN for visual features (input shape: [batch, 7, 5, 5])
        conv_channels = 64
        self.cnn = nn.Sequential(
            nn.Conv2d(self.visual_channels, 32, kernel_size=3, padding=1),  # output: [batch, 32, 5, 5]
            make_activation(),
            nn.Conv2d(32, conv_channels, kernel_size=3, padding=1),  # output: [batch, 64, 5, 5]
            make_activation(),
            nn.MaxPool2d(2),  # output: [batch, 64, 2, 2]
        )

        # padding=1 keeps the 5x5 grid; MaxPool2d(2) floors it to 2x2.
        pooled_side = self.grid_size // 2
        cnn_output_size = conv_channels * pooled_side * pooled_side

        # Fully connected head over [non-visual features, CNN features].
        self.fc_layers = nn.Sequential()
        prev_size = cnn_output_size + self.non_visual_input_size
        for i, hidden_size in enumerate(hidden_layers):
            self.fc_layers.add_module(f"fc{i}", nn.Linear(prev_size, hidden_size))
            self.fc_layers.add_module(f"act{i}", make_activation())
            prev_size = hidden_size

        # Output layer: one Q-value per action.
        self.fc_layers.add_module("output", nn.Linear(prev_size, output_size))

    def forward(self, x):
        """Return Q-values of shape ``[batch, output_size]`` for ``x`` of shape ``[batch, input_size]``."""
        # Split input into visual (last 175 features) and non-visual parts.
        visual_flat = x[:, -self.visual_feature_size:]
        non_visual = x[:, :-self.visual_feature_size]

        # reshape (rather than view) also copes with non-contiguous slices.
        visual = visual_flat.reshape(-1, self.visual_channels, self.grid_size, self.grid_size)

        # Process visual features through the CNN and flatten per sample.
        visual_out = self.cnn(visual).flatten(start_dim=1)

        # Concatenate non-visual and visual features, then run the FC head.
        x_combined = torch.cat((non_visual, visual_out), dim=1)
        return self.fc_layers(x_combined)


def create_double_dqn_model(hyperprm, input_shape, n_outputs):
    """Build the online/target network pair, optimizer and loss for Double DQN.

    Args:
        hyperprm: Dict providing ``hidden_layers``, ``activation`` and
            ``learning_rate``.
        input_shape: Observation length passed to ``DQNWithCNN``.
        n_outputs: Number of actions passed to ``DQNWithCNN``.

    Returns:
        Tuple ``(model, target_model, optimizer, loss_fn)``. Both networks
        live on ``DEVICE`` and the target network starts with the same
        weights as the online network.
    """
    def build_network():
        net = DQNWithCNN(
            input_shape,
            n_outputs,
            DEVICE,
            hyperprm["hidden_layers"],
            hyperprm["activation"],
        )
        return net.to(DEVICE)

    online_net = build_network()
    target_net = build_network()
    # Start the target network as an exact copy of the online network.
    target_net.load_state_dict(online_net.state_dict())

    # Only the online network is optimised; the target is synced by copying.
    nadam = torch.optim.NAdam(online_net.parameters(), lr=hyperprm["learning_rate"])
    huber = nn.HuberLoss()
    return online_net, target_net, nadam, huber


 ## Policy and Replay Buffer

In [None]:
def epsilon_greedy_policy(state, epsilon, model, n_outputs):
    """Pick an action index epsilon-greedily.

    Args:
        state: Observation convertible to a float32 tensor.
        epsilon: Probability of taking a uniformly random action.
        model: Q-network called on a batch containing only ``state``.
        n_outputs: Number of available actions.

    Returns:
        An ``int`` in ``[0, n_outputs - 1]``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, n_outputs - 1)

    # Greedy branch: no gradients are needed just to rank the Q-values.
    with torch.no_grad():
        batch_of_one = torch.tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)
        best_index = torch.argmax(model(batch_of_one))
    return int(best_index.item())

In [None]:
def play_one_step(env, state, epsilon, model, replay_buffer, run, n_outputs):
    """Take one epsilon-greedy step in ``env`` and record the transition.

    Args:
        env: Environment exposing ``take_action``.
        state: Current observation.
        epsilon: Exploration probability for the policy.
        model: Q-network used for the greedy choice.
        replay_buffer: Buffer whose ``append`` receives the transition.
        run: Value forwarded to ``maze_library.create_action`` together with
            the action index (callers pass the mini-episode index).
        n_outputs: Number of available actions.

    Returns:
        Tuple ``(next_state, reward, done, truncated)`` as returned by the
        environment.
    """
    chosen = epsilon_greedy_policy(state, epsilon, model, n_outputs)
    outcome = env.take_action(maze_library.create_action(chosen, run))
    next_state, reward, done, truncated = outcome

    # The buffer stores the raw action index, not the library action object.
    replay_buffer.append(state, chosen, reward, next_state, done, truncated)
    return next_state, reward, done, truncated

In [None]:
def _per_sample_loss(loss_fn, predictions, targets):
    """Evaluate ``loss_fn`` element-wise when it exposes a ``reduction`` setting."""
    reduction = getattr(loss_fn, "reduction", None)
    if reduction is None or reduction == "none":
        return loss_fn(predictions, targets)
    # Switch off the batch reduction for this call only, then restore it so
    # the caller's loss object is left unchanged.
    loss_fn.reduction = "none"
    try:
        return loss_fn(predictions, targets)
    finally:
        loss_fn.reduction = reduction


def training_step(model, target_model, optimizer, loss_fn, hyperprm, replay_buffer, q_value_log, losses):
    """Run one Double-DQN gradient step on a prioritized replay batch.

    Args:
        model: Online Q-network (updated in place).
        target_model: Target Q-network used to evaluate the next action.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Loss callable; if it has a ``reduction`` attribute (as the
            ``torch.nn`` losses do) it is evaluated per sample so that the
            importance-sampling weights can be applied element-wise.
        hyperprm: Dict providing ``batch_size`` and ``gamma``.
        replay_buffer: Buffer exposing ``sample(batch_size)`` -- returning a
            mapping with ``state``, ``action``, ``reward``, ``next_state``,
            ``done``, ``truncated``, ``weights`` and ``indices`` -- and
            ``update_priorities(indices, priorities)``.
        q_value_log: List extended with the Q-values of the taken actions.
        losses: List to which the scalar loss of this step is appended.

    Returns:
        None. Side effects: one optimizer step, refreshed priorities, and the
        two log lists are extended.
    """
    batch = replay_buffer.sample(hyperprm["batch_size"])

    # Place the batch wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def as_tensor(key, dtype=torch.float32):
        return torch.tensor(batch[key], dtype=dtype, device=device)

    states = as_tensor("state")
    actions = as_tensor("action", torch.int64)
    rewards = as_tensor("reward")
    next_states = as_tensor("next_state")
    dones = as_tensor("done")
    truncateds = as_tensor("truncated")
    is_weights = as_tensor("weights")

    with torch.no_grad():
        # Double DQN: the online net picks the next action, the target net
        # scores it.
        next_actions = model(next_states).argmax(dim=1, keepdim=True)
        # squeeze(1) keeps the batch axis even when the batch holds one item.
        max_next_q = target_model(next_states).gather(1, next_actions).squeeze(1)

        # NOTE(review): truncated transitions are treated as terminal (no
        # bootstrapping); confirm this matches the environment's semantics.
        terminal = torch.logical_or(dones.bool(), truncateds.bool()).float()
        target_q = rewards + (1 - terminal) * hyperprm["gamma"] * max_next_q

    # Q-values of the actions that were actually taken.
    selected_q = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    td_errors = selected_q - target_q

    # Scale the importance-sampling weights by their maximum *before* they
    # are used; the clamp only guards against division by zero.
    is_weights = is_weights / is_weights.max().clamp_min(1e-8)

    # Weight each sample's loss individually, then average over the batch.
    loss = (is_weights * _per_sample_loss(loss_fn, selected_q, target_q)).mean()

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
    optimizer.step()

    # New priorities are |TD error| plus a small constant so none is zero.
    new_priorities = td_errors.detach().abs().cpu().numpy() + 1e-5
    replay_buffer.update_priorities(batch["indices"], new_priorities)

    losses.append(loss.item())
    q_value_log.extend(selected_q.detach().cpu().numpy())


 ## Training Loop

In [None]:
from utils import plot_losses, plot_q_histogram


def run_single_experiment(hyperprm, timestamp,run_id):
    """Train a Double DQN agent on generated mazes, test it, and pickle the results.

    Args:
        hyperprm: Merged hyperparameter dict. Keys read directly here:
            ``maze_size``, ``gen_algorithm``, ``mini_explore_runs_per_episode``,
            ``mini_exploit_runs_per_episode``, ``replay_buffer_size``,
            ``episodes``, ``epsilon_decay_episodes``, ``epsilon_start``,
            ``epsilon_min``, ``steps_per_episode``, ``train_after``,
            ``target_dqn_update``, ``log_every`` and ``test_episodes``. The
            dict is also passed on to ``create_double_dqn_model`` and
            ``training_step``.
        timestamp: String embedded in the output folder name.
        run_id: Run index embedded in the output folder name.

    Returns:
        None. Environment snapshots are written as JSON under
        ``../mazeLogs/{timestamp}DoubleDQNMaze/Run{run_id}`` (and its ``Test``
        subfolder), and the collected rewards and report cards are saved to
        ``experiment_results.pkl`` in the run folder.
    """
    all_episode_explore_rewards  = []
    all_episode_exploit_rewards  = []
    losses = []
    report_cards = []
    q_value_log = []
    maze_generation_algorithms = maze_library.maze_generation_algorithms()
    filecount = 0
    folder_name = f"../mazeLogs/{timestamp}DoubleDQNMaze/Run{run_id}"
    env = maze_library.init_environment(hyperprm["maze_size"], hyperprm["maze_size"], 
                                    gen_algorithm=hyperprm["gen_algorithm"],
                                    mini_exploit_runs_per_episode=hyperprm["mini_exploit_runs_per_episode"], 
                                    mini_explore_runs_per_episode=hyperprm["mini_explore_runs_per_episode"])
    input_shape = env.input_shape()
    n_outputs = env.output_shape()
    os.makedirs(folder_name, exist_ok=True) 
    model,target_model, optimizer, loss_fn = create_double_dqn_model(hyperprm, input_shape, n_outputs)
    replay_buffer = PrioritizedReplayBuffer(capacity=hyperprm["replay_buffer_size"], state_shape=(input_shape,))
    print(f"Run {run_id}, Episodes: {hyperprm['episodes']}")

    # ---- Training phase ----
    for episode in range(hyperprm["episodes"]):
        obs = env.reset_and_regenerate()
        current_episode_explore_rewards = []
        current_episode_exploit_rewards = []
        # Epsilon falls linearly with the episode index and is floored at
        # epsilon_min; epsilon_decay_episodes is used as a fraction of the
        # total number of episodes.
        eps_decay = episode / (hyperprm["episodes"] * hyperprm["epsilon_decay_episodes"])
        epsilon = max(hyperprm["epsilon_start"] - eps_decay, hyperprm["epsilon_min"])
        # Each episode plays the explore mini-runs first, then the exploit ones.
        for mini_episode in range((hyperprm["mini_explore_runs_per_episode"] + hyperprm["mini_exploit_runs_per_episode"])):
            obs = env.smart_reset(mini_episode)
            for _ in range(hyperprm["steps_per_episode"]):
                # NOTE(review): the same epsilon is passed for explore and
                # exploit mini-runs; any difference between them must come
                # from the environment -- confirm.
                obs, reward, done, truncated = play_one_step(env, obs, epsilon,model, replay_buffer, mini_episode, n_outputs)
                # Rewards are bucketed by mini-run index: indices below
                # mini_explore_runs_per_episode count as "explore".
                if mini_episode < hyperprm["mini_explore_runs_per_episode"]:
                    current_episode_explore_rewards.append(reward)
                else:
                    current_episode_exploit_rewards.append(reward)

                if done or truncated:
                    break


        # NOTE(review): a single gradient step is taken per episode (not per
        # environment step), and only once the train_after fraction of the
        # episodes has passed -- confirm this is the intended update rate.
        if episode > (hyperprm["episodes"] * hyperprm["train_after"]):
            training_step(model,target_model ,optimizer, loss_fn, hyperprm, replay_buffer, q_value_log, losses)        
        # Hard sync of the target network every target_dqn_update episodes.
        if episode % hyperprm["target_dqn_update"] == 0 and episode > 0:
            target_model.load_state_dict(model.state_dict())

        report_card = maze_library.get_score(env)
        report_cards.append(report_card.to_json())
        all_episode_explore_rewards.append(np.sum(np.array(current_episode_explore_rewards)))
        all_episode_exploit_rewards.append(np.sum(np.array(current_episode_exploit_rewards)))
        # Periodically write the environment's JSON dump to the run folder.
        if episode % hyperprm["log_every"] == 0 and episode != 0:
            # print(f"""Run {run_id}, Total steps {report_card.total_steps},Average Solve Score {float(f"{report_card.average_run_score/ report_card.dijkstra_shortest_path_score:.2g}") } filecount: {filecount} 
            #       Explore Rewards sum: {float(f"{np.sum(np.array(current_episode_explore_rewards)):.2g}")} Exploit Rewards sum: {float(f"{np.sum(np.array(current_episode_exploit_rewards)):.2g}")} 
            #       Success Rate in solving {report_card.success_rate_in_exploitation} Episode: {episode}""")
            with open(f'{folder_name}/doubledqn{filecount}.json', 'w') as file:
                file.write(env.to_json_python())

            filecount += 1

    # plot_q_histogram(q_value_log)
    # plot_losses(losses)


    # ---- Test phase: greedy policy on every generation algorithm ----
    testing_maze_report_cards = defaultdict(list)
    folder_name = f"../mazeLogs/{timestamp}DoubleDQNMaze/Run{run_id}/Test"
    os.makedirs(folder_name, exist_ok=True) 


    for generation_type in maze_generation_algorithms:
        print("Starting test with", generation_type)
        filecount = 0

        for episode in range(hyperprm["test_episodes"]):
            obs = env.reset_and_regenerate(generation_type)
            for mini_episode in range((hyperprm["mini_explore_runs_per_episode"] + hyperprm["mini_exploit_runs_per_episode"])):
                obs = env.smart_reset(mini_episode)
                for _ in range(hyperprm["steps_per_episode"]):
                    # Always take the action with the highest Q-value; no
                    # exploration and no replay-buffer writes during testing.
                    with torch.no_grad():
                        state_tensor = torch.tensor(obs, dtype=torch.float32, device=DEVICE).unsqueeze(0)
                        q_values = model(state_tensor)
                        action = maze_library.create_action(int(torch.argmax(q_values).item()), mini_episode)
                        obs, _, done, truncated = env.take_action(action) 

                    if done or truncated:
                        break
            # Dump the environment every 20th test episode.
            if episode % 20 == 0 and episode != 0:                
                with open(f'{folder_name}/{generation_type}{filecount}.json', 'w') as file:
                    file.write(env.to_json_python())
            # NOTE(review): unlike the training loop, filecount is incremented
            # on every test episode, so the dump files are numbered 20, 40, ...
            # rather than 0, 1, 2, ... -- confirm whether that is intended.
            filecount += 1        
            report_card = maze_library.get_score(env)
            testing_maze_report_cards[generation_type].append(report_card.to_json())


    # Persist everything needed to analyse this run later.
    save_experiment_results_with_pickle ({
        "params": hyperprm,
        "training_explore_rewards": all_episode_explore_rewards,
        "training_exploit_rewards": all_episode_exploit_rewards,
        "training_report" : report_cards,
        "testing_maze_report_cards" : dict(testing_maze_report_cards)
    }, f"../mazeLogs/{timestamp}DoubleDQNMaze/Run{run_id}/experiment_results.pkl")  
        

In [None]:
# Driver cell: run one experiment per hyperparameter combination. All runs
# share a single timestamp, so they land in the same log folder and are told
# apart by their run id.
results = []
combo_params = list(generate_param_combinations(hyperparams))
timestamp = datetime.datetime.now().strftime("%d-%m_%H-%M")
run_count = 0
print(f"Totals combinations = {len(combo_params)}")

for i, combo in enumerate(combo_params):
    # Raise the range to repeat every combination more than once.
    for j in range(1):
        # Values from the combination take precedence over the shared ones.
        merged_params = dict(shared_params)
        merged_params.update(combo)
        run_single_experiment(merged_params, timestamp, run_id=run_count)
        run_count += 1
    