In [37]:
# imports:
!pip install gymnasium==1.0.0
!pip install ale-py
!pip install wandb
!pip install torchsummary
import gymnasium as gym
import ale_py
from gymnasium.wrappers import MaxAndSkipObservation, ResizeObservation, GrayscaleObservation, FrameStackObservation, ReshapeObservation

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary

import collections

import wandb
import datetime



In [38]:
# Report the installed Gymnasium version.
print("Using Gymnasium version {}".format(gym.__version__))

# Build an unwrapped probe environment to inspect the raw action set and frame shape.
ENV_NAME = "ALE/Breakout-v5"
test_env = gym.make(ENV_NAME, render_mode='rgb_array')

print(
    test_env.unwrapped.get_action_meanings(),
    test_env.observation_space.shape,
    sep="\n",
)

Using Gymnasium version 1.0.0
['NOOP', 'FIRE', 'RIGHT', 'LEFT']
(210, 160, 3)


In [39]:
# Source: M3-2_Example_1a (DQN on Pong, train)
class ImageToPyTorch(gym.ObservationWrapper):
    """Reorder image observations from channels-last to channels-first.

    The wrapped environment's observation shape (H, W, C) is advertised as
    (C, H, W), and every observation has its axis 2 moved to position 0.
    """

    def __init__(self, env):
        super().__init__(env)
        src_shape = self.observation_space.shape
        channels_first = (src_shape[-1], src_shape[0], src_shape[1])
        # NOTE(review): the space is declared as float32 in [0, 1], but
        # observation() below only reorders axes; it neither casts nor rescales.
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=channels_first,
            dtype=np.float32,
        )

    def observation(self, observation):
        return np.moveaxis(observation, source=2, destination=0)


class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to float32 and divide them by 255.

    The wrapper also declares the matching observation space (float32 values
    in [0, 1], same shape as the wrapped environment) so that consumers of
    ``env.observation_space`` see what ``observation()`` actually returns.
    """

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=env.observation_space.shape,
            dtype=np.float32,
        )

    def observation(self, obs):
        # asarray with an explicit dtype converts in a single pass instead of
        # copying once for np.array() and again for astype().
        return np.asarray(obs, dtype=np.float32) / 255.0


def make_env(env_name):
    """Create `env_name` and apply the preprocessing wrappers in order.

    After the base environment is created, each wrapper is applied in turn and
    the resulting observation shape is printed. The wrapped environment is
    returned.
    """
    # (message template, wrapper factory) pairs, applied top to bottom.
    # A FireResetEnv wrapper was previously considered after MaxAndSkipObservation
    # but is disabled.
    stages = (
        ("MaxAndSkipObservation: {}", lambda e: MaxAndSkipObservation(e, skip=4)),
        ("ResizeObservation    : {}", lambda e: ResizeObservation(e, (84, 84))),
        ("GrayscaleObservation : {}", lambda e: GrayscaleObservation(e, keep_dim=True)),
        ("ImageToPyTorch       : {}", lambda e: ImageToPyTorch(e)),
        ("ReshapeObservation   : {}", lambda e: ReshapeObservation(e, (84, 84))),
        ("FrameStackObservation: {}", lambda e: FrameStackObservation(e, stack_size=4)),
        ("ScaledFloatFrame     : {}", lambda e: ScaledFloatFrame(e)),
    )

    env = gym.make(env_name, render_mode='rgb_array')
    print("Standard Env.        : {}".format(env.observation_space.shape))

    for message, wrap in stages:
        env = wrap(env)
        print(message.format(env.observation_space.shape))

    return env

env = make_env(ENV_NAME)

Standard Env.        : (210, 160, 3)
MaxAndSkipObservation: (210, 160, 3)
ResizeObservation    : (84, 84, 3)
GrayscaleObservation : (84, 84, 1)
ImageToPyTorch       : (1, 84, 84)
ReshapeObservation   : (84, 84)
FrameStackObservation: (4, 84, 84)
ScaledFloatFrame     : (4, 84, 84)


In [40]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [41]:
# Source: This code is adapted from the following GitHub repository: https://github.com/dxyang/DQN_pytorch/blob/master/model.py
class DuelingDQN(nn.Module):
    """Dueling Q-network with a shared convolutional encoder.

    The encoder output feeds two fully connected heads: a scalar state value
    V(s) and a per-action advantage A(s, a). They are combined as
    Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).

    Args:
        input_shape: observation shape as (channels, height, width).
        num_actions: number of discrete actions, i.e. the width of the output.
    """

    def __init__(self, input_shape, num_actions):
        super(DuelingDQN, self).__init__()
        self.num_actions = num_actions

        # Common feature extraction
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # Infer the flattened feature size from a dummy forward pass instead of
        # hard-coding 64 * 7 * 7, which is only correct for 84x84 inputs.
        with torch.no_grad():
            probe = torch.zeros(1, *input_shape)
            num_features = self.feature_extractor(probe).shape[1]

        # Advantage stream
        self.advantage = nn.Sequential(
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions)
        )

        # Value stream
        self.value = nn.Sequential(
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for a batch of observations."""
        features = self.feature_extractor(x)

        adv = self.advantage(features)
        # Shape (batch, 1); broadcasting spreads it across the action dimension.
        val = self.value(features)

        # Subtracting the mean advantage keeps the V/A decomposition identifiable.
        return val + adv - adv.mean(dim=1, keepdim=True)

In [None]:
# Training stops once the mean reward over the last
# NUMBER_OF_REWARDS_TO_AVERAGE episodes exceeds this bound.
MEAN_REWARD_BOUND = 100         # Max is 864
NUMBER_OF_REWARDS_TO_AVERAGE = 10  # window size for the running means of reward and loss

GAMMA = 0.99  # discount factor applied to the bootstrapped next-state value

BATCH_SIZE = 32  # transitions sampled from the replay buffer per optimisation step
LEARNING_RATE = 0.0001  # learning rate passed to the Adam optimizer

EXPERIENCE_REPLAY_SIZE = 50000  # replay buffer capacity; the training loop only starts learning once the buffer is full
SYNC_TARGET_NETWORK = 1000  # step interval at which the online weights are copied into the target network

EPS_START = 1.0  # initial epsilon for epsilon-greedy exploration
EPS_DECAY = 0.999985  # multiplicative epsilon decay, applied on every second step of the training loop
EPS_MIN = 0.05  # lower bound for epsilon

# Initial beta (importance-sampling exponent) for prioritized replay.
# NOTE(review): the uniform ExperienceReplay buffer defined in this notebook has
# no beta attribute; confirm whether this constant is still needed.
INITIAL_BETA=0.4

In [None]:
# One environment transition as stored in the replay buffer.
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceReplay:
    """Fixed-capacity FIFO buffer of `Experience` tuples with uniform sampling."""

    def __init__(self, capacity):
        # A bounded deque silently evicts the oldest transition once full.
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, BATCH_SIZE):
        """Draw `BATCH_SIZE` distinct transitions uniformly at random.

        Returns a 5-tuple of arrays: states, actions, rewards (float32),
        dones (uint8) and next states, each stacked along axis 0.
        """
        picked = np.random.choice(len(self.buffer), BATCH_SIZE, replace=False)
        batch = Experience(*zip(*(self.buffer[i] for i in picked)))

        return (
            np.array(batch.state),
            np.array(batch.action),
            np.array(batch.reward, dtype=np.float32),
            np.array(batch.done, dtype=np.uint8),
            np.array(batch.new_state),
        )

In [None]:
class DQNAgent:
    """Epsilon-greedy actor that plays one environment and fills a replay buffer.

    Args:
        env: environment exposing the Gymnasium `reset()` / `step()` API and an
            `action_space` with a `sample()` method.
        exp_replay_buffer: object with an `append(experience)` method.
    """

    def __init__(self, env, exp_replay_buffer):
        self.env = env
        self.exp_replay_buffer = exp_replay_buffer
        self._reset()

    def _reset(self):
        """Start a new episode and clear the running (unclipped) episode return."""
        self.current_state, _ = self.env.reset()
        self.total_reward = 0.0

    def step(self, net, epsilon=0.0, device="cpu"):
        """Take one environment step and store the transition.

        Args:
            net: callable mapping a batch of states to per-action Q-values.
            epsilon: probability of choosing a random action.
            device: device on which the state tensor is placed for `net`.

        Returns:
            The unclipped episode return if this step ended the episode,
            otherwise None.
        """
        if np.random.random() < epsilon:
            # Sample from the agent's own environment rather than a global `env`.
            action = self.env.action_space.sample()
        else:
            # Action selection is pure inference; no autograd graph is needed.
            with torch.no_grad():
                state = torch.as_tensor(np.asarray(self.current_state), device=device).unsqueeze(0)
                action = int(net(state).argmax(dim=1).item())

        new_state, reward, terminated, truncated, _ = self.env.step(action)
        is_done = terminated or truncated
        self.total_reward += reward

        # Clip the reward to its sign (-1, 0 or +1) for training stability, as in
        # DeepMind's DQN paper; the episode return above stays unclipped.
        clipped_reward = float(np.sign(reward))

        exp = Experience(self.current_state, action, clipped_reward, is_done, new_state)
        self.exp_replay_buffer.append(exp)
        self.current_state = new_state

        if not is_done:
            return None

        done_reward = self.total_reward
        self._reset()
        return done_reward

In [45]:
# Authenticate with Weights & Biases.
wandb.login()

# Hyperparameters stored alongside the run.
run_config = {
    "gamma": GAMMA,
    "learning_rate": LEARNING_RATE,
    "eps_start": EPS_START,
    "eps_decay": EPS_DECAY,
    "expereince_replay_size": EXPERIENCE_REPLAY_SIZE,
    "sync_target_network": SYNC_TARGET_NETWORK,
}

# Open a new run that the training loop logs its metrics to.
wandb.init(project="Part1_DQN", config=run_config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marroch35[0m ([33marroch35-organitzation[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [46]:
# Print the wall-clock time at which training begins.
print(">>> Training starts at ",datetime.datetime.now())

>>> Training starts at  2024-12-01 23:08:38.294099


In [None]:
# Online (policy) network and target network share the same architecture.
net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
target_net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
# Start the target as an exact copy of the online network so the earliest
# updates do not bootstrap from an unrelated random initialisation.
target_net.load_state_dict(net.state_dict())

# Uniform experience replay. Prioritized replay is not used here, so there is
# no beta schedule and there are no importance weights.
buffer = ExperienceReplay(EXPERIENCE_REPLAY_SIZE)
agent = DQNAgent(env, buffer)

epsilon = EPS_START
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
loss_fn = nn.MSELoss()
total_rewards = []
losses = []
step_number = 0
# Mean-reward threshold for the next checkpoint; raised by 5 after each save.
spected_min_reward=20
steps_best_model_obtained=None
spected_min_reward_achieved=False


while True:
    step_number += 1
    # Epsilon decays on every second environment step until it reaches EPS_MIN.
    if step_number % 2 == 0:
        epsilon = max(epsilon * EPS_DECAY, EPS_MIN)

    # Play one step; a non-None value is the return of an episode that just ended.
    reward = agent.step(net, epsilon, device=device)
    if reward is not None:
        total_rewards.append(reward)

        mean_reward = np.mean(total_rewards[-NUMBER_OF_REWARDS_TO_AVERAGE:])

        print(f"Step:{step_number} | Total games:{len(total_rewards)} | Mean reward: {mean_reward:.3f}  (epsilon used: {epsilon:.2f})")
        wandb.log({"epsilon": epsilon, "reward_mean": mean_reward, "reward": reward}, step=step_number)

        # Checkpoint every time the running mean clears the current threshold.
        if mean_reward > spected_min_reward:
            name="Part1_DQN_"+str(int(mean_reward))
            torch.save(net.state_dict(), f"../models/{name}.dat")
            print("New best model saved.")
            spected_min_reward_achieved=True
            spected_min_reward +=5
            steps_best_model_obtained=step_number

        # Give up if no new checkpoint was produced in the last 100000 steps.
        if spected_min_reward_achieved and (step_number > steps_best_model_obtained + 100000):
            print("Model stacked, not learning.")
            print("Stopping the execution.")
            break

        if mean_reward > MEAN_REWARD_BOUND:
            print(f"SOLVED in {step_number} steps and {len(total_rewards)} games")
            break

    # Learning starts only once the replay buffer has been completely filled.
    if len(buffer) < EXPERIENCE_REPLAY_SIZE:
        continue

    states_, actions_, rewards_, dones_, next_states_ = buffer.sample(BATCH_SIZE)

    states = torch.as_tensor(states_, device=device)
    next_states = torch.as_tensor(next_states_, device=device)
    # gather() needs int64 indices regardless of the platform's default int.
    actions = torch.as_tensor(actions_, dtype=torch.int64, device=device)
    rewards = torch.as_tensor(rewards_, device=device)
    dones = torch.as_tensor(dones_, dtype=torch.bool, device=device)

    # Q(s, a) of the online network for the actions that were actually taken.
    Q_values = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Double DQN target: the online network picks the next action and the target
    # network evaluates it. Targets are constants, so no gradient is tracked.
    with torch.no_grad():
        policy_actions = net(next_states).argmax(dim=1, keepdim=True)  # Shape: [batch_size, 1]
        next_state_values = target_net(next_states).gather(1, policy_actions).squeeze(1)
        next_state_values[dones] = 0.0  # no bootstrapping past the end of an episode
        expected_Q_values = next_state_values * GAMMA + rewards

    loss = loss_fn(Q_values, expected_Q_values)

    losses.append(loss.item())
    mean_losses = np.mean(losses[-NUMBER_OF_REWARDS_TO_AVERAGE:])
    wandb.log({"loss_mean": mean_losses, "loss": loss.item()}, step=step_number)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step_number % SYNC_TARGET_NETWORK == 0:
        target_net.load_state_dict(net.state_dict())



Step:58 | Total games:1 | Mean reward: 1.000  (epsilon used: 1.00)
Step:108 | Total games:2 | Mean reward: 1.000  (epsilon used: 1.00)
Step:178 | Total games:3 | Mean reward: 1.667  (epsilon used: 1.00)
Step:229 | Total games:4 | Mean reward: 1.750  (epsilon used: 1.00)
Step:283 | Total games:5 | Mean reward: 1.800  (epsilon used: 1.00)
Step:326 | Total games:6 | Mean reward: 1.667  (epsilon used: 1.00)
Step:387 | Total games:7 | Mean reward: 1.571  (epsilon used: 1.00)
Step:442 | Total games:8 | Mean reward: 1.625  (epsilon used: 1.00)
Step:510 | Total games:9 | Mean reward: 1.444  (epsilon used: 1.00)
Step:578 | Total games:10 | Mean reward: 1.600  (epsilon used: 1.00)
Step:652 | Total games:11 | Mean reward: 1.700  (epsilon used: 1.00)
Step:696 | Total games:12 | Mean reward: 1.600  (epsilon used: 0.99)
Step:770 | Total games:13 | Mean reward: 1.600  (epsilon used: 0.99)
Step:838 | Total games:14 | Mean reward: 1.600  (epsilon used: 0.99)
Step:923 | Total games:15 | Mean reward: 1.7

KeyboardInterrupt: 

In [None]:
# Save the current weights of `net` (its state_dict only, not the full module).
# NOTE(review): the path is relative; presumably ../models must already exist.
torch.save(net.state_dict(), "../models/Part1_DQN.dat")

In [None]:
# Print the wall-clock time at which training finished, followed by two blank lines.
print(">>> Training ends at ", datetime.datetime.now())
print("", "", sep="\n")

In [48]:
# Close the wandb run opened by wandb.init(); in a notebook the run must be
# finished explicitly.
wandb.finish()

VBox(children=(Label(value='0.113 MB of 0.113 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epsilon,█▇▇▇▇▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁
loss,▁▁▂▂▂▂▂▂▂▁▂▂▅▄▂▂▂▁▁▂▃▄▃▂▂▃▂▄▄▃▄▃▆▂▂▅▃▃█▅
loss_mean,▃▃▂▁▁▂▁▂▂▃▁▁▂▁▂▂▂▃▂▂▂▂▂▅▃▃▄▄▃▃▂▅█▃▃▄▄▄▇▄
reward,▂▂▂▂▂▂▂▂▅▄▁▂▂▂▂▂▂▃▄▃▂▂▂▅▁▂▃▆▃▅▃▅▆█▄▃▄▅▅▄
reward_mean,▂▂▂▂▂▂▁▂▁▂▂▂▂▂▁▂▂▂▁▂▂▂▃▂▃▂▃▄▅▅▆▆▆▄▆▆██▇▆

0,1
epsilon,0.44886
loss,0.00983
loss_mean,0.02208
reward,4.0
reward_mean,5.0


In [None]:

# Rebuild the network architecture and restore a saved checkpoint for evaluation.
model = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
model.eval()
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, and avoids executing arbitrary pickled code.
model.load_state_dict(torch.load("../models/Part1_DQN_15.dat", map_location=device, weights_only=True))



  model.load_state_dict(torch.load("../models/Part1_DQN_15.dat", map_location=torch.device('cpu')))


<All keys matched successfully>

In [None]:
from PIL import Image
import time
# params
visualize = True  # when True, every rendered frame is kept for the GIF export
images = []

state, _ = env.reset()
total_reward = 0.0

# Greedy rollout of one episode with the loaded model.
model.eval()
with torch.no_grad():
    while True:
        if visualize:
            img = env.render()
            images.append(Image.fromarray(img))

        # Add a batch dimension and place the input on the same device as the
        # model. np.asarray replaces np.array(..., copy=False), which raises
        # under NumPy 2 whenever a copy is required.
        state_ = torch.as_tensor(np.asarray(state), dtype=torch.float32, device=device).unsqueeze(0)
        q_vals = model(state_).cpu().numpy()[0]
        action = int(np.argmax(q_vals))

        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward
        if done:
            break

print("Total reward: %.2f" % total_reward)

Total reward: 13.00


In [None]:
# params
gif_file = "video.gif"

# Write the collected frames as an animated GIF: the first frame is the base
# image and the remaining frames are appended to it.
# duration is the number of milliseconds between frames; 60 ms per frame is
# roughly 16.7 frames per second.
images[0].save(gif_file, save_all=True, append_images=images[1:], duration=60, loop=0)

print("Episode export to '{}'".format(gif_file))

Episode export to 'video.gif'
