In [1]:
# imports:
!pip install gymnasium==1.0.0
!pip install ale-py
!pip install wandb
!pip install torchsummary
import gymnasium as gym
import ale_py
from gymnasium.wrappers import MaxAndSkipObservation, ResizeObservation, GrayscaleObservation, FrameStackObservation, ReshapeObservation

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torchsummary import summary

import collections

import wandb
import datetime

from PIL import Image
import time

import math



In [3]:
# version
print("Using Gymnasium version {}".format(gym.__version__))

ENV_NAME = "ALE/Breakout-v5"
test_env = gym.make(ENV_NAME, render_mode='rgb_array')

print(test_env.unwrapped.get_action_meanings())
print(test_env.observation_space.shape)

Using Gymnasium version 1.0.0
['NOOP', 'FIRE', 'RIGHT', 'LEFT']
(210, 160, 3)


In [None]:
# Source: This class domes from the class activity M3-2_Example_1a (DQN on Pong, train)
class ImageToPyTorch(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)

# Source: This class domes from the class activity M3-2_Example_1a (DQN on Pong, train)
class ScaledFloatFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0

# Source: This class has been adapted from the following github repository: # https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py
class FireResetEnv(gym.Wrapper):
    def __init__(self, env=None):
        super(FireResetEnv, self).__init__(env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def step(self, action):
        return self.env.step(action)

    def reset(self, **kwargs):
        
        obs, info = self.env.reset(**kwargs)
        obs, _, terminated, truncated, _ = self.env.step(1)
        if terminated or truncated:
            obs, info = self.env.reset(**kwargs)
        obs, _, terminated, truncated, _ = self.env.step(2)
        if terminated or truncated:
            obs, info = self.env.reset(**kwargs)
        return obs, info

# Source: This class has been adapted from the following github repository: # https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py
class EpisodicLifeEnv(gym.Wrapper):
    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done  = True

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.was_real_done = terminated or truncated
        # check current lives, make loss of life terminal,
        # then update lives to handle bonus lives
        lives = self.env.unwrapped.ale.lives()
        if lives < self.lives and lives > 0:
            # for Qbert sometimes we stay in lives == 0 condtion for a few frames
            # so its important to keep lives > 0, so that we only reset once
            # the environment advertises done.
            terminated = True
            truncated = True
        self.lives = lives
        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if self.was_real_done:
            obs, info = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _, info = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs, info

def make_env(env_name):
    env = gym.make(env_name, render_mode='rgb_array')
    print("Standard Env.        : {}".format(env.observation_space.shape))
    env = MaxAndSkipObservation(env, skip=4)
    print("MaxAndSkipObservation: {}".format(env.observation_space.shape))
    env = FireResetEnv(env)
    env = ResizeObservation(env, (84, 84))
    print("ResizeObservation    : {}".format(env.observation_space.shape))
    env = GrayscaleObservation(env, keep_dim=True)
    print("GrayscaleObservation : {}".format(env.observation_space.shape))
    env = ImageToPyTorch(env)
    print("ImageToPyTorch       : {}".format(env.observation_space.shape))
    env = ReshapeObservation(env, (84, 84))
    print("ReshapeObservation   : {}".format(env.observation_space.shape))
    env = FrameStackObservation(env, stack_size=4)
    print("FrameStackObservation: {}".format(env.observation_space.shape))
    env = ScaledFloatFrame(env)
    print("ScaledFloatFrame     : {}".format(env.observation_space.shape))
    #env = EpisodicLifeEnv(env)

    return env

env=make_env(ENV_NAME)

Standard Env.        : (210, 160, 3)
MaxAndSkipObservation: (210, 160, 3)
ResizeObservation    : (84, 84, 3)
GrayscaleObservation : (84, 84, 1)
ImageToPyTorch       : (1, 84, 84)
ReshapeObservation   : (84, 84)
FrameStackObservation: (4, 84, 84)
ScaledFloatFrame     : (4, 84, 84)


In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Source: This class comes from the following github repository: https://github.com/SimonNick/rainbow/blob/master/model.py
class NoisyLinear(nn.Module):
    # This class is a linear layer with added noise
    def __init__(self, in_features, out_features, sigma_init):
        super(NoisyLinear, self).__init__()
        
        self.in_features = in_features
        self.out_features = out_features 
        self.sigma_init = sigma_init

        # These are the mean parameters for weights and biases
        self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features))

        self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
        self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
        self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))

        # This is a temporary buffer for sampling noise
        self.register_buffer('sample_weight_in', torch.FloatTensor(in_features))
        self.register_buffer('sample_weight_out', torch.FloatTensor(out_features))
        self.register_buffer('sample_bias_out', torch.FloatTensor(out_features))

        # Her we initialize the parameters ang generate initial noise
        self.reset_parameters()
        self.sample_noise()
    
    def forward(self, x):
        if self.training:
            # During training, add noise to weights and biases
            weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon)
            bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon)
        else:
            # During evaluation, use the deterministic weights and biases
            weight = self.weight_mu
            bias = self.bias_mu

        # We perform the linear transformation
        return F.linear(x, weight, bias)
    
    def reset_parameters(self):

        # Range for uniform initialization of the mean parameters
        mu_range = 1 / math.sqrt(self.weight_mu.size(1))

        # Here we initialize weights and biases with a uniform distribution
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.weight_sigma.size(1)))

        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.bias_sigma.size(0)))

    def sample_noise(self):

        # We generate noise for the input and output dimensions of the weights
        self.sample_weight_in = self._scale_noise(self.sample_weight_in)
        self.sample_weight_out = self._scale_noise(self.sample_weight_out)
        self.sample_bias_out = self._scale_noise(self.sample_bias_out)

        # We compute element-wise noise for weights and biases
        self.weight_epsilon.copy_(self.sample_weight_out.ger(self.sample_weight_in))
        self.bias_epsilon.copy_(self.sample_bias_out)
    
    def _scale_noise(self, x):

        # We generate Gaussian noise
        x = x.normal_()
        
        # Here we apply scaling: sign(x) * sqrt(abs(x)). This scaling ensures noise has zero mean and unit variance
        x = x.sign().mul(x.abs().sqrt())
        return x

# Source: This code is adapted form the following github repository: https://github.com/dxyang/DQN_pytorch/blob/master/model.py
class DuelingDQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DuelingDQN, self).__init__()
        self.num_actions = num_actions

        # These are the common feature extraction layers
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # These are the advantage stream layers
        self.advantage = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions)
        )

        # These are the value stream
        self.value = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, x):
        # First, we extract the features
        features = self.feature_extractor(x)

        # Then, we compute advantage and value streams
        adv = self.advantage(features)
        val = self.value(features).expand(x.size(0), self.num_actions)

        # Finally, we combine streams into Q-values
        q_values = val + adv - adv.mean(dim=1, keepdim=True)
        return q_values

class NoisyDuelingDQN(nn.Module):
    def __init__(self, input_shape, num_actions, sigma_init=0.5):
        super(NoisyDuelingDQN, self).__init__()
        self.num_actions = num_actions

        # These are the common feature extraction layers
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # These are the advantage stream layers with noisy layers
        self.advantage = nn.Sequential(
            NoisyLinear(64 * 7 * 7, 512, sigma_init),
            nn.ReLU(),
            NoisyLinear(512, num_actions, sigma_init)
        )

        # These are the value stream with noisy layers
        self.value = nn.Sequential(
            NoisyLinear(64 * 7 * 7, 512, sigma_init),
            nn.ReLU(),
            NoisyLinear(512, 1, sigma_init)
        )

    def forward(self, x):
        # First, we extract the features
        features = self.feature_extractor(x)

        # Then, we compute advantage and value streams
        adv = self.advantage(features)
        val = self.value(features).expand(x.size(0), self.num_actions)

        # Finally, we combine streams into Q-values
        q_values = val + adv - adv.mean(dim=1, keepdim=True)
        return q_values

    def reset_noise(self):
        # Here we reset noise for all noisy layers
        for layer in self.children():
            if isinstance(layer, NoisyLinear):
                layer.sample_noise()
            elif isinstance(layer, nn.Sequential):
                for sub_layer in layer:
                    if isinstance(sub_layer, NoisyLinear):
                        sub_layer.sample_noise()

In [None]:
MEAN_REWARD_BOUND = 700         # Max is 864
NUMBER_OF_REWARDS_TO_AVERAGE = 10

GAMMA = 0.99

BATCH_SIZE = 32
LEARNING_RATE = 0.0001

EXPERIENCE_REPLAY_SIZE = 50000
SYNC_TARGET_NETWORK = 1000

EPS_START = 1.0
EPS_DECAY = 0.999985
EPS_MIN = 0.05

INITIAL_BETA=0.4

In [None]:
# Source: This code comes from the class activity M3-2_Example_1a (DQN on Pong, train)
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceReplay:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, BATCH_SIZE):
        indices = np.random.choice(len(self.buffer), BATCH_SIZE, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])

        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), np.array(next_states)
    

# Source: Code adapted from the following github repository: https://github.com/the-computer-scientist/OpenAIGym/blob/master/PrioritizedExperienceReplayInOpenAIGym.ipynb
class PrioritizedExperienceReplayBuffer:
    def __init__(self, capacity, eps=0.001, alpha=0.6, beta=INITIAL_BETA):
        self.buffer = collections.deque(maxlen=capacity)

        # To make add priority to the experiences we add new attributes to the class
        self.priorities = collections.deque(maxlen=capacity) # This indicates the priorities of the experiences
        self.eps = eps  # This is a small constant to ensure no zero priority
        self.alpha = alpha  # This is an exponent for scaling priorities
        self.beta = beta  # This is and exponent for importance sampling adjustment

    def __len__(self):
        return len(self.buffer)

    # This function adds a new experience to the buffer with max priority
    def append(self, experience):
        self.buffer.append(experience)
        max_priority = max(self.priorities, default=1.0)
        self.priorities.append(max_priority)

    # This function calculates sampling probabilities for the buffer
    def _get_probabilities(self):
        scaled_priorities = np.array(self.priorities) ** self.alpha
        return scaled_priorities / scaled_priorities.sum()

    # This function calculates importance-sampling weights
    def _get_importance(self, probabilities):
        importance = ((1 / len(self.buffer)) * (1 / probabilities)) ** self.beta
        importance_normalized = importance / importance.max()
        return importance_normalized

    # This function samples a batch of experiences from the buffer and returns the batch, importance weights, and indices for priority updates
    def sample(self, batch_size):
        sample_size = min(len(self.buffer), batch_size)
        sample_probs = self._get_probabilities()
        sample_indices = np.random.choice(len(self.buffer), size=sample_size, p=sample_probs)

        experiences = [self.buffer[idx] for idx in sample_indices]
        importance = self._get_importance(sample_probs[sample_indices])

        states, actions, rewards, dones, next_states = zip(*experiences)

        return (np.array(states), np.array(actions), np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8), np.array(next_states)), importance, sample_indices

    # This function updates priorities for the given indices using the errors and adds a small epsilon to ensure no priority is zero.
    def update_priorities(self, indices, errors):
        for idx, error in zip(indices, errors):
            self.priorities[idx] = (abs(error) + self.eps)

In [None]:
# Source: This code comes from the class activity M3-2_Example_1a (DQN on Pong, train)
class DQNAgent:
    def __init__(self, env, exp_replay_buffer):
        self.env = env
        self.exp_replay_buffer = exp_replay_buffer
        self._reset()

    def _reset(self):
        self.current_state = self.env.reset()[0]
        self.total_reward = 0.0

    def step(self, net, device="cpu"):
        done_reward = None
        state_ = np.array([self.current_state])
        state = torch.tensor(state_).to(device)
        q_vals = net(state)
        _, act_ = torch.max(q_vals, dim=1)
        action = int(act_.item())

        new_state, reward, terminated, truncated, _ = self.env.step(action)
        is_done = terminated or truncated
        self.total_reward += reward

        clipped_reward=np.sign(reward) # Here we apply reward clipping

        exp = Experience(self.current_state, action, clipped_reward, is_done, new_state)
        self.exp_replay_buffer.append(exp)
        self.current_state = new_state

        if is_done:
            done_reward = self.total_reward
            self._reset()

        return done_reward

In [None]:
# login
wandb.login()

# start a new wandb run to track this script
wandb.init(
    project="Part1_DQN",
    config={
        "gamma": GAMMA,
        "learning_rate": LEARNING_RATE,
        "eps_start": EPS_START,
        "eps_decay": EPS_DECAY,
        "expereince_replay_size": EXPERIENCE_REPLAY_SIZE,
        "sync_target_network": SYNC_TARGET_NETWORK
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marroch35[0m ([33marroch35-organitzation[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
print(">>> Training starts at ",datetime.datetime.now())

>>> Training starts at  2024-12-06 14:00:04.318956


In [None]:
net = NoisyDuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
target_net = NoisyDuelingDQN(env.observation_space.shape, env.action_space.n).to(device)

buffer = ExperienceReplay(EXPERIENCE_REPLAY_SIZE)
agent = DQNAgent(env, buffer)

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
losses = []
step_number = 0
spected_min_reward=30


while True:
    step_number += 1

    # Reset noise after every training step
    net.reset_noise()
    target_net.reset_noise()

    reward = agent.step(net, device=device)
    if reward is not None:

        total_rewards.append(reward)

        mean_reward = np.mean(total_rewards[-NUMBER_OF_REWARDS_TO_AVERAGE:])
        
        print(f"Step:{step_number} | Total games:{len(total_rewards)} | Mean reward: {mean_reward:.3f}  ") 
        wandb.log({"reward_mean": mean_reward, "reward": reward}, step=step_number)
        
        # Every time we achieve a spected reward we save the current model, and we save the next every time the mean reward increases 5 points 
        if mean_reward > spected_min_reward:
            name="Part1_DQN_"+str(int(mean_reward))
            torch.save(net.state_dict(), f"{name}.dat")
            print("New best model saved.")
            spected_min_reward +=5

        if mean_reward > MEAN_REWARD_BOUND:
            print(f"SOLVED in {step_number} steps and {len(total_rewards)} games")
            break

    if len(buffer) < EXPERIENCE_REPLAY_SIZE:
        continue

    states_, actions_, rewards_, dones_, next_states_ = buffer.sample(BATCH_SIZE)

    states = torch.tensor(states_).to(device)
    next_states = torch.tensor(next_states_).to(device)
    actions = torch.tensor(actions_).to(device)
    rewards = torch.tensor(rewards_).to(device)
    dones = torch.BoolTensor(dones_).to(device)

    Q_values = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Here we get the actions selected by the policy network
    policy_actions = net(next_states).argmax(1).unsqueeze(1)  # Shape: [batch_size, 1]

    # Here we use the target network to compute the value of those actions
    next_state_values = target_net(next_states).gather(1, policy_actions).squeeze(1) # Doble DQN
    next_state_values[dones] = 0.0
    next_state_values = next_state_values.detach()

    expected_Q_values = next_state_values * GAMMA + rewards

    loss = nn.MSELoss()(Q_values, expected_Q_values)

    losses.append(loss.item())
    mean_losses = np.mean(losses[-NUMBER_OF_REWARDS_TO_AVERAGE:])
    wandb.log({"loss_mean": mean_losses, "loss": loss.item()}, step=step_number)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step_number % SYNC_TARGET_NETWORK == 0:
        target_net.load_state_dict(net.state_dict())

Step:70 | Total games:1 | Mean reward: 4.000  
Step:130 | Total games:2 | Mean reward: 3.000  
Step:188 | Total games:3 | Mean reward: 2.667  
Step:264 | Total games:4 | Mean reward: 2.750  
Step:308 | Total games:5 | Mean reward: 2.400  
Step:352 | Total games:6 | Mean reward: 2.167  
Step:401 | Total games:7 | Mean reward: 2.143  
Step:465 | Total games:8 | Mean reward: 2.250  
Step:509 | Total games:9 | Mean reward: 2.111  
Step:591 | Total games:10 | Mean reward: 2.300  
Step:667 | Total games:11 | Mean reward: 2.300  
Step:716 | Total games:12 | Mean reward: 2.200  
Step:747 | Total games:13 | Mean reward: 2.000  
Step:790 | Total games:14 | Mean reward: 1.800  
Step:860 | Total games:15 | Mean reward: 2.100  
Step:904 | Total games:16 | Mean reward: 2.100  
Step:948 | Total games:17 | Mean reward: 2.000  
Step:1013 | Total games:18 | Mean reward: 2.000  
Step:1098 | Total games:19 | Mean reward: 2.200  
Step:1150 | Total games:20 | Mean reward: 2.000  
Step:1206 | Total games:21 

KeyboardInterrupt: 

In [None]:
torch.save(net.state_dict(), "Part1_DQN.dat")

In [None]:
print(">>> Training ends at ",datetime.datetime.now())

In [33]:
# Finish the wandb run, necessary in notebooks
wandb.finish()
print("")

VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
reward,▅▄▅▂▅▃▄▂▃▂▃▁▁▃▁▃▃▁▁▂▁▂▂▂▂▁▁▃▃▅▃▃█▂▂▁▂▂▁▃
reward_mean,█▇▆▅▅▇▅▄▃▄▄▄▄▄▄▄▅▃▃▄▅▅▅▅▄▄▄▃▅▄▇██▅▅▄▃▃▁▃

0,1
reward,1.0
reward_mean,1.4





In [None]:
model=NoisyDuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
model.load_state_dict(torch.load("Part1_DQN_35.dat", map_location=torch.device(device))) 

  model.load_state_dict(torch.load("../models/Part1_DQN_35.dat", map_location=torch.device('cpu')))


<All keys matched successfully>

In [None]:
# This function is used to test the performance of the agent. 
# A part from this, it also saves the best episode to later generate a video
def test_agent(model, env, num_episodes=100):
    total_rewards = []
    
    best_reward=30
    best_episode=[]
    for episode in range(num_episodes):
        state = env.reset()[0]
        done = False
        episode_reward = 0
        images=[]
        while not done:
            img = env.render()
            images.append(Image.fromarray(img))
            state_ = torch.tensor(np.array([state], copy=False)).to(device)
            with torch.no_grad():
                q_vals = model(state_).data.cpu().numpy()[0]
            action = np.argmax(q_vals)

            next_state, reward, terminated, truncated, _ = env.step(action)
            state = next_state
            episode_reward+=reward
            done = terminated or truncated

        # If the current episode's reward exceeds the best reward so far, update the best episode
        if(episode_reward>best_reward):
            best_episode=images
            best_reward=episode_reward
            

        total_rewards.append(episode_reward)
        print(f" Episode: {episode} | Total reward: {episode_reward:.3f}")

    avg_reward = sum(total_rewards) / num_episodes
    print(f"Test Results: Average Reward over {num_episodes} episodes: {avg_reward:.3f}")

    return best_episode

In [24]:
images=test_agent(model, env)

 Episode: 0 | Total reward: 14.000
 Episode: 1 | Total reward: 20.000
 Episode: 2 | Total reward: 32.000
 Episode: 3 | Total reward: 30.000
 Episode: 4 | Total reward: 17.000
 Episode: 5 | Total reward: 37.000
 Episode: 6 | Total reward: 34.000
 Episode: 7 | Total reward: 50.000
 Episode: 8 | Total reward: 18.000
 Episode: 9 | Total reward: 36.000
 Episode: 10 | Total reward: 14.000
 Episode: 11 | Total reward: 17.000
 Episode: 12 | Total reward: 24.000
 Episode: 13 | Total reward: 26.000
 Episode: 14 | Total reward: 32.000
 Episode: 15 | Total reward: 18.000
 Episode: 16 | Total reward: 23.000
 Episode: 17 | Total reward: 41.000
 Episode: 18 | Total reward: 28.000
 Episode: 19 | Total reward: 11.000
 Episode: 20 | Total reward: 27.000
 Episode: 21 | Total reward: 31.000
 Episode: 22 | Total reward: 39.000
 Episode: 23 | Total reward: 24.000
 Episode: 24 | Total reward: 16.000
 Episode: 25 | Total reward: 30.000
 Episode: 26 | Total reward: 19.000
 Episode: 27 | Total reward: 33.000
 E

In [None]:

gif_file = "video_noisy4.gif"

images[0].save(gif_file, save_all=True, append_images=images[1:], duration=160, loop=0)

print("Episode export to '{}'".format(gif_file))

Episode export to '../data/video_noisy4.gif'
