# Reinforcement Learning / Tensorflow - TF_Agents 

El aprendizaje por refuerzo (RL) es uno de los campos más antiguos del aprendizaje automático. Ha existido desde la década de 1950 y ha producido muchas aplicaciones interesantes a lo largo de los años.

<br />
<img src='https://es.mathworks.com/help///reinforcement-learning/ug/reinforcement_learning_diagram.png' width='300' />

*"El aprendizaje por refuerzo se diferencia del aprendizaje supervisado en que no requiere la presentación de pares de entrada/salida etiquetados y no requiere que se corrijan explícitamente acciones subóptimas. En cambio, la atención se centra en encontrar un equilibrio entre la exploración (de territorio desconocido) y la explotación (del conocimiento actual).."* [wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning)

In [1]:
!pip install tensorflow==2.13.0
!pip install tf-agents[reverb]
!pip install gymnasium[atari]==0.29.0
!pip install autorom[accept-rom-license]==0.6.1
!AutoROM --accept-license


Collecting tensorflow==2.13.0
  Downloading tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.13.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.14,>=2.13.1 (from tensorflow==2.13.0)
  Downloading keras-2.13.1-py3-none-any.whl.metadata (2.4 kB)
Collecting numpy<=1.24.3,>=1.22 (from tensorflow==2.13.0)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.13.0)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.14,>=2.13 (from tensorflow==2.13.0)
  Downloading tensorboard-2.13.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.14,>=2.13.0 (from tensorflow==2.13.0)
  Downloading tensorflow_estimator-2.13.0-py2.py3-none-any

In [2]:
import gymnasium as gym
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import q_network
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
import tensorflow as tf
import numpy as np


2025-11-19 22:32:42.562429: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-19 22:32:42.562497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-19 22:32:42.564282: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Entorno Pong (Atari)
env_name = "ALE/Pong-v5"
gym_env = suite_gym.load(env_name)  # env de TF-Agents
train_env = tf_py_environment.TFPyEnvironment(gym_env)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [6]:
# Obtener el tamaño de las acciones
num_actions = train_env.action_spec().maximum - train_env.action_spec().minimum + 1

# Red Q
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(256, 256)  # dos capas densas de 256 neuronas
)


In [7]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    epsilon_greedy=0.1,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.Variable(0)
)

agent.initialize()


In [8]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100000
)


In [17]:
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
import numpy as np

class GymPyWrapper(py_environment.PyEnvironment):
    def __init__(self, gym_env):
        super().__init__()
        self._env = gym_env
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=self._env.action_space.n-1, name='action'
        )
        obs_shape = self._env.observation_space.shape
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=obs_shape, dtype=np.uint8, minimum=0, maximum=255, name='observation'
        )
        self._state = None
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        obs, _ = self._env.reset()
        self._state = obs
        self._episode_ended = False
        return ts.restart(np.array(self._state, dtype=np.uint8))

    def _step(self, action):
        if self._episode_ended:
            return self.reset()
        obs, reward, terminated, truncated, _ = self._env.step(action)
        self._state = obs
        self._episode_ended = terminated or truncated
        if self._episode_ended:
            return ts.termination(np.array(self._state, dtype=np.uint8), reward)
        else:
            return ts.transition(np.array(self._state, dtype=np.uint8), reward, discount=1.0)


In [18]:
gym_env = gym.make("ALE/Pong-v5", frameskip=1)
gym_env = AtariPreprocessing(gym_env, frame_skip=4, grayscale_obs=True, scale_obs=False)
gym_env = FrameStack(gym_env, num_stack=4)

train_env = tf_py_environment.TFPyEnvironment(GymPyWrapper(gym_env))


In [19]:
# === 3. Crear entorno Pong con preprocesamiento Atari ===
env_name = "ALE/Pong-v5"

gym_env = gym.make(env_name, frameskip=1)
gym_env = AtariPreprocessing(gym_env, frame_skip=4, grayscale_obs=True, scale_obs=False)
gym_env = FrameStack(gym_env, num_stack=4)

train_env = tf_py_environment.TFPyEnvironment(GymPyWrapper(gym_env))


In [20]:
# === 4. Crear red Q y agente DQN ===
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(256,)
)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    epsilon_greedy=0.1,
    target_update_tau=0.05,
    target_update_period=200,
    gamma=0.99,
    td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"),
    train_step_counter=tf.Variable(0)
)

agent.initialize()


In [21]:
# === 5. Replay buffer ===
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100_000
)


In [22]:
# === 6. Función para recolectar experiencia ===
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)

# Ejemplo: recolectar 1000 pasos iniciales
for _ in range(1000):
    collect_step(train_env, agent.collect_policy, replay_buffer)


In [23]:
# Número pequeño de iteraciones para prueba
num_iterations = 5000
batch_size = 32

dataset = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    single_deterministic_pass=False
)
iterator = iter(dataset)

for _ in range(num_iterations):
    experience, _ = next(iterator)
    train_loss = agent.train(experience).loss


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


In [26]:
import imageio
import tensorflow as tf
import numpy as np
import cv2
import os

os.makedirs("pong_frames", exist_ok=True)

num_episodes = 1

for ep in range(num_episodes):
    time_step = train_env.reset()
    frames = []
    while not time_step.is_last():
        action_step = agent.policy.action(time_step)
        time_step = train_env.step(action_step.action)
        
        # Convertimos el tensor a numpy
        frame = time_step.observation[0].numpy()  # shape (4,84,84)
        
        # Seleccionamos el último frame del stack y convertimos a RGB
        last_frame = frame[-1]  # shape (84,84)
        frame_rgb = cv2.cvtColor(last_frame, cv2.COLOR_GRAY2RGB)  # ahora shape (84,84,3)
        
        frames.append(frame_rgb)
    
    video_path = f"pong_episode_{ep+1}.mp4"
    imageio.mimsave(video_path, frames, fps=30)
    print(f"Video guardado: {video_path}")




Video guardado: pong_episode_1.mp4


In [27]:
from IPython.display import HTML
from base64 import b64encode

video_path = "pong_episode_1.mp4"  # el video que guardaste

# Leer el archivo y codificarlo en base64
mp4 = open(video_path,'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

# Mostrar video en notebook
HTML(f"""
<video width=400 controls>
    <source src="{data_url}" type="video/mp4">
</video>
""")
