In [1]:
# Install the Atari/Gymnasium stack plus the training and logging dependencies.
# NOTE(review): none of these installs pin a version, so a fresh run may resolve
# different releases than the ones this notebook was developed against.
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
# NOTE(review): `gym` and `keyboard` are installed here but are not imported in
# any of the visible cells - confirm they are still needed.
!pip install --upgrade gym ale-py
!pip install keyboard
!pip install keras
!pip install tensorflow
!pip install wandb

import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import deque



from ale_py import ALEInterface
from ale_py.roms import SpaceInvaders
import pathlib
import gymnasium as gym
import wandb
from tensorflow.keras.callbacks import ModelCheckpoint
# Report how many GPUs TensorFlow can see before any model is built.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gymnasium[atari]
  Downloading gymnasium-0.28.1-py3-none-any.whl (925 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting jax-jumpy>=1.0.0 (from gymnasium[atari])
  Downloading jax_jumpy-1.0.0-py3-none-any.whl (20 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium[atari])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting shimmy[atari]<1.0,>=0.1.0 (from gymnasium[atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl (25 kB)
Collecting ale-py~=0.8.1 (from shimmy[atari]<1.0,>=0.1.0->gymnasium[atari])
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: farama-notifications, ja

In [None]:
# Initialise the Arcade Learning Environment interface.
ale = ALEInterface()


In [None]:
# Load the game ROM into the ALE interface.
ale.loadROM(SpaceInvaders)

# Gymnasium environment the agent interacts with during training.
env = gym.make('ALE/SpaceInvaders-v5')

# First dimension of the observation shape; not referenced again in the visible cells.
n_inputs = env.observation_space.shape[0]
# Number of discrete actions; used as the width of the Q-network's output layer.
n_outputs = env.action_space.n


In [None]:
# Online Q-network: a convolutional stack that maps one raw 210x160 RGB frame
# to one Q-value per action (n_outputs).
main_nn = keras.Sequential([
    keras.layers.Conv2D(32, (8, 8), strides=4, activation='relu', input_shape=(210, 160, 3)),
    keras.layers.Conv2D(64, (4, 4), strides=2, activation='relu'),
    keras.layers.Conv2D(64, (3, 3), strides=1, activation='relu'),
    keras.layers.Flatten(),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dense(n_outputs)
])

# Target network used to compute the bootstrap targets. clone_model() copies
# only the architecture and creates freshly initialised weights, so the online
# weights have to be copied over explicitly to start both networks in sync.
target_nn = keras.models.clone_model(main_nn)
target_nn.set_weights(main_nn.get_weights())

# `lr` is a deprecated alias (and rejected by newer Keras releases);
# `learning_rate` is the supported keyword.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.mean_squared_error

# Bounded FIFO of (state, action, reward, next_state, done) transitions.
replay_buffer = deque(maxlen=10000)



In [None]:
def epsilon_greedy_policy(state, epsilon=0):
    """Pick an action with an epsilon-greedy strategy.

    Args:
        state: Either a single observation array, or the ``(observation, info)``
            tuple returned by ``env.reset()``.
        epsilon: Probability of choosing a uniformly random action instead of
            the action with the highest predicted Q-value.

    Returns:
        The index of the selected action.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)

    # env.reset() returns (observation, info); the network only takes the frame.
    if isinstance(state, tuple) and len(state) == 2 and isinstance(state[0], np.ndarray) and isinstance(state[1], dict):
        state = state[0]

    # verbose=0: predict() would otherwise print a progress bar on every
    # environment step and flood the notebook output.
    Q_values = main_nn.predict(state[np.newaxis], verbose=0)
    return np.argmax(Q_values[0])

In [None]:
def sample_experiences(batch_size):
    """Draw a random batch of transitions from the replay buffer.

    Indices are drawn uniformly with replacement. Each of the five transition
    fields is gathered into its own object-dtype NumPy array.

    Returns:
        Tuple ``(states, actions, rewards, next_states, dones)``.
    """
    picks = np.random.randint(len(replay_buffer), size=batch_size)
    transitions = [replay_buffer[pick] for pick in picks]

    columns = []
    for field in range(5):
        column = [transition[field] for transition in transitions]
        columns.append(np.array(column, dtype=object))

    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards, next_states, dones



In [None]:
def play_one_step(env, state, epsilon):
    """Take one epsilon-greedy step in ``env`` and record the transition.

    Args:
        env: Gymnasium environment whose ``step`` returns
            ``(obs, reward, terminated, truncated, info)``.
        state: Current observation, or the ``(observation, info)`` tuple
            returned by ``env.reset()``.
        epsilon: Exploration probability forwarded to the policy.

    Returns:
        Tuple ``(next_state, reward)`` for the step that was taken.
    """
    # Keep only the frame so the replay buffer never stores (obs, info) tuples.
    if isinstance(state, tuple) and len(state) == 2 and isinstance(state[0], np.ndarray):
        state = state[0]

    action = epsilon_greedy_policy(state, epsilon)

    # Step the environment exactly once. Stepping twice would discard the first
    # frame and reward, and the stored transition would no longer be the direct
    # successor of `state`.
    next_state, reward, terminated, _truncated, _info = env.step(action)

    if next_state.dtype == np.uint8:
        replay_buffer.append((state, action, reward, next_state, terminated))
    return next_state, reward


In [None]:
discount_rate = 0.99


def _extract_frames(states):
    """Build a float32 batch of frames from sampled replay-buffer states.

    A state may be the ``(observation, info)`` tuple from ``env.reset()``; in
    that case only the observation is kept. Any entry whose shape is not
    (210, 160, 3) is replaced by the closest preceding valid frame, or by the
    first valid frame in the batch when it has no valid predecessor.
    """
    expected_shape = (210, 160, 3)

    frames = []
    for state in states:
        if isinstance(state, tuple) and len(state) == 2 and isinstance(state[0], np.ndarray) and isinstance(state[1], dict):
            state = state[0]
        frames.append(state)

    valid = [frame for frame in frames if getattr(frame, 'shape', None) == expected_shape]
    if not valid:
        raise ValueError("no state in the sampled batch has shape (210, 160, 3)")

    fallback = valid[0]
    for i, frame in enumerate(frames):
        if getattr(frame, 'shape', None) == expected_shape:
            fallback = frame
        else:
            frames[i] = fallback

    return np.stack(frames).astype('float32')


def training_step(batch_size):
    """Run one DQN gradient step on a random batch from the replay buffer.

    Targets are ``reward + (1 - done) * discount_rate * max_a Q_target(s', a)``
    and the loss is the mean squared error against the online network's
    Q-value for the action that was actually taken.

    Args:
        batch_size: Number of transitions to sample.

    Returns:
        The scalar loss tensor for this batch.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    # Bootstrap targets come from the target network; no gradient is needed
    # here, so they are computed outside the tape.
    next_Q_values = target_nn.predict(next_states.astype('float32'), verbose=0)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    rewards = rewards.astype('float32')
    dones = dones.astype('float32')
    target_Q_values = rewards + (1.0 - dones) * discount_rate * max_next_Q_values
    target_Q_values = target_Q_values.reshape(-1, 1).astype('float32')

    # One-hot mask selects the Q-value of the action taken in each transition.
    mask = tf.one_hot(actions.astype('int32'), n_outputs)
    state_batch = tf.convert_to_tensor(_extract_frames(states))

    with tf.GradientTape() as tape:
        all_Q_values = main_nn(state_batch)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, main_nn.trainable_variables)
    optimizer.apply_gradients(zip(grads, main_nn.trainable_variables))

    return loss

In [None]:
def play_one_step_train(env, state, model):
    """Take one greedy step in ``env`` using ``model`` to choose the action.

    ``state`` may be a bare observation or a two-element tuple whose first item
    is the observation array. The transition is appended to the replay buffer
    when the next observation has dtype uint8.

    Returns:
        Tuple ``(next_state, reward)``.
    """
    # A two-element tuple headed by an array is unwrapped to the array itself.
    is_wrapped = (
        isinstance(state, tuple)
        and len(state) == 2
        and isinstance(state[0], np.ndarray)
    )
    frame = state[0] if is_wrapped else state

    # Greedy choice: the action with the highest predicted Q-value.
    predicted = model.predict(frame[np.newaxis])
    action = np.argmax(predicted[0])

    next_state, reward, done, _unused_a, _unused_b = env.step(action)
    transition = (frame, action, reward, next_state, done)
    if next_state.dtype == np.uint8:
        replay_buffer.append(transition)
    return next_state, reward

In [None]:
import os
from google.colab import drive
# Mount Google Drive into Colab
drive.mount('/content/gdrive')
# Filename template for periodic checkpoints; the `{episode:03d}` placeholder is
# filled in with str.format when the main loop saves a checkpoint.
checkpoint_path = '/content/gdrive/My Drive/Proyecto 2 AA/SI/model_{episode:03d}.h5'
# Path the main loop checks to decide whether a saved model should be loaded
# and played instead of training.
model_file = '/content/gdrive/My Drive/Proyecto 2 AA/SI/my_dqn.h5'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:


import glob
import os
import re

wandb.init(project="SI Project")

# NOTE(review): this callback only takes effect when passed to model.fit();
# the visible training loop is hand-written and never uses it, so checkpoints
# are actually written by the explicit main_nn.save(...) call in that loop.
checkpoint_callback = ModelCheckpoint(
    checkpoint_path,
    save_weights_only=True,
    save_best_only=False,
    save_freq=10,  # An integer save_freq counts batches, not episodes
    verbose=1
)

# Kept for any code that expects a `callbacks` list.
callbacks = [checkpoint_callback]
start_episode = 0  # Overwritten below when a checkpoint is found

directory = '/content/gdrive/My Drive/Proyecto 2 AA/SI/'


def _checkpoint_episode(path):
    """Return the episode number encoded in a `model_<n>.h5` filename, else None.

    Only the basename is inspected so underscores elsewhere in the directory
    path cannot corrupt the parse.
    """
    match = re.fullmatch(r'model_(\d+)\.h5', os.path.basename(path))
    return int(match.group(1)) if match else None


# Keep only files that really follow the checkpoint naming scheme, ordered by
# episode number so the last entry is the most recent checkpoint.
checkpoint_files = [
    path for path in glob.glob(directory + 'model_*.h5')
    if _checkpoint_episode(path) is not None
]
checkpoint_files.sort(key=_checkpoint_episode)

latest_checkpoint = checkpoint_files[-1] if checkpoint_files else None

print(f"Latest checkpoint path: {latest_checkpoint}")

if latest_checkpoint is not None:
    start_episode = _checkpoint_episode(latest_checkpoint)

    # Restore the online network and bring the target network in line with it;
    # otherwise the bootstrap targets would come from unrelated weights.
    main_nn.load_weights(latest_checkpoint)
    target_nn.set_weights(main_nn.get_weights())
    print(f"Resuming training from episode {start_episode }")
else:
    print("No checkpoints found in the specified directory.")



Latest checkpoint path: /content/gdrive/My Drive/Proyecto 2 AA/SI/model_110.h5
Resuming training from episode 110
Latest checkpoint path: /content/gdrive/My Drive/Proyecto 2 AA/SI/model_110.h5


In [None]:

# Main loop: replay a finished model if one has been saved, otherwise train.
TRAIN_EPISODES = 600
STEPS_PER_EPISODE = 150
WARMUP_EPISODES = 70      # Episodes of pure data collection before learning starts
BATCH_SIZE = 70
CHECKPOINT_EVERY = 10     # Episodes between checkpoint saves
TARGET_SYNC_EVERY = 10    # Episodes between target-network refreshes

if os.path.isfile(model_file):
    model = keras.models.load_model(model_file)

    env = gym.make('ALE/SpaceInvaders-v5', render_mode='human')
    # reset() returns (observation, info); only the frame is fed to the model.
    obs, _ = env.reset()

    # NOTE(review): play_one_step_train does not report episode termination,
    # so this loop runs until the cell is interrupted.
    while True:
        obs, reward = play_one_step_train(env, obs, model)

else:
    for episode in range(start_episode, TRAIN_EPISODES):
        obs, _ = env.reset()

        # Epsilon depends only on the episode index: linear decay to 0.01.
        epsilon = max(1 - episode / 500, 0.01)
        episode_reward = 0.0

        # NOTE(review): play_one_step does not return the done flag, so a game
        # over inside these steps is not detected here.
        for step in range(STEPS_PER_EPISODE):
            obs, reward = play_one_step(env, obs, epsilon)
            episode_reward += reward

            # Learn only after the warm-up and once a full batch is available.
            if episode > WARMUP_EPISODES and len(replay_buffer) >= BATCH_SIZE:
                loss = training_step(BATCH_SIZE)

                # "total_reward" is the reward accumulated so far this episode.
                wandb.log({"episode": episode, "total_reward": episode_reward, "loss": float(loss)})
        print(f"Episode: {episode}")

        # Periodically copy the online weights into the target network so the
        # bootstrap targets track what has been learned.
        if episode % TARGET_SYNC_EVERY == 0:
            target_nn.set_weights(main_nn.get_weights())

        if episode % CHECKPOINT_EVERY == 0:
            main_nn.save(checkpoint_path.format(episode=episode))

    # Save to the same path the branch above checks, so that a later run finds
    # the finished model and replays it.
    main_nn.save(model_file)