In [42]:
import tensorflow as tf
import keras
from keras import layers
import gymnasium as gym
from gymnasium.wrappers import FrameStackObservation, AtariPreprocessing, RecordVideo
import numpy as np
import random
from collections import deque
import os
import ale_py
from tqdm import tqdm

In [43]:
# Device selection
device_name = "gpu"  # Set to "cpu" or "gpu" as needed
if device_name == "gpu" and tf.config.list_physical_devices('GPU'):
    device = "/GPU:0"
else:
    device = "/CPU:0"
device

'/CPU:0'

In [44]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [45]:
# Environment setup
gym.register_envs(ale_py)
env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array", frameskip=1) 
env = AtariPreprocessing(env)
env = FrameStackObservation(env, 4)
trigger = lambda t: t % 20 == 0 # Every 20th episode
env = RecordVideo(env, video_folder="./videos", episode_trigger=trigger, disable_logger=True)

In [46]:
# Get the number of actions
num_actions = env.action_space.n
print(f"Number of actions: {num_actions}")
# List all action meanings
action_meanings = env.unwrapped.get_action_meanings()
print("Actions and their meanings:", action_meanings)

Number of actions: 6
Actions and their meanings: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


In [47]:
# Hyper parameters
seed = 42
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = (epsilon_max - epsilon_min)
epsilon_decay_frames = 1000000  # Adjust this value if needed


batch_size = 32
max_steps_per_episode = 10000
max_episodes = 0 # Update this
max_frames = 1e7

# Exploration parameters
epsilon_random_frames = 50000
epsilon_greedy_frames = 1000000.0

# Replay buffer parameters
# NOTE: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000
update_after_actions = 4
update_target_network = 10000

# Using huber loss for stability (specifically for Adam)
loss_function = keras.losses.Huber()

In [48]:
# Initialize history variables
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0

In [49]:

def create_q_model():
    return keras.Sequential(
        [
            layers.Input(shape=(84, 84, 4)),
            layers.Conv2D(32, kernel_size=8, strides=4, activation="relu"),
            layers.Conv2D(64, kernel_size=4, strides=2, activation="relu"),
            layers.Conv2D(64, kernel_size=3, strides=1, activation="relu"),
            layers.Flatten(),
            layers.Dense(512, activation="relu"),
            layers.Dense(num_actions, activation="linear")
        ]
    )


model = create_q_model()
model_target = create_q_model()

optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# Create directory for saving models
save_dir = "models"
os.makedirs(save_dir, exist_ok=True)

In [50]:
# Replay buffer
replay_buffer = deque(maxlen=max_memory_length)

# Initialize variables
best_running_reward = -float('inf')  # Start with the lowest possible reward
last_saved_model = None  # Track the last saved model
log_file = "model_performance_log.txt"  # Log file path

# Initialize tqdm progress bars
total_episodes = max_episodes if max_episodes > 0 else float('inf')  # Total episodes, if max_episodes is set
total_frames = max_frames if max_frames > 0 else float('inf')  # Total frames, if max_frames is set

episode_bar = tqdm(total=total_episodes, desc="Episodes", unit="episode")
frame_bar = tqdm(total=total_frames, desc="Frames", unit="frame")

# Clear log file
with open(log_file, "w") as f:
    f.write("Episode\tRunning Reward\tEpsilon\n")

# Initialize history
episode_reward_history = deque(maxlen=100)  # Efficient rolling window
running_reward = 0

# Target network update tau for soft updates
tau = 0.001

with tf.device(device):  # Ensures all TensorFlow operations are executed on the selected device
    print(f"Using device: {device}")
    while True:
        # Reset environment
        observation, _ = env.reset()
        state = np.array(observation)  # Preprocessed state
        #print("State shape:", state.shape)  # Debugging to verify shape
        episode_reward = 0

        for timestep in range(1, max_steps_per_episode):
            frame_count += 1

            # Epsilon-greedy action selection
            if frame_count < epsilon_random_frames or tf.random.uniform((1,)) < epsilon:
                action = env.action_space.sample()
            else:
                # Ensure state is in the correct format: (batch_size, height, width, channels)
                state_tensor = keras.ops.convert_to_tensor(state)
                state_tensor = keras.ops.expand_dims(state_tensor, 0)
                action_probs = model(state_tensor, training=False)
                action = keras.ops.argmax(action_probs[0]).numpy()

            # Decay probability of taking random action
            epsilon -= epsilon_interval / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min)

            # Step in the environment
            state_next, reward, done, _, _ = env.step(action)
            state_next = np.array(state_next)  # Preprocessed next state
            episode_reward += reward

            # Store transition in replay buffer
            replay_buffer.append((state, action, reward, state_next, done))
            state = state_next  # Update state

            # Train the model
            if frame_count % update_after_actions == 0 and len(replay_buffer) >= batch_size:
                # Sample a batch from replay buffer
                batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Convert to tensors
                state_sample = tf.stack(states, axis=0)  # Combine into a single tensor
                #print("state_sample shape before transpose:", state_sample.shape)

                # Transpose to (batch_size, 84, 84, 4)
                state_sample = tf.transpose(state_sample, perm=[0, 2, 3, 1])  # Correct permutation
                #print("state_sample shape after transpose:", state_sample.shape)

                state_sample = tf.cast(state_sample, dtype=tf.float32)
                

                next_state_sample = tf.stack(next_states, axis=0)  # Result: (batch_size, 4, 84, 84)
                #print("next_state_sample shape before transpose:", next_state_sample.shape)
                next_state_sample = tf.transpose(next_state_sample, perm=[0, 2, 3, 1])  # Rearrange to (batch_size, 84, 84, 4)
                #print("next_state_sample shape after transpose:", next_state_sample.shape)
                next_state_sample = tf.cast(next_state_sample, dtype=tf.float32)

                action_sample = tf.convert_to_tensor(actions, dtype=tf.int32)
                reward_sample = tf.convert_to_tensor(rewards, dtype=tf.float32)
                done_sample = tf.convert_to_tensor(dones, dtype=tf.float32)

                # Debugging final shapes before passing to the model
                #print("state_sample final shape:", state_sample.shape)  # Expected: (batch_size, 84, 84, 4)
                #print("next_state_sample final shape:", next_state_sample.shape)  # Expected: (batch_size, 84, 84, 4)
                #print("action_sample final shape:", action_sample.shape)  # Expected: (batch_size,)

                # Compute target Q-values
                future_rewards = model_target(next_state_sample)
                updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

                # Masked loss
                with tf.GradientTape() as tape:
                    q_values = model(state_sample)
                    q_action = tf.reduce_sum(tf.one_hot(action_sample, num_actions) * q_values, axis=1)
                    loss = loss_function(updated_q_values, q_action)

                # Backpropagation
                grads = tape.gradient(loss, model.trainable_variables)
                #print("Gradient shapes:", [g.shape for g in grads if g is not None])
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Update target network
            if frame_count % update_target_network == 0:
                model_target.set_weights([
                    tau * w + (1 - tau) * tw
                    for w, tw in zip(model.get_weights(), model_target.get_weights())
                ])

            # End episode if done
            if done:
                break

        # Update running reward and history
        episode_reward_history.append(episode_reward)
        running_reward = np.mean(episode_reward_history)
        episode_count += 1
        # Update progress bar
        episode_bar.update(1)  # Increment the progress bar by one episode
        episode_bar.set_postfix(running_reward=f"{running_reward:.2f}", epsilon=f"{epsilon:.2f}")

        # Check if the model should be saved
        if running_reward > best_running_reward:
            best_running_reward = running_reward

            # Remove the last saved model if it exists
            if last_saved_model and os.path.exists(last_saved_model):
                os.remove(last_saved_model)

            # Save the new best model
            model_path = f"models/best_model_episode_{episode_count}.keras"
            model.save(model_path)
            last_saved_model = model_path

            # Log the result
            with open(log_file, "a") as f:
                f.write(f"{episode_count}\t{running_reward:.2f}\t{epsilon:.2f}\n")

            print(f"New best model saved: {model_path} with running reward: {running_reward:.2f}")

        # Print progress
        if episode_count % 10 == 0:
            print(f"Episode {episode_count}, Frame {frame_count}, Running Reward: {running_reward:.2f}, Epsilon: {epsilon:.2f}")

        # Termination conditions
        if running_reward > 10000:
            print(f"Solved with a running_reward of {running_reward} at episode {episode_count}!")
            model.save("spaceinvaders_qmodel_solved.keras")
            break

        if max_episodes > 0 and episode_count >= max_episodes:
            print(f"Stopped at episode {episode_count}!")
            break

        if max_frames > 0 and frame_count >= max_frames:
            print(f"Stopped at frame {frame_count}!")
            break

    # Final save after training
    model.save("spaceinvaders_qmodel_final.keras")
    print("Final model saved as 'spaceinvaders_qmodel_final.keras'.")
    print("Training complete.")
episode_bar.close()

Episodes: 0episode [02:44, ?episode/s]

Frames:   0%|          | 0/10000000.0 [02:44<?, ?frame/s]

Using device: /CPU:0





New best model saved: models/best_model_episode_1.keras with running reward: 475.00


KeyboardInterrupt: 

In [None]:
# # Environment setup
# gym.register_envs(ale_py)
# env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array", frameskip=1)
# env = AtariPreprocessing(env)
# env = FrameStackObservation(env, 4)
# trigger = lambda t: t % 20 == 0  # Every 20th episode
# env = RecordVideo(env, video_folder="./videos", episode_trigger=trigger, disable_logger=True)

# # Get the number of actions
# num_actions = env.action_space.n
# print(f"Number of actions: {num_actions}")
# # List all action meanings
# action_meanings = env.unwrapped.get_action_meanings()
# print("Actions and their meanings:", action_meanings)

# # Hyper parameters
# seed = 42
# gamma = 0.99
# epsilon = 1.0
# epsilon_min = 0.1
# epsilon_max = 1.0
# epsilon_interval = (epsilon_max - epsilon_min)
# epsilon_decay_frames = 1000000  # Adjust this value if needed

# batch_size = 32
# max_steps_per_episode = 10000
# max_episodes = 0  # Update this
# max_frames = 1e7

# # Exploration parameters
# epsilon_random_frames = 50000
# epsilon_greedy_frames = 1000000.0

# # Replay buffer parameters
# # NOTE: The Deepmind paper suggests 1e6 however this causes memory issues
# max_memory_length = 100000
# update_after_actions = 4
# update_target_network = 10000

# # Using huber loss for stability (specifically for Adam)
# loss_function = keras.losses.Huber()

# # Initialize history variables
# action_history = []
# state_history = []
# state_next_history = []
# rewards_history = []
# done_history = []
# episode_reward_history = []
# running_reward = 0
# episode_count = 0
# frame_count = 0

# def create_q_model():
#     return keras.Sequential(
#         [
#             layers.Input(shape=(84, 84, 4)),
#             layers.Conv2D(32, kernel_size=8, strides=4, activation="relu"),
#             layers.Conv2D(64, kernel_size=4, strides=2, activation="relu"),
#             layers.Conv2D(64, kernel_size=3, strides=1, activation="relu"),
#             layers.Flatten(),
#             layers.Dense(512, activation="relu"),
#             layers.Dense(num_actions, activation="linear")
#         ]
#     )

# model = create_q_model()
# model_target = create_q_model()

# optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# # Create directory for saving models
# save_dir = "models"
# os.makedirs(save_dir, exist_ok=True)  # Replay buffer
# replay_buffer = deque(maxlen=max_memory_length)

# # Initialize variables
# best_running_reward = -float('inf')  # Start with the lowest possible reward
# last_saved_model = None  # Track the last saved model
# log_file = "model_performance_log.txt"  # Log file path

# # Clear log file
# with open(log_file, "w") as f:
#     f.write("Episode\tRunning Reward\tEpsilon\n")

# # Initialize history
# episode_reward_history = deque(maxlen=100)  # Efficient rolling window

# frame_count = 0
# episode_count = 0
# epsilon = epsilon_max

# # Target network update tau for soft updates
# tau = 0.001

# while True:
#     # Reset environment
#     observation, _ = env.reset()
#     state = tf.expand_dims(tf.convert_to_tensor(observation, dtype=tf.float32), 0)  # Preprocessed state
#     episode_reward = 0

#     # Epsilon decay once per episode
#     epsilon = max(epsilon_min, epsilon_max - epsilon_interval * (frame_count / epsilon_greedy_frames))

#     for timestep in range(1, max_steps_per_episode):
#         frame_count += 1

#         # Epsilon-greedy action selection
#         if frame_count < epsilon_random_frames or tf.random.uniform((1,)) < epsilon:
#             action = env.action_space.sample()
#         else:
#             # Ensure state is in the correct format: (batch_size, height, width, channels)
#             state = tf.transpose(state, perm=[0, 2, 3, 1])  # Convert (1, 4, 84, 84) to (1, 84, 84, 4) if needed
#             action_probs = model(state, training=False)
#             action = tf.argmax(action_probs[0]).numpy()

#         # Step in the environment
#         state_next, reward, done, _, _ = env.step(action)
#         state_next = tf.expand_dims(tf.convert_to_tensor(state_next, dtype=tf.float32), 0)  # Preprocessed next state
#         episode_reward += reward

#         # Store transition in replay buffer
#         replay_buffer.append((state, action, reward, state_next, done))
#         state = state_next  # Update state

#         # Train the model
#         if frame_count % update_after_actions == 0 and len(replay_buffer) >= batch_size:
#             # Sample a batch from replay buffer
#             batch = random.sample(replay_buffer, batch_size)
#             states, actions, rewards, next_states, dones = zip(*batch)

#             # Ensure all tensors are in correct shape
#             state_sample = [tf.transpose(s, perm=[0, 2, 3, 1]) for s in states]
#             state_sample = tf.concat(state_sample, axis=0)
#             state_sample = tf.cast(state_sample, dtype=tf.float32)

#             next_state_sample = [tf.transpose(s, perm=[0, 2, 3, 1]) for s in next_states]
#             next_state_sample = tf.concat(next_state_sample, axis=0)
#             next_state_sample = tf.cast(next_state_sample, dtype=tf.float32)

#             action_sample = tf.convert_to_tensor(actions, dtype=tf.int32)
#             reward_sample = tf.convert_to_tensor(rewards, dtype=tf.float32)
#             done_sample = tf.convert_to_tensor(dones, dtype=tf.float32)

#             # Compute target Q-values
#             future_rewards = model_target(next_state_sample)
#             updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

#             # Masked loss
#             with tf.GradientTape() as tape:
#                 q_values = model(state_sample)
#                 q_action = tf.reduce_sum(tf.one_hot(action_sample, num_actions) * q_values, axis=1)
#                 loss = loss_function(updated_q_values, q_action)

#             # Backpropagation
#             grads = tape.gradient(loss, model.trainable_variables)
#             optimizer.apply_gradients(zip(grads, model.trainable_variables))

#         # Update target network
#         if frame_count % update_target_network == 0:
#             model_target.set_weights([
#                 tau * w + (1 - tau) * tw
#                 for w, tw in zip(model.get_weights(), model_target.get_weights())
#             ])

#         # End episode if done
#         if done:
#             break

#     # Update running reward and history
#     episode_reward_history.append(episode_reward)
#     running_reward = np.mean(episode_reward_history)
#     episode_count += 1

#     # Check if the model should be saved
#     if running_reward > best_running_reward:
#         best_running_reward = running_reward

#         # Remove the last saved model if it exists
#         if last_saved_model and os.path.exists(last_saved_model):
#             os.remove(last_saved_model)

#         # Save the new best model
#         model_path = f"models/best_model_episode_{episode_count}.keras"
#         model.save(model_path)
#         last_saved_model = model_path

#         # Log the result
#         with open(log_file, "a") as f:
#             f.write(f"{episode_count}\t{running_reward:.2f}\t{epsilon:.2f}\n")

#         print(f"New best model saved: {model_path} with running reward: {running_reward:.2f}")

#     # Print progress
#     if episode_count % 10 == 0:
#         print(f"Episode {episode_count}, Frame {frame_count}, Running Reward: {running_reward:.2f}, Epsilon: {epsilon:.2f}")

#     # Termination conditions
#     if running_reward > 10000:
#         print(f"Solved with a running_reward of {running_reward} at episode {episode_count}!")
#         model.save("spaceinvaders_qmodel_solved.keras")
#         break

#     if max_episodes > 0 and episode_count >= max_episodes:
#         print(f"Stopped at episode {episode_count}!")
#         model.save(f"spaceinvaders_qmodel_final_episode_{episode_count}.keras")
#         break

#     if max_frames > 0 and frame_count >= max_frames:
#         print(f"Stopped at frame {frame_count}!")
#         model.save(f"spaceinvaders_qmodel_final_frame_{frame_count}.keras")
#         break

# # Final save after training
# model.save("spaceinvaders_qmodel_final.keras")
# print("Final model saved as 'spaceinvaders_qmodel_final.keras'.")
# print("Training complete.")

  logger.warn("Unable to save last video! Did you call close()?")


Number of actions: 6
Actions and their meanings: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
New best model saved: models/best_model_episode_1.keras with running reward: 240.00
New best model saved: models/best_model_episode_2.keras with running reward: 325.00
Episode 10, Frame 5561, Running Reward: 181.00, Epsilon: 1.00
Episode 20, Frame 9903, Running Reward: 143.00, Epsilon: 0.99
Episode 30, Frame 14455, Running Reward: 129.17, Epsilon: 0.99
Episode 40, Frame 19871, Running Reward: 132.25, Epsilon: 0.98
Episode 50, Frame 24699, Running Reward: 132.30, Epsilon: 0.98
Episode 60, Frame 30027, Running Reward: 132.83, Epsilon: 0.97
Episode 70, Frame 35091, Running Reward: 133.93, Epsilon: 0.97
Episode 80, Frame 41215, Running Reward: 139.44, Epsilon: 0.96
Episode 90, Frame 46330, Running Reward: 139.33, Epsilon: 0.96


InvalidArgumentError: {{function_node __wrapped__ConcatV2_N_32_device_/job:localhost/replica:0/task:0/device:CPU:0}} ConcatOp : Dimension 2 in both shapes must be equal: shape[0] = [1,84,84,4] vs. shape[4] = [1,84,4,84] [Op:ConcatV2] name: concat

In [None]:
print(state_sample.shape)


NameError: name 'state_sample' is not defined