In [87]:
import os
import random
from collections import deque

import ale_py
import gymnasium as gym
import keras
import numpy as np
import tensorflow as tf
from gymnasium.wrappers import FrameStackObservation, AtariPreprocessing, RecordVideo
from keras import layers

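'''
Note on ale_py:

Gymnasium 1.0 and later (the releases that provide FrameStackObservation) no longer
register the Atari environments automatically when gymnasium[atari] is installed.
ale_py therefore has to be imported before gym.make is called; calling
gym.register_envs(ale_py) is optional and mainly stops linters and IDEs from
flagging the import as unused.
'''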

'\nYou don’t need import ale_py and gym.register_envs(ale_py) because:\n\nAtari environments are automatically registered during the installation of gymnasium[atari].\nThe current Gymnasium API handles all the behind-the-scenes setup for you.\n'

In [88]:
# Report how many GPU devices TensorFlow can see; a count of 0 means no GPU device was found.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [89]:
# Hyper parameters
seed = 42
# Apply the seed: this seeds Python's `random`, NumPy and the Keras backend in one call,
# so weight initialisation, exploration draws and replay sampling are repeatable.
# The environment and its action space keep their own RNGs and are not covered by this call.
keras.utils.set_random_seed(seed)

gamma = 0.99  # Discount factor applied to the bootstrapped future value
epsilon = 1.0  # Current exploration rate (the training cell resets it to epsilon_max)
epsilon_min = 0.1  # Exploration floor
epsilon_max = 1.0  # Exploration rate at the start of training
epsilon_interval = (epsilon_max - epsilon_min)  # Total amount epsilon is annealed by
# NOTE(review): not referenced by the visible training loop, which anneals over
# `epsilon_greedy_frames` instead; kept so any other cell relying on it still works.
epsilon_decay_frames = 1000000  # Adjust this value if needed


batch_size = 32  # Transitions per gradient step
max_steps_per_episode = 10000  # Upper bound of the per-episode step loop
max_episodes = 0 # Update this (0 disables the episode limit)
max_frames = 1e7  # Training stops once this many frames have been played

# Exploration parameters
epsilon_random_frames = 50000  # Frames during which every action is random
epsilon_greedy_frames = 1000000.0  # Frames over which epsilon is annealed from max to min

# Replay buffer parameters
# NOTE: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000
update_after_actions = 4  # Run one gradient step every N frames
update_target_network = 10000  # Refresh the target network every N frames

# Using huber loss for stability (specifically for Adam)
loss_function = keras.losses.Huber()

# Initialize history variables
# NOTE(review): the per-field history lists below are not used by the visible training
# loop (it stores transitions in its own replay buffer); they are kept for compatibility.
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0

In [90]:
# Environment setup
# Make sure the Atari (ALE) environments are registered in this cell rather than relying
# on a later cell having imported ale_py in the same kernel session.
gym.register_envs(ale_py)

# frameskip=1 keeps the base environment from skipping frames itself, so it cannot
# conflict with the frame handling done by the AtariPreprocessing wrapper.
env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array", frameskip=1)
env = AtariPreprocessing(env)
# Stack the last 4 preprocessed frames into a single observation
env = FrameStackObservation(env, 4)


def trigger(t):
    """Return True for every 20th episode index (0, 20, 40, ...), i.e. the episodes to record."""
    return t % 20 == 0


env = RecordVideo(env, video_folder="./videos", episode_trigger=trigger, disable_logger=True)

# Seed the action space so the random exploration actions are repeatable across runs.
env.action_space.seed(seed)

  logger.warn("Unable to save last video! Did you call close()?")


In [91]:
# Size of the discrete action space; create_q_model() reads this global to size
# the Q-network's output layer.
num_actions = env.action_space.n
print(f"Number of actions: {num_actions}")
# Names of the actions as reported by the unwrapped environment, in action-index order
action_meanings = env.unwrapped.get_action_meanings()
print("Actions and their meanings:", action_meanings)

Number of actions: 6
Actions and their meanings: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


In [92]:
def create_q_model():
    """Build the convolutional Q-network.

    The network takes inputs of shape ``(84, 84, 4)`` (channels-last) and ends in a
    linear layer with ``num_actions`` units, one Q-value per action. ``num_actions``
    is read from the notebook's global scope.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    # (filters, kernel_size, stride) for each convolutional layer, in order
    conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

    stack = [layers.Input(shape=(84, 84, 4))]
    for n_filters, kernel, stride in conv_specs:
        stack.append(
            layers.Conv2D(n_filters, kernel_size=kernel, strides=stride, activation="relu")
        )

    # Fully connected head: hidden layer followed by one linear output per action
    stack.append(layers.Flatten())
    stack.append(layers.Dense(512, activation="relu"))
    stack.append(layers.Dense(num_actions, activation="linear"))

    return keras.Sequential(stack)


# Online network: used to pick actions and updated by gradient descent.
model = create_q_model()
# Target network: supplies the bootstrapped values used to build the training targets.
model_target = create_q_model()
# Start the target network as an exact copy of the online network. Without this the two
# networks begin from unrelated random weights until the first target-network refresh.
model_target.set_weights(model.get_weights())

# Gradients are clipped to a global norm of 1.0 before each Adam update
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# Create directory for saving models
save_dir = "models"
os.makedirs(save_dir, exist_ok=True)

In [95]:
from collections import deque
import random

# Replay buffer of (state, action, reward, next_state, terminated) transitions.
# Observations are stored as the frame stacks returned by the environment (NumPy arrays,
# frames-first) and are converted to float32 only when they are fed to a network.
# Assuming the preprocessed frames are uint8 (TODO confirm), this needs about a quarter
# of the memory that storing float32 tensors would.
replay_buffer = deque(maxlen=max_memory_length)

# Rolling window of the most recent episode returns
episode_reward_history = deque(maxlen=100)

frame_count = 0
episode_count = 0
epsilon = epsilon_max

# Blend factor for target-network refreshes:
#     target <- tau * online + (1 - tau) * target
# tau = 1.0 is a full copy, which is what a refresh every `update_target_network` frames
# calls for. A small tau (soft update) only makes sense if the refresh runs every
# training step; combined with a 10000-frame period the target would barely move.
tau = 1.0


def frames_to_model_input(stacked_frames):
    """Convert a batch of frame stacks to the layout the Q-network expects.

    Args:
        stacked_frames: array-like of shape (batch, 4, 84, 84), i.e. frames-first stacks
            as produced by the frame-stacking wrapper.

    Returns:
        A float32 tensor of shape (batch, 84, 84, 4).
    """
    frames = tf.cast(tf.convert_to_tensor(np.asarray(stacked_frames)), tf.float32)
    return tf.transpose(frames, perm=[0, 2, 3, 1])


while True:
    # Reset environment
    observation, _ = env.reset()
    # Copy the observation so later environment steps can never alter stored transitions
    state = np.array(observation)
    episode_reward = 0

    # Epsilon is annealed linearly in the frame count and refreshed once per episode
    epsilon = max(epsilon_min, epsilon_max - epsilon_interval * (frame_count / epsilon_greedy_frames))

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy action selection
        if frame_count < epsilon_random_frames or random.random() < epsilon:
            action = int(env.action_space.sample())
        else:
            # Add a batch axis and convert to channels-last before querying the network;
            # the raw state is (4, 84, 84) while the network expects (84, 84, 4).
            q_estimates = model(frames_to_model_input(state[np.newaxis]), training=False)
            action = int(tf.argmax(q_estimates[0]).numpy())

        # Step in the environment
        observation, reward, terminated, truncated, _ = env.step(action)
        state_next = np.array(observation)
        episode_reward += reward

        # Store the transition. Only `terminated` is recorded as the terminal flag:
        # a truncated episode did not reach a true terminal state, so its last
        # transition should still bootstrap from the next state's value.
        replay_buffer.append((state, action, reward, state_next, terminated))
        state = state_next

        # Train the model
        if frame_count % update_after_actions == 0 and len(replay_buffer) >= batch_size:
            # Sample a batch from replay buffer
            batch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, terminals = zip(*batch)

            # Convert to tensors; both state batches become (batch_size, 84, 84, 4)
            state_sample = frames_to_model_input(np.stack(states))
            next_state_sample = frames_to_model_input(np.stack(next_states))

            action_sample = tf.convert_to_tensor(np.asarray(actions, dtype=np.int32))
            reward_sample = tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32))
            done_sample = tf.convert_to_tensor(np.asarray(terminals, dtype=np.float32))

            # Compute target Q-values: r + gamma * max_a' Q_target(s', a'), with the
            # bootstrap term zeroed for terminal transitions
            future_rewards = model_target(next_state_sample)
            updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

            # Masked loss: only the Q-value of the action actually taken contributes
            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.one_hot(action_sample, num_actions) * q_values, axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Update target network
        if frame_count % update_target_network == 0:
            model_target.set_weights([
                tau * w + (1 - tau) * tw
                for w, tw in zip(model.get_weights(), model_target.get_weights())
            ])

        # End the episode on either a true terminal state or a truncation; stepping an
        # environment after it has signalled the end of an episode is not valid.
        if terminated or truncated:
            break

    # Update running reward and history
    episode_reward_history.append(episode_reward)
    running_reward = np.mean(episode_reward_history)
    episode_count += 1

    # Save model after each episode
    model_path = f"models/episode_{episode_count}.keras"
    model.save(model_path)
    print(f"Model saved for episode {episode_count} at {model_path}.")

    # Print progress
    if episode_count % 10 == 0:
        print(f"Episode {episode_count}, Frame {frame_count}, Running Reward: {running_reward:.2f}, Epsilon: {epsilon:.2f}")

    # Termination conditions
    # NOTE(review): the threshold of 40 is not derived from anything in this notebook.
    # Confirm it suits this game's reward scale, and consider requiring a full
    # 100-episode window before declaring the task solved.
    if running_reward > 40:  # Considered good intermediate performance
        print(f"Solved with a running_reward of {running_reward} at episode {episode_count}!")
        model.save("spaceinvaders_qmodel_solved.keras")
        break

    if max_episodes > 0 and episode_count >= max_episodes:
        print(f"Stopped at episode {episode_count}!")
        break

    if max_frames > 0 and frame_count >= max_frames:
        print(f"Stopped at frame {frame_count}!")
        break

# Final save after training
model.save("spaceinvaders_qmodel_final.keras")
print("Final model saved as 'spaceinvaders_qmodel_final.keras'.")

# Close the environment so the video recorder can finalise the last recording
env.close()
print("Training complete.")


state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)
state_sample shape: (32, 84, 84, 4)
next_state_sample shape: (32, 84, 84, 4)

In [100]:
import keras
import gymnasium as gym
import ale_py
from gymnasium.wrappers import FrameStackObservation
from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing

# Register ALE environments
gym.register_envs(ale_py)

# Load the model written at the end of the training cell. The agent must have been
# trained on this game: a model trained on a different game has a different number
# of output actions.
model_file = "spaceinvaders_qmodel_final.keras"
agent = keras.models.load_model(model_file)

# Initialize the environment with the same preprocessing and frame stacking as in training
env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="human")
env = AtariPreprocessing(env)
env = FrameStackObservation(env, 4)

# Reset the environment
state, _ = env.reset()
done = False

# Run one episode with the greedy policy; always close the environment afterwards so
# the render window and emulator are released even if the loop is interrupted.
try:
    while not done:
        # Convert state to a tensor for compute efficiency
        state_tensor = keras.ops.convert_to_tensor(state)
        # Feed float32 inputs, matching what the network received during training
        state_tensor = keras.ops.cast(state_tensor, "float32")
        # Transpose state shape from (4, 84, 84) to (84, 84, 4)
        state_tensor = keras.ops.transpose(state_tensor, [1, 2, 0])
        # Add batch dimension
        state_tensor = keras.ops.expand_dims(state_tensor, 0)
        # Predict the Q-value of every action
        action_probs = agent(state_tensor, training=False)
        # Take the "best" action
        action = int(keras.ops.argmax(action_probs[0]).numpy())

        # Step the environment; the episode is over on termination or truncation
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
finally:
    env.close()



: 

: 