<a href="https://colab.research.google.com/github/DeerBay/Deep-Learning-Julia/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install gymnasium ale_py



In [70]:
import tensorflow as tf
import keras
from keras import layers
import gymnasium as gym
from gymnasium.wrappers import FrameStackObservation, AtariPreprocessing, RecordVideo
import numpy as np
import random
from collections import deque
import os
import ale_py
from tqdm import tqdm

# Device selection
device_name = "gpu"  # Set to "cpu" or "gpu" as needed
if device_name == "gpu" and tf.config.list_physical_devices('GPU'):
    device = "/GPU:0"
else:
    device = "/CPU:0"
device

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Setup Atari environment
env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array")  # Use rgb_array for recording

# Apply Atari preprocessing
env = AtariPreprocessing(env, frame_skip=4, grayscale_obs=True, scale_obs=True)

# Apply FrameStackObservation to stack 4 frames
env = FrameStackObservation(env, stack_size=4)  # Use stack_size parameter for frame stacking

# Record videos every 20th episode
trigger = lambda t: t % 20 == 0  # Video every 20th episode
env = RecordVideo(env, video_folder="./videos", episode_trigger=trigger, disable_logger=True)

# Test the setup
observation, info = env.reset()
print(f"Initial observation shape: {observation.shape}")

# Get the number of actions
num_actions = env.action_space.n
print(f"Number of actions: {num_actions}")
# List all action meanings
action_meanings = env.unwrapped.get_action_meanings()
print("Actions and their meanings:", action_meanings)

# Get the number of actions
num_actions = env.action_space.n
print(f"Number of actions: {num_actions}")
# List all action meanings
action_meanings = env.unwrapped.get_action_meanings()
print("Actions and their meanings:", action_meanings)

epsilon_decay_frames = 1000000  # Adjust this value if needed
batch_size = 32
max_steps_per_episode = 10000
max_episodes = 10000 # Update this
max_frames = 1e7

# Replay buffer parameters
# NOTE: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000

# Function to create the Q-network model
def create_q_model(input_shape=(84, 84, 4), num_actions=6):
    return tf.keras.Sequential(
        [
            layers.Input(shape=input_shape),
            layers.Conv2D(32, kernel_size=8, strides=4, activation="relu"),
            layers.Conv2D(64, kernel_size=4, strides=2, activation="relu"),
            layers.Conv2D(64, kernel_size=3, strides=1, activation="relu"),
            layers.Flatten(),
            layers.Dense(512, activation="relu"),
            layers.Dense(num_actions, activation="linear"),
        ]
    )

# Hyperparameters
input_shape = (84, 84, 4)
num_actions = 6
gamma = 0.99  # Discount factor
batch_size = 32
epsilon = 1.0
epsilon_min = 0.1
epsilon_interval = (epsilon - epsilon_min)  # Range of epsilon decay
epsilon_random_frames = 1000
epsilon_greedy_frames = 1000000
max_steps_per_episode = 10000
update_after_actions = 4
update_target_network = 10000
tau = 0.005  # Soft update factor
# Using huber loss for stability (specifically for Adam)
loss_function = keras.losses.Huber()

# Environment and Replay Buffer
replay_buffer = deque(maxlen=100000)
frame_count = 0
episode_count = 0
episode_reward_history = []
best_running_reward = -float("inf")
running_reward = 0

# Models and Optimizer
model = create_q_model(input_shape, num_actions)
model_target = create_q_model(input_shape, num_actions)
model_target.set_weights(model.get_weights())  # Synchronize weights initially
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# Create directory for saving models
save_dir = "models"
os.makedirs(save_dir, exist_ok=True)

# Training loop
with tf.device("/gpu:0"):  # Adjust device as needed
    print(f"Using device: /gpu:0")
    while True:
        # Reset environment
        observation, _ = env.reset()
        state = np.array(observation, dtype=np.float32) / 255.0  # Normalize and preprocess state
        state = np.transpose(state, (1, 2, 0))  # Ensure (84, 84, 4)
        episode_reward = 0

        for timestep in range(1, max_steps_per_episode + 1):
            frame_count += 1

            # Epsilon-greedy action selection
            if frame_count < epsilon_random_frames or np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
                state_tensor = tf.expand_dims(state_tensor, 0)  # Add batch dimension
                action_probs = model(state_tensor, training=False)
                action = tf.argmax(action_probs[0]).numpy()

            # Decay epsilon
            epsilon -= epsilon_interval / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min)

            # Environment step
            state_next, reward, done, _, _ = env.step(action)
            state_next = np.array(state_next, dtype=np.float32) / 255.0
            state_next = np.transpose(state_next, (1, 2, 0))  # Ensure (84, 84, 4)
            episode_reward += reward

            # Store transition in replay buffer
            replay_buffer.append((state, action, reward, state_next, done))
            state = state_next

            # Training
            if frame_count % update_after_actions == 0 and len(replay_buffer) >= batch_size:
                # Sample batch from replay buffer
                batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Convert to tensors
                state_sample = tf.convert_to_tensor(states, dtype=tf.float32)  # (batch_size, 84, 84, 4)
                next_state_sample = tf.convert_to_tensor(next_states, dtype=tf.float32)  # (batch_size, 84, 84, 4)

                action_sample = tf.convert_to_tensor(actions, dtype=tf.int32)
                reward_sample = tf.convert_to_tensor(rewards, dtype=tf.float32)
                done_sample = tf.convert_to_tensor(dones, dtype=tf.float32)

                # Compute target Q-values
                future_rewards = model_target(next_state_sample, training=False)
                updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

                # Train model
                with tf.GradientTape() as tape:
                    q_values = model(state_sample)
                    q_action = tf.reduce_sum(tf.one_hot(action_sample, num_actions) * q_values, axis=1)
                    loss = tf.keras.losses.Huber()(updated_q_values, q_action)

                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Update target network
            if frame_count % update_target_network == 0:
                model_target.set_weights(model.get_weights())

            if done:
                break

        # Log rewards and progress
        episode_reward_history.append(episode_reward)
        running_reward = np.mean(episode_reward_history[-100:])
        episode_count += 1

        if running_reward > best_running_reward:
            best_running_reward = running_reward
            model.save(f"{save_dir}/best_model_episode_{episode_count}.keras")

        if episode_count % 10 == 0:
            print(f"Episode: {episode_count}, Running Reward: {running_reward:.2f}, Epsilon: {epsilon:.2f}")
            print(f"Episod {episode_count} finished. Totalamount of frames: {frame_count}")


        if running_reward > 10000 or frame_count >= 50000000:
            print(f"Training complete after {episode_count} episodes and {frame_count} frames.")
            model.save(f"{save_dir}/final_model.keras")
            break



Using device: /gpu:0
Episode: 10, Running Reward: 163.50, Epsilon: 1.00
Episod 10 finished. Totalamount of frames: 4910


KeyboardInterrupt: 

In [39]:
with tf.device(device):  # Ensures all TensorFlow operations are executed on the selected device
    print(f"Using device: {device}")
    while True:
        # Reset environment
        observation, _ = env.reset()
        state = np.array(observation)  # Preprocessed state
        #print("State shape:", state.shape)  # Debugging to verify shape
        episode_reward = 0

        for timestep in range(1, max_steps_per_episode):
            frame_count += 1

            # Epsilon-greedy action selection
            if frame_count < epsilon_random_frames or tf.random.uniform((1,)) < epsilon:
                action = env.action_space.sample()
            else:
                # Ensure state is in the correct format: (batch_size, height, width, channels)
                state_tensor = keras.ops.convert_to_tensor(state)
                state_tensor = keras.ops.expand_dims(state_tensor, 0)
                action_probs = model(state_tensor, training=False)
                action = keras.ops.argmax(action_probs[0]).numpy()

            # Decay probability of taking random action
            epsilon -= epsilon_interval / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min)

            # Step in the environment
            state_next, reward, done, _, _ = env.step(action)
            state_next = np.array(state_next)  # Preprocessed next state
            episode_reward += reward

            # Store transition in replay buffer
            replay_buffer.append((state, action, reward, state_next, done))
            state = state_next  # Update state

            # Train the model
            if frame_count % update_after_actions == 0 and len(replay_buffer) >= batch_size:
                # Sample a batch from replay buffer
                batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Convert to tensors
                state_sample = tf.stack(states, axis=0)  # Combine into a single tensor
                print("state_sample shape before transpose:", state_sample.shape)

                # Transpose to (batch_size, 84, 84, 4)
                state_sample = tf.transpose(state_sample, perm=[0, 2, 3, 1])  # Correct permutation
                print("state_sample shape after transpose:", state_sample.shape)

                state_sample = tf.cast(state_sample, dtype=tf.float32)


                next_state_sample = tf.stack(next_states, axis=0)  # Result: (batch_size, 4, 84, 84)
                print("next_state_sample shape before transpose:", next_state_sample.shape)
                next_state_sample = tf.transpose(next_state_sample, perm=[0, 2, 3, 1])  # Rearrange to (batch_size, 84, 84, 4)
                print("next_state_sample shape after transpose:", next_state_sample.shape)
                next_state_sample = tf.cast(next_state_sample, dtype=tf.float32)

                action_sample = tf.convert_to_tensor(actions, dtype=tf.int32)
                reward_sample = tf.convert_to_tensor(rewards, dtype=tf.float32)
                done_sample = tf.convert_to_tensor(dones, dtype=tf.float32)

                # Debugging final shapes before passing to the model
                print("state_sample final shape:", state_sample.shape)  # Expected: (batch_size, 84, 84, 4)
                print("next_state_sample final shape:", next_state_sample.shape)  # Expected: (batch_size, 84, 84, 4)
                print("action_sample final shape:", action_sample.shape)  # Expected: (batch_size,)

                # Compute target Q-values
                future_rewards = model_target(next_state_sample)
                updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

                # Masked loss
                with tf.GradientTape() as tape:
                    q_values = model(state_sample)
                    q_action = tf.reduce_sum(tf.one_hot(action_sample, num_actions) * q_values, axis=1)
                    loss = loss_function(updated_q_values, q_action)

                # Backpropagation
                grads = tape.gradient(loss, model.trainable_variables)
                print("Gradient shapes:", [g.shape for g in grads if g is not None])
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Update target network
            if frame_count % update_target_network == 0:
                model_target.set_weights([
                    tau * w + (1 - tau) * tw
                    for w, tw in zip(model.get_weights(), model_target.get_weights())
                ])

            # End episode if done
            if done:
                break

        # Update running reward and history
        episode_reward_history.append(episode_reward)
        running_reward = np.mean(episode_reward_history)
        episode_count += 1
        # Update progress bar
        episode_bar.update(1)  # Increment the progress bar by one episode
        episode_bar.set_postfix(running_reward=f"{running_reward:.2f}", epsilon=f"{epsilon:.2f}")

        # Check if the model should be saved
        if running_reward > best_running_reward:
            best_running_reward = running_reward

            # Remove the last saved model if it exists
            if last_saved_model and os.path.exists(last_saved_model):
                os.remove(last_saved_model)

            # Save the new best model
            model_path = f"models/best_model_episode_{episode_count}.keras"
            model.save(model_path)
            last_saved_model = model_path

            # Log the result
            with open(log_file, "a") as f:
                f.write(f"{episode_count}\t{running_reward:.2f}\t{epsilon:.2f}\n")

            print(f"New best model saved: {model_path} with running reward: {running_reward:.2f}")

        # Print progress
        if episode_count % 10 == 0:
            print(f"Episode {episode_count}, Frame {frame_count}, Running Reward: {running_reward:.2f}, Epsilon: {epsilon:.2f}")

        # Termination conditions
        if running_reward > 10000:
            print(f"Solved with a running_reward of {running_reward} at episode {episode_count}!")
            model.save("spaceinvaders_qmodel_solved.keras")
            break

        if max_episodes > 0 and episode_count >= max_episodes:
            print(f"Stopped at episode {episode_count}!")
            break

        if max_frames > 0 and frame_count >= max_frames:
            print(f"Stopped at frame {frame_count}!")
            break

    # Final save after training
    model.save("spaceinvaders_qmodel_final.keras")
    print("Final model saved as 'spaceinvaders_qmodel_final.keras'.")
    print("Training complete.")
episode_bar.close()

Using device: /GPU:0
state_sample shape before transpose: (32, 4, 84, 84)
state_sample shape after transpose: (32, 84, 84, 4)
next_state_sample shape before transpose: (32, 4, 84, 84)
next_state_sample shape after transpose: (32, 84, 84, 4)
state_sample final shape: (32, 84, 84, 4)
next_state_sample final shape: (32, 84, 84, 4)
action_sample final shape: (32,)
Gradient shapes: [TensorShape([8, 8, 4, 32]), TensorShape([32]), TensorShape([4, 4, 32, 64]), TensorShape([64]), TensorShape([3, 3, 64, 64]), TensorShape([64]), TensorShape([3136, 512]), TensorShape([512]), TensorShape([512, 6]), TensorShape([6])]
state_sample shape before transpose: (32, 4, 84, 84)
state_sample shape after transpose: (32, 84, 84, 4)
next_state_sample shape before transpose: (32, 4, 84, 84)
next_state_sample shape after transpose: (32, 84, 84, 4)
state_sample final shape: (32, 84, 84, 4)
next_state_sample final shape: (32, 84, 84, 4)
action_sample final shape: (32,)
Gradient shapes: [TensorShape([8, 8, 4, 32]), T

KeyboardInterrupt: 