<a href="https://colab.research.google.com/github/DeerBay/Deep-Learning-Julia/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gymnasium['atari, other'] gymnasium ale-py tqdm

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting ale-py
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium[atari,other])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, ale-py
Successfully installed ale-py-0.10.1 farama-notifications-0.0.4 gymnasium-1.0.0


In [11]:
import tensorflow as tf
import keras
from keras import layers
import gymnasium as gym
from gymnasium.wrappers import FrameStackObservation, AtariPreprocessing, RecordVideo
import numpy as np
import os
from tqdm import tqdm

# Performance and GPU configuration.
# Turn on XLA JIT compilation, then ask TensorFlow to grow GPU memory on
# demand for every visible GPU. With no GPU present the loop body never runs.
tf.config.optimizer.set_jit(True)
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Best effort: report the problem and carry on with default settings.
    print(e)

class ReplayBuffer:
    """Fixed-capacity circular store of (state, action, reward, next_state, done).

    All storage is preallocated as NumPy arrays. Once ``max_size`` transitions
    have been added, each new transition overwrites the oldest slot.
    """

    def __init__(self, max_size, state_shape, num_actions):
        """Allocate zero-filled storage for ``max_size`` transitions.

        ``state_shape`` is the shape of a single state. ``num_actions`` is
        accepted for the caller's convenience but is not used by the buffer.
        """
        frames_shape = (max_size, *state_shape)
        self.max_size = max_size
        self.states = np.zeros(frames_shape, dtype=np.float32)
        self.actions = np.zeros(max_size, dtype=np.int32)
        self.rewards = np.zeros(max_size, dtype=np.float32)
        self.next_states = np.zeros(frames_shape, dtype=np.float32)
        self.dones = np.zeros(max_size, dtype=np.float32)
        self.index = 0          # total number of transitions ever added
        self.is_full = False    # becomes True once every slot has been written

    def _columns(self):
        """Return the storage arrays in (s, a, r, s', done) order."""
        return (
            self.states,
            self.actions,
            self.rewards,
            self.next_states,
            self.dones,
        )

    def add(self, state, action, reward, next_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.index % self.max_size
        transition = (state, action, reward, next_state, done)
        for column, value in zip(self._columns(), transition):
            column[slot] = value
        self.index += 1
        self.is_full = self.is_full or self.index >= self.max_size

    def sample(self, batch_size):
        """Return a tuple of five arrays for ``batch_size`` distinct stored slots.

        Slots are drawn without replacement, so NumPy raises ``ValueError``
        when ``batch_size`` exceeds the number of stored transitions.
        """
        population = self.index if not self.is_full else self.max_size
        picks = np.random.choice(population, batch_size, replace=False)
        return tuple(column[picks] for column in self._columns())

    def __len__(self):
        """Number of transitions currently stored (at most ``max_size``)."""
        if self.is_full:
            return self.max_size
        return self.index

class DQNAgent:
    """Deep Q-Network agent with an online network, a target network and a replay buffer.

    The online network (``model``) is trained; the target network
    (``target_model``) supplies the bootstrap values and is only refreshed
    when ``update_target_model`` is called.
    """

    def __init__(self, state_shape, num_actions, learning_rate=0.00025, *,
                 buffer_size=100000, gamma=0.99, epsilon=1.0,
                 epsilon_min=0.1, epsilon_decay=0.99995):
        """Build both networks, the optimizer and the replay buffer.

        Args:
            state_shape: Shape of a single observation fed to the network.
            num_actions: Number of discrete actions (size of the output layer).
            learning_rate: Adam learning rate.
            buffer_size: Capacity of the replay buffer, in transitions.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Initial exploration probability.
            epsilon_min: Floor for ``epsilon`` (enforced by the caller's decay step).
            epsilon_decay: Multiplicative decay factor (applied by the caller).
        """
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.model = self.create_q_model(state_shape, num_actions)
        self.target_model = self.create_q_model(state_shape, num_actions)
        # Start with identical weights so early targets match the online net.
        self.target_model.set_weights(self.model.get_weights())
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0)
        # Build the loss once instead of on every training trace.
        self.loss_fn = tf.keras.losses.Huber()
        self.replay_buffer = ReplayBuffer(max_size=buffer_size, state_shape=state_shape, num_actions=num_actions)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

    def create_q_model(self, input_shape, num_actions):
        """Return a convolutional Q-network: three conv layers, a 512-unit dense
        layer and a linear output with one value per action."""
        return tf.keras.Sequential([
            layers.Input(shape=input_shape),
            layers.Conv2D(32, kernel_size=8, strides=4, activation="relu", kernel_initializer='he_uniform'),
            layers.Conv2D(64, kernel_size=4, strides=2, activation="relu", kernel_initializer='he_uniform'),
            layers.Conv2D(64, kernel_size=3, strides=1, activation="relu", kernel_initializer='he_uniform'),
            layers.Flatten(),
            layers.Dense(512, activation="relu", kernel_initializer='he_uniform'),
            layers.Dense(num_actions, activation="linear")
        ])

    def get_action(self, state):
        """Pick an action epsilon-greedily for a single (unbatched) state.

        Returns a plain Python ``int`` on both the random and the greedy path.
        """
        if np.random.rand() <= self.epsilon:
            return int(np.random.randint(self.num_actions))
        # Add a leading batch axis; the model expects batched input.
        state_tensor = tf.convert_to_tensor(state[np.newaxis, ...], dtype=tf.float32)
        q_values = self.model(state_tensor, training=False)
        return int(tf.argmax(q_values[0]).numpy())

    @tf.function
    def train_step(self, states, actions, rewards, next_states, dones):
        """Run one gradient step on a batch of transitions and return the Huber loss.

        ``dones`` is used arithmetically as ``(1 - dones)``, so it must be a
        numeric array where 1 marks a terminal transition.
        """
        # Targets come from the frozen target network and sit outside the
        # tape, so no gradient flows through them.
        future_rewards = self.target_model(next_states, training=False)
        target_q_values = rewards + self.gamma * tf.reduce_max(future_rewards, axis=1) * (1 - dones)
        with tf.GradientTape() as tape:
            q_values = self.model(states, training=True)
            # Select the Q-value of the action that was actually taken.
            q_action = tf.reduce_sum(tf.one_hot(actions, self.num_actions) * q_values, axis=1)
            loss = self.loss_fn(target_q_values, q_action)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

def _make_env():
    """Build the preprocessed, 4-frame-stacked Space Invaders environment."""
    env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array")
    env = AtariPreprocessing(env, frame_skip=4, grayscale_obs=True, scale_obs=True)
    return FrameStackObservation(env, stack_size=4)


def _to_state(observation):
    """Convert a stacked (frames, H, W) observation to channels-last float32.

    No extra division by 255 is applied: the environment is built with
    ``scale_obs=True``, so the frames arrive already scaled.
    """
    return np.transpose(np.asarray(observation, dtype=np.float32), (1, 2, 0))


def _record_episode(agent, video_folder, episode_number, max_steps):
    """Play one episode with the agent's current policy and record it to video.

    A separate environment is used so that resetting and closing the recorder
    never touches the training environment. The episode number is put in the
    file prefix so successive recordings do not overwrite each other.
    """
    video_env = RecordVideo(
        _make_env(),
        video_folder=video_folder,
        episode_trigger=lambda ep: True,
        name_prefix=f"episode{episode_number}",
    )
    try:
        observation, _ = video_env.reset()
        video_state = _to_state(observation)
        for _ in range(max_steps):
            video_action = agent.get_action(video_state)
            observation, _, terminated, truncated, _ = video_env.step(video_action)
            video_state = _to_state(observation)
            if terminated or truncated:
                break
    finally:
        # Closing finalises the video file even if the rollout raised.
        video_env.close()


def main():
    """Train a DQN agent on Space Invaders.

    Writes average rewards to ``training_log.txt`` every 100 episodes, saves a
    model checkpoint and a recorded episode every 100 episodes, and saves the
    final model when training completes.
    """
    # Environment setup
    env = _make_env()
    video_folder = "videos"
    os.makedirs(video_folder, exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Agent setup
    state_shape = (84, 84, 4)
    num_actions = env.action_space.n
    agent = DQNAgent(state_shape, num_actions)

    # Training parameters
    max_episodes = 5000
    max_steps_per_episode = 10000
    update_target_every = 1000
    log_every = 100
    checkpoint_every = 100
    batch_size = 32
    epsilon_random_frames = 50000  # Number of frames to take random actions and observe output
    episode_rewards = []
    frame_count = 0

    # Start a fresh log file with a header row
    log_file = "training_log.txt"
    with open(log_file, "w") as f:
        f.write("Episode,Avg_Reward,Epsilon\n")

    try:
        for episode in range(max_episodes):
            observation, _ = env.reset()
            state = _to_state(observation)
            episode_reward = 0

            for step in range(max_steps_per_episode):
                frame_count += 1

                # Purely random actions during the initial exploration phase,
                # epsilon-greedy afterwards.
                if frame_count < epsilon_random_frames:
                    action = np.random.randint(num_actions)
                else:
                    action = agent.get_action(state)

                observation, reward, terminated, truncated, _ = env.step(action)
                next_state = _to_state(observation)

                # Store only true termination as "done": a time-limit
                # truncation should still bootstrap from the next state.
                agent.replay_buffer.add(state, action, reward, next_state, terminated)

                state = next_state
                episode_reward += reward

                # Train only once the buffer can supply a full batch
                if len(agent.replay_buffer) >= batch_size:
                    states, actions, rewards, next_states, dones = agent.replay_buffer.sample(batch_size)
                    agent.train_step(states, actions, rewards, next_states, dones)

                # Decay epsilon (only after the random exploration phase)
                if frame_count >= epsilon_random_frames:
                    agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

                # Update target network periodically
                if frame_count % update_target_every == 0:
                    agent.update_target_model()

                # Either signal ends the episode and requires a reset.
                if terminated or truncated:
                    break

            # Logging results
            episode_rewards.append(episode_reward)
            if (episode + 1) % log_every == 0:
                avg_reward = np.mean(episode_rewards[-log_every:])
                with open(log_file, "a") as f:
                    f.write(f"{episode + 1},{avg_reward:.2f},{agent.epsilon:.4f}\n")
                print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.2f}, Epsilon: {agent.epsilon:.4f}")

            # Save a checkpoint and a recorded episode periodically
            if (episode + 1) % checkpoint_every == 0:
                agent.model.save(f'models/episode{episode + 1}_space_invaders_model.keras')
                _record_episode(agent, video_folder, episode + 1, max_steps_per_episode)
    finally:
        # Release the emulator even when training is interrupted.
        env.close()

    # Save final model
    agent.model.save('models/final_space_invaders_model.keras')


if __name__ == "__main__":
    main()

KeyboardInterrupt: 