In [4]:
import numpy as np
import gymnasium 
import matplotlib.pyplot as plt
import ale_py
import pygame
import tensorflow as tf
import keras
from keras import Sequential

In [7]:
# Make the ALE (Atari) environments available to gymnasium, then create the
# Ms. Pac-Man environment with on-screen rendering.
gymnasium.register_envs(ale_py)
env = gymnasium.make("ALE/MsPacman-v5" , render_mode = "human")

# Policy network: three strided convolutions, one max-pool, two dense layers and
# one output per action. The input and output sizes are read from the
# environment instead of being hard-coded, so the output layer always has
# exactly one unit per action the environment accepts.
model = Sequential([
    tf.keras.layers.InputLayer(input_shape=env.observation_space.shape),

    tf.keras.layers.Conv2D(filters = 32 , kernel_size=(8,8) , strides = 4 , activation = "relu"),
    tf.keras.layers.Conv2D(filters = 64 , kernel_size=(4,4) , strides = 4 , activation = "relu"),
    tf.keras.layers.Conv2D(filters = 64 , kernel_size=(3,3) , strides = 4 , activation = "relu"),

    tf.keras.layers.MaxPooling2D(pool_size=(2,2)),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),

    # Output layer: 1 unit per possible action with sigmoid activation.
    # int(...) because action_space.n may be a NumPy integer.
    tf.keras.layers.Dense(units=int(env.action_space.n), activation='sigmoid')
])

def play_one_step(env, obs, reward, loss_fn, model):
    """Choose an action for ``obs``, apply it to ``env`` and return the gradients.

    The model is evaluated on the normalized observation. The action is the
    highest-scoring output when that score exceeds a uniform random draw,
    otherwise the lowest-scoring output (never index 0 as the fallback). The
    chosen action is then treated as the target and the loss between that
    one-hot target and the model output is differentiated with respect to the
    model's trainable variables.

    Args:
        env: Environment exposing ``step(action)`` and ``render()``.
        obs: Current observation array; it is divided by 255 before being fed
            to the model.
        reward: Unused; kept so existing callers do not have to change.
        loss_fn: Callable invoked as ``loss_fn(y_true, y_pred)``.
        model: Callable model exposing ``trainable_variables``.

    Returns:
        Tuple ``(obs, reward, done, truncated, info, grad)`` where the first
        five items come from ``env.step`` and ``grad`` is the list returned by
        ``tape.gradient`` for ``model.trainable_variables``.
    """
    normalized_obs = obs / 255.0  # Normalize the observation
    with tf.GradientTape() as tape:
        # Keep the forward-pass result as a tensor: the loss must be built
        # from it, otherwise the tape has nothing to differentiate through.
        yprob_tensor = model(normalized_obs[np.newaxis])
        # Flat NumPy copy, used only to pick the action.
        yprob = yprob_tensor.numpy().flatten()

        # Find indices of max and min probabilities
        max_index = np.argmax(yprob)
        min_index = np.argmin(yprob)
        if min_index == 0:
            # Index 0 is excluded as the fallback action: take the lowest
            # score among the remaining outputs instead.
            temp_arr = np.delete(yprob, 0)
            min_index = np.argmin(temp_arr) + 1

        # Follow the top-scoring action with probability equal to its score.
        action_bool = np.max(yprob) > np.random.uniform(0, 1)
        action = max_index if action_bool else min_index

        ytarget = np.zeros_like(yprob, dtype=np.float32)  # One-hot target vector
        ytarget[action] = 1

        # Loss arguments are (y_true, y_pred); the target gets a batch axis so
        # its shape matches the model output.
        loss = tf.reduce_mean(loss_fn(ytarget[np.newaxis], yprob_tensor))

    grad = tape.gradient(loss, model.trainable_variables)

    # Take the action in the environment
    obs, reward, done, truncated, info = env.step(action)

    # Render the environment (optional, for visualization)
    env.render()

    return obs, reward, done, truncated, info, grad


def play_multiple_episodes(env , max_episodes , max_steps , loss_fn , model):
    """Play ``max_episodes`` episodes and collect per-step rewards and gradients.

    Each episode starts with ``env.reset()`` and runs ``play_one_step`` until
    the step reports ``done`` or ``truncated``, or ``max_steps`` steps have been
    played.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one entry per episode,
        each entry being the list of that episode's step rewards / step
        gradients in the order they were produced.
    """
    def _run_episode():
        # Play a single episode and return its (rewards, gradients) lists.
        episode_rewards, episode_grads = [], []
        obs, _info = env.reset()
        reward = 0
        for _ in range(max_steps):
            obs, reward, done, truncated, _info, grad = play_one_step(
                env, obs, reward, loss_fn, model
            )
            episode_rewards.append(reward)
            episode_grads.append(grad)
            if done or truncated:
                break
        return episode_rewards, episode_grads

    reward_history, grad_history = [], []
    for _ in range(max_episodes):
        episode_rewards, episode_grads = _run_episode()
        reward_history.append(episode_rewards)
        grad_history.append(episode_grads)
    return reward_history, grad_history

def discount_rewards(rewards , discount_factor):
    """Return the discounted cumulative reward at every step of one episode.

    Walking backwards through ``rewards``, each entry becomes its own reward
    plus ``discount_factor`` times the value computed for the following step.
    The result is a float32 array with the same shape as ``rewards``.
    """
    returns = np.zeros_like(rewards , np.float32)
    running_total = 0.0
    position = len(rewards)
    while position > 0:
        position -= 1
        running_total = rewards[position] + running_total * discount_factor
        returns[position] = running_total
    return returns

def discount_and_normalize(all_rewards, discount_factor):
    """Discount every episode's rewards, then standardize them jointly.

    Each episode in ``all_rewards`` is passed through ``discount_rewards``.
    The mean and standard deviation are computed over all discounted values
    of all episodes together, and every episode is shifted and scaled by them.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_factor: Discount passed on to ``discount_rewards``.

    Returns:
        List with one array per episode, in the same order as ``all_rewards``.
        An empty ``all_rewards`` yields an empty list. When every discounted
        value is identical (standard deviation 0) the result is all zeros
        rather than NaN.
    """
    all_discount_rewards = [discount_rewards(rewards , discount_factor) for rewards in all_rewards]
    if not all_discount_rewards:
        # np.concatenate raises on an empty list; nothing to normalize anyway.
        return []
    flat_rewards = np.concatenate(all_discount_rewards)
    mean_rewards = flat_rewards.mean()
    std_rewards = flat_rewards.std()
    if std_rewards == 0:
        # All values equal: dividing would give 0/0 = NaN and poison every
        # gradient scaled by these values. Centering alone already yields 0.
        std_rewards = 1.0
    return [((discounted_rewards - mean_rewards)/std_rewards) for discounted_rewards in all_discount_rewards]

# Training configuration.
# NOTE: play_multiple_episodes keeps the gradients of every step of every
# episode until the update below, so memory use grows with
# max_episodes * max_steps.
iterations = 20
max_episodes = 1000
max_steps = 1000
discount_factor = 0.95

optimizer = tf.optimizers.Nadam(learning_rate = 0.01)
loss_fn = tf.losses.BinaryCrossentropy()
try:
    for iteration in range(iterations):
        all_rewards, all_grads = play_multiple_episodes(env , max_episodes , max_steps , loss_fn,model)
        all_final_rewards = discount_and_normalize(all_rewards , discount_factor)
        all_mean_grads = []
        for var_index in range(len(model.trainable_variables)):
            # Weight each step's gradient for this variable by that step's
            # normalized discounted reward, then average over all steps.
            mean_grads = tf.reduce_mean(
                [final_reward * all_grads[episode_index][step][var_index]
                 for episode_index , final_rewards in enumerate(all_final_rewards)
                 for step , final_reward in enumerate(final_rewards)] , axis = 0)
            all_mean_grads.append(mean_grads)
        # apply_gradients takes a single iterable of (gradient, variable) pairs.
        optimizer.apply_gradients(zip(all_mean_grads , model.trainable_variables))
finally:
    # Always shut pygame down, including when training is interrupted.
    pygame.quit()

KeyboardInterrupt: 

In [6]:
# Uninitialize pygame; presumably this closes the game window left open by
# render_mode="human" -- confirm.
pygame.quit()