In [3]:
import gymnasium as gym
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tqdm import tqdm



In [4]:
# Hyperparameters
learning_rate = 0.00025
epsilon = 0.1  # Exploration rate
episodes = 1  # Number of training episodes
optimizer = optimizers.Adam(learning_rate=learning_rate)
loss_function = 'mse'  # Mean Squared Error loss for Q-value difference


In [7]:
def create_cnn(input_shape, num_actions):
    model = models.Sequential([
layers.Input(shape=input_shape),
        layers.Conv2D(64, (8, 8), strides=(4, 4), activation='relu'),
        layers.Conv2D(128, (8, 8), strides=(4, 4), activation='relu'),
        layers.Conv2D(128, (5, 5), strides=(2, 2), activation='relu'),
        layers.Conv2D(128, (5, 5), strides=(2, 2), activation='relu'),
        layers.Conv2D(128, (3, 3), strides=(1, 1), activation='relu'),
        layers.Conv2D(128, (3, 3), strides=(1, 1), activation='relu'),
        layers.Conv2D(128, (3, 3), strides=(1, 1), activation='relu'),
        layers.Conv2D(128, (3, 3), strides=(1, 1), activation='relu'),
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dense(5, activation='linear')  # Output layer for Q-values
    ])
    return model


In [8]:
env = gym.make('ALE/Frogger-v5', render_mode='rgb_array')
#env = gym.make('ALE/Frogger-v5', )
input_shape = env.observation_space.shape  # This should match the frame size
num_actions = env.action_space.n  # Number of possible actions

model = create_cnn(input_shape, num_actions)
model.compile(optimizer=optimizers.Adam(learning_rate=0.00025), loss='mse')  # Mean Squared Error loss for Q-value difference


ValueError: Computed output size would be negative. Received `inputs shape=(None, 4, 2, 128)`, `kernel shape=(5, 5, 128, 128)`, `dilation_rate=[1 1]`.

In [None]:
def train_model(episodes):
    # Initialize list to keep track of total rewards for each episode
    episode_rewards = []

    # Set up tqdm progress bar
    with tqdm(total=episodes, unit='episode') as pbar:
        for e in range(episodes):
            state = env.reset()[0]
            state = np.array(state)
            done = False
            total_reward = 0

            while not done:
                # Randomly choose an action or the best predicted action
                if np.random.rand() <= 0.1:  # Exploration rate
                    action = env.action_space.sample()
                else:
                    q_values = model.predict(state[None, ...], verbose=0)
                    action = np.argmax(q_values[0])

                next_state, reward, terminated, truncated, info = env.step(action)
                next_state = np.array(next_state)
                total_reward += reward
                
                if terminated or truncated:
                    done = True

            # Update progress bar
            pbar.update(1)
            pbar.set_description(f"Episode: {e+1}, Reward: {total_reward}")

            # Append the total reward to the rewards list
            episode_rewards.append(total_reward)

    # Print overall training results
    print(f"Average Reward: {np.mean(episode_rewards)}")
    print(f"Best Reward: {max(episode_rewards)}")

# Call the training function with the desired number of episodes
train_model(1)
env.close()