In [1]:
import gym
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tqdm import tqdm

In [48]:
# env.step() returns a 5-tuple: (observation, reward, terminated, truncated, info)

In [49]:
def create_cnn(input_shape, num_actions):
    """Assemble the convolutional Q-value network.

    The stack is four ReLU convolutions, a flatten, a 512-unit ReLU dense
    layer, and a linear head with one output per action.

    Args:
        input_shape: Shape passed to the Keras ``Input`` layer (batch
            dimension excluded).
        num_actions: Width of the final linear layer, i.e. how many Q-values
            the network emits.

    Returns:
        An uncompiled ``models.Sequential`` instance.
    """
    # (filters, kernel_size, strides) for each convolution, in order.
    # (1, 1) is spelled out where the stride is left at the Conv2D default.
    conv_specs = (
        (32, (8, 8), (4, 4)),
        (64, (4, 4), (2, 2)),
        (64, (3, 3), (1, 1)),
        (128, (1, 1), (1, 1)),
    )

    stack = [layers.Input(shape=input_shape)]
    stack += [
        layers.Conv2D(filters, kernel, strides=stride, activation='relu')
        for filters, kernel, stride in conv_specs
    ]
    stack += [
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        # Linear head: raw Q-value estimates, one per action.
        layers.Dense(num_actions, activation='linear'),
    ]
    return models.Sequential(stack)


In [50]:
# No render_mode is requested here. To watch or capture frames, pass
# render_mode='human' or render_mode='rgb_array' to gym.make instead.
env = gym.make('ALE/Frogger-v5')

# The network input matches the observation shape; its output width matches
# the number of available actions.
input_shape = env.observation_space.shape
num_actions = env.action_space.n

model = create_cnn(input_shape, num_actions)
# Mean squared error on the Q-value difference, optimised with Adam.
model.compile(
    loss='mse',
    optimizer=optimizers.Adam(learning_rate=0.00025),
)


In [51]:
# Inspect what reset() hands back; element [0] of this return value is what
# the rollout loop further down uses as the initial state.
print(env.reset())

(array([[[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       ...,

       [[ 82, 126,  45],
        [ 82, 126,  45],
        [ 82, 126,  45],
        ...,
        [ 82, 126,  45],
        [ 82, 126,  45],
        [ 82, 126,  45]],

       [[ 82, 126,  45],
        [ 82, 126,  45],
        [ 82, 126,  45],
        ...,
        [ 82, 126,  45],
        [ 82, 126,  45],
        [ 82, 126,  45]],

       [[ 82, 126,  45],
        [ 82, 126,  45],
        [ 82, 126,  45],
        ...,
        [ 82, 126,  45],
        [ 82, 126,  45],
        [ 82, 126,  45]

In [52]:
def train_model(episodes, epsilon=0.1):
    """Roll out ``episodes`` epsilon-greedy episodes and print reward stats.

    Despite the name, no weight update is performed here: the module-level
    ``model`` is only queried for Q-values, and the module-level ``env`` is
    stepped until the episode is terminated or truncated.

    Args:
        episodes: Number of episodes to play.
        epsilon: Probability of taking a random action instead of the
            greedy one. Defaults to 0.1.

    Returns:
        None. Per-episode progress goes to a tqdm bar, and the average and
        best rewards are printed at the end.
    """
    # Total reward collected in each episode, in play order.
    episode_rewards = []

    with tqdm(total=episodes, unit='episode') as pbar:
        for e in range(episodes):
            # reset() returns a tuple; its first element is the observation.
            state = np.array(env.reset()[0])
            done = False
            total_reward = 0

            while not done:
                if np.random.rand() <= epsilon:
                    # Explore: uniformly random action.
                    action = env.action_space.sample()
                else:
                    # Exploit: add a batch axis, then pick the best Q-value.
                    q_values = model.predict(state[None, ...], verbose=0)
                    action = np.argmax(q_values[0])

                next_state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward

                # Advance to the new observation. Without this, every greedy
                # action would be predicted from the initial reset frame.
                state = np.array(next_state)
                done = terminated or truncated

            pbar.update(1)
            pbar.set_description(f"Episode: {e+1}, Reward: {total_reward}")

            episode_rewards.append(total_reward)

    # max() raises on an empty sequence, so skip the summary when nothing ran.
    if not episode_rewards:
        print("No episodes were run.")
        return

    print(f"Average Reward: {np.mean(episode_rewards)}")
    print(f"Best Reward: {max(episode_rewards)}")

# Run the rollout for the desired number of episodes. The finally clause
# releases the environment even if the run raises or is interrupted.
try:
    train_model(1)
finally:
    env.close()


Episode: 1, Reward: 5.0: 100%|██████████| 1/1 [01:11<00:00, 71.27s/episode]

Average Reward: 5.0
Best Reward: 5.0





In [53]:
# Second close: the training cell above already calls env.close().
# NOTE(review): presumably harmless to close twice — confirm, or drop this cell.
env.close()
