In [812]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.layers import Conv2D, ReLU, MaxPool2D, Flatten, Dense
import numpy as np
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box
import cv2
from PIL import Image
from PIL import ImageGrab
import os
from PIL import Image
import pyautogui
import time
from collections import deque
import random
import matplotlib.pyplot as plt
import math

# env

In [813]:
keys = ["z", "s", "q", "d", "e"]

In [814]:
import gym
from gym import spaces


class CustomEnv(Env):
    def __init__(self):
        super(CustomEnv, self).__init__()

        # Define action and observation space
        # They must be gym.spaces objects
        self.total_picked_up = 0
        self.action_space = Discrete(5)
        self.observation_space = Box(low=0, high=255, shape=(1080, 1920, 3), dtype=np.uint8)

    def step(self, action):
        observation = self.get_screen()
        
        pyautogui.press(keys[action])
  
        reward = 0
        if self.item_found(observation):
            reward = 1
            self.total_picked_up += 1
            
        info = {}
        
        done = self.total_picked_up >= 5

        return observation, reward, done, info

    def reset(self):
        self.total_picked_up = 0
        return self.total_picked_up

    def render(self, mode='human'):
        pass
    
    def get_screen(self):
        screen = pyautogui.screenshot()
        return np.array(screen)
    
    def item_found(self, observation):
        
        if len(observation.shape) == 3:
            observation = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Read the template
        template = cv2.imread("resources/template_matching/leafs.png", 0)

        # Perform template matching
        if observation.dtype != np.uint8:
            observation = observation.astype(np.uint8)
        if template.dtype != np.uint8:
            template = template.astype(np.uint8)

        res = cv2.matchTemplate(observation, template, cv2.TM_CCOEFF_NORMED)
        return np.where(res >= .8)

# model

In [815]:
# # Define loss and optimizer
# optimizer = tf.keras.optimizers.Adam()
# loss_fn = tf.keras.losses.MeanSquaredError()
# 
# # Number of episodes to play
# num_episodes = 500
# 
# 
# # Function to preprocess images
# def preprocess_image(image):
#     return tf.image.rgb_to_grayscale(image)
# 
# 
# for episode in range(num_episodes):
#     initial_state = env.reset()
# 
#     done = False
#     while not done:
#         action = env.action_space.sample()
#         next_state, reward, done, info = env.step(action)
# 
#         # next_state = preprocess_image(next_state)
# 
#         # training here


In [816]:
# while True:
#     # Capture screen
#     screen = pyautogui.screenshot()
#     # Convert the image into numpy array representation
#     frame = np.array(screen)
#     # Convert the BGR image into RGB image
#     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#     # Resize the capturing screen to 1080p
#     frame = cv2.resize(frame, (1920, 1080))
#     # Display screen in 1080p
#     cv2.imshow('Screen Capture in 1080p', frame)
# 
#     # Wait for the user to press the ESC key (ASCII 27) to quit capturing the screen
#     if cv2.waitKey(1) == 27:
#         break
# 
# cv2.destroyAllWindows()

# NN

In [817]:
# # Define the Convolutional Neural Network
# inputs = layers.Input(shape=states)
# x = layers.Conv2D(32, (8, 8), strides=4, activation='relu')(inputs)
# x = layers.Conv2D(64, (4, 4), strides=2, activation='relu')(x)
# x = layers.Conv2D(64, (3, 3), strides=1, activation='relu')(x)
# x = layers.Flatten()(x)
# x = layers.Dense(512, activation='relu')(x)
# outputs = layers.Dense(env.action_space.n, activation='linear')(x)
# 
# model = keras.Model(inputs=inputs, outputs=outputs)

In [818]:
# model.build(states)
# model.summary()

Create an ampty .keras file

# DQNN

Q-learning NN class

In [819]:
class DQN(Model):
    def __init__(self, input_shape, out_actions):
        super(DQN, self).__init__()

        inputs = layers.Input(shape=input_shape)
        x = layers.Conv2D(32, (8, 8), strides=4, activation='relu')(inputs)
        x = layers.Conv2D(64, (4, 4), strides=2, activation='relu')(x)
        x = layers.Conv2D(64, (3, 3), strides=1, activation='relu')(x)
        x = layers.Flatten()(x)
        output = layers.Dense(units=out_actions, activation='linear')(x)
        self.model = keras.Model(inputs=inputs, outputs=output)

        self.model.build(self.model)
        self.model.summary()

    def call(self, input_tensor):
        return self.model(input_tensor)

In [820]:
# Define memory for Experience Replay
class ReplayMemory():
    def __init__(self, maxlen):
        self.memory = deque([], maxlen=maxlen)

    def append(self, transition):
        self.memory.append(transition)

    def sample(self, sample_size):
        return random.sample(self.memory, sample_size)

    def __len__(self):
        return len(self.memory)

In [821]:
class FrozenLakeDQL():
    # Hyperparameters (adjustable)
    learning_rate_a = 0.001         # learning rate (alpha)
    discount_factor_g = 0.9         # discount rate (gamma)
    network_sync_rate = 10          # number of steps the agent takes before syncing the policy and target network
    replay_memory_size = 1000       # size of replay memory
    mini_batch_size = 32            # size of the training data set sampled from the replay memory

    # Neural Network
    loss_fn = keras.losses.MeanSquaredError()         # NN Loss function. MSE=Mean Squared Error can be swapped to something else.
    optimizer = None                # NN Optimizer. Initialize later.

    ACTIONS = ['z', 'q', 's', 'd', 'e']     # for printing 0,1,2,3,4 => Z(forwards), Q(left), S(backwards),D(right), E(action)

    # Train the FrozeLake environment
    def train(self, episodes):
        # Create FrozenLake instance
        env = CustomEnv()
        num_states = env.observation_space
        num_actions = env.action_space.n

        epsilon = 1 # 1 = 100% random actions
        memory = ReplayMemory(self.replay_memory_size)

        # Create policy and target network.
        policy_dqn = DQN(input_shape=(1920, 1080, 3), out_actions=num_actions)
        target_dqn = DQN(input_shape=(1920, 1080, 3), out_actions=num_actions)

        # Make the target and policy networks the same (copy weights/biases from one network to the other)
        target_dqn.set_weights(policy_dqn.get_weights())

        print('Policy (random, before training):')
        self.print_dqn(policy_dqn)

        # Policy network optimizer. "Adam" optimizer can be swapped to something else.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate_a)

        # List to keep track of rewards collected per episode. Initialize list to 0's.
        rewards_per_episode = np.zeros(episodes)

        # List to keep track of epsilon decay
        epsilon_history = []

        # Track number of steps taken. Used for syncing policy => target network.
        step_count=0

        for i in range(episodes):
            state = env.reset()  # Initialize to state 0
            terminated = False      # True when agent falls in hole or reached goal
            truncated = False       # True when agent takes more than 200 actions

            # Agent navigates map until it falls into hole/reaches goal (terminated), or has taken 200 actions (truncated).
            while(not terminated and not truncated):

                # Select action based on epsilon-greedy
                if random.random() < epsilon:
                    # select random action
                    action = env.action_space.sample() # actions: 0=left,1=down,2=right,3=up
                else:
                    # select best action
                    action = policy_dqn(self.state_to_dqn_input(state))

                # Execute action
                new_state,reward,terminated,truncated = env.step(action)

                # Save experience into memory
                memory.append((state, action, new_state, reward, terminated))

                # Move to the next state
                state = new_state

                # Increment step counter
                step_count+=1

            # Keep track of the rewards collected per episode.
            if reward == 1:
                rewards_per_episode[i] = 1

            # Check if enough experience has been collected and if at least 1 reward has been collected
            if len(memory)>self.mini_batch_size and np.sum(rewards_per_episode)>0:
                mini_batch = memory.sample(self.mini_batch_size)
                self.optimize(mini_batch, policy_dqn, target_dqn)

                # Decay epsilon
                epsilon = max(epsilon - 1/episodes, 0)
                epsilon_history.append(epsilon)

                # Copy policy network to target network after a certain number of steps
                if step_count > self.network_sync_rate:
                    target_dqn.set_weights(policy_dqn.get_weights)
                    step_count=0

        # Close environment
        env.close()

        # Save policy
        policy_dqn.save_weights("woodGathering_dql_cnn.keras")

        # Create new graph
        plt.figure(1)

        # Plot average rewards (Y-axis) vs episodes (X-axis)
        sum_rewards = np.zeros(episodes)
        for x in range(episodes):
            sum_rewards[x] = np.sum(rewards_per_episode[max(0, x-100):(x+1)])
        plt.subplot(121) # plot on a 1 row x 2 col grid, at cell 1
        plt.plot(sum_rewards)

        # Plot epsilon decay (Y-axis) vs episodes (X-axis)
        plt.subplot(122) # plot on a 1 row x 2 col grid, at cell 2
        plt.plot(epsilon_history)

        # Save plots
        plt.savefig('woodGathering_dql_cnn.png')

    # Optimize policy network
    def optimize(self, mini_batch, policy_dqn, target_dqn):

        current_q_list = []
        target_q_list = []

        for state, action, new_state, reward, terminated in mini_batch:

            if terminated:
                # Agent either reached goal (reward=1) or fell into hole (reward=0)
                # When in a terminated state, target q value should be set to the reward.
                target = np.array([reward])
            else:
                # Calculate target q value
                target = np.array(
                    reward + self.discount_factor_g * tf.reduce_max(target_dqn(self.state_to_dqn_input(new_state)))
                )

            # Get the current set of Q values
            current_q = policy_dqn(self.state_to_dqn_input(state))
            current_q_list.append(current_q)

            # Get the target set of Q values
            target_q = target_dqn(self.state_to_dqn_input(state))

            # Adjust the specific action to the target that was just calculated. 
            # Target_q[batch][action], hardcode batch to 0 because there is only 1 batch.
            target_q_var = tf.Variable(target_q.numpy())
            target_q_var[0, action].assign(target)

        # Compute loss for the whole minibatch
        loss = self.loss_fn(tf.stack(current_q_list), tf.stack(target_q_list))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def state_to_dqn_input(self, state):
        image = tf.zeros((1920, 1080, 3), dtype=tf.uint8)
        image = np.reshape(image, (1920, 1080, 3))
        
        return np.array([image])

    # Run the FrozeLake environment with the learned policy
    def test(self, episodes):
        # Create FrozenLake instance
        env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=is_slippery, render_mode='human')
        num_states = env.observation_space.n
        num_actions = env.action_space.n

        # Load learned policy
        policy_dqn = DQN(input_shape=3, out_actions=num_actions)
        policy_dqn.load_weights("woodGathering_dql_cnn.keras")
        policy_dqn.eval()    # switch model to evaluation mode

        print('Policy (trained):')
        self.print_dqn(policy_dqn)

        for i in range(episodes):
            state = env.reset()  # Initialize to state 0
            terminated = False      # True when agent falls in hole or reached goal
            truncated = False       # True when agent takes more than 200 actions

            # Agent navigates map until it falls into a hole (terminated), reaches goal (terminated), or has taken 200 actions (truncated).
            while(not terminated and not truncated):
                # Select best action
                action = tf.argmax(policy_dqn(self.state_to_dqn_input(state)), axis=-1).numpy()

                # Execute action
                state,reward,terminated,truncated,_ = env.step(action)

        env.close()

    # Print DQN: state, best action, q values
    def print_dqn(self, dqn):
        # Loop each state and print policy to console
        for s in range(16):
            #  Format q values for printing
            q_values = ''
            for q in dqn(self.state_to_dqn_input(s))[0]:
                q_values += "{:+.2f}".format(q)+' '  # Concatenate q values, format to 2 decimals
            q_values=q_values.rstrip()              # Remove space at the end

            # Map the best action to L D R U
            print(f'x {np.argmax(dqn(self.state_to_dqn_input(s)))}')
            best_action = self.ACTIONS[np.argmax(dqn(self.state_to_dqn_input(s)))]

            # Print policy in the format of: state, action, q values
            # The printed layout matches the FrozenLake map.
            print(f'{s:02},{best_action},[{q_values}]', end=' ')
            if (s+1)%4==0:
                print() # Print a newline every 4 states

In [822]:
if __name__ == '__main__':

    frozen_lake = FrozenLakeDQL()
    is_slippery = False
    frozen_lake.train(1000)
    frozen_lake.test(10)

Policy (random, before training):
x 0
00,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
01,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
02,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
03,z,[+0.00 +0.00 +0.00 +0.00 +0.00] 
x 0
04,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
05,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
06,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
07,z,[+0.00 +0.00 +0.00 +0.00 +0.00] 
x 0
08,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
09,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
10,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
11,z,[+0.00 +0.00 +0.00 +0.00 +0.00] 
x 0
12,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
13,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
14,z,[+0.00 +0.00 +0.00 +0.00 +0.00] x 0
15,z,[+0.00 +0.00 +0.00 +0.00 +0.00] 


InvalidArgumentError: {{function_node __wrapped__Sub_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [32,1,5] vs. [0] [Op:Sub] name: 