# 1. Create a custom gym environment

In [1]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import math
from enum import Enum

In [2]:
class Action(Enum):
    """Moves available to the player on the grid.

    The integer values are the action indices the environment receives in
    ``step`` (converted there with ``Action(action)``).
    """
    UP = 0      # row index decreases by one
    DOWN = 1    # row index increases by one
    LEFT = 2    # column index decreases by one
    RIGHT = 3   # column index increases by one

In [76]:
class GameEnv(Env):
    """Square grid world in which a player must reach a goal before time runs out.

    The board is a ``size x size`` int32 array where ``1`` marks the player,
    ``-1`` marks the goal and ``0`` is an empty cell.  Every step costs
    ``-0.1`` (including moves blocked by the board edge), reaching the goal
    pays ``20`` and ends the episode, and exhausting the step budget without
    reaching the goal ends the episode with ``-20``.
    """

    # (row, column) offset applied to the player position for each action.
    _MOVES = {
        Action.UP: (-1, 0),
        Action.DOWN: (1, 0),
        Action.LEFT: (0, -1),
        Action.RIGHT: (0, 1),
    }

    def __init__(self, size, game_length):
        """Create the environment and generate the first board.

        Args:
            size: Side length of the square board; must be at least 2.
            game_length: Number of steps allowed per episode.

        Raises:
            ValueError: If ``size`` is smaller than 2 (player and goal could
                not occupy different cells).
        """
        if size < 2:
            raise ValueError("size must be at least 2 so the player and goal occupy different cells")
        self.size = size
        self.GAME_LENGTH = game_length
        self.action_space = Discrete(4)
        self.observation_space = Box(low=-1, high=1, shape=(self.size, self.size), dtype=np.int32)
        self.state, self.player = self.createBoard()
        self.time_remaining = self.GAME_LENGTH

    def step(self, action):
        """Apply one move and return ``(state, reward, done, info)``.

        Args:
            action: Value of one of the ``Action`` members (0-3).

        Returns:
            A tuple of the board array (the same object that later steps
            mutate in place), the reward for this step, whether the episode
            ended, and an empty info dict.

        Raises:
            ValueError: If ``action`` is not a valid ``Action`` value.
        """
        # Validate before touching any state so a bad action does not consume a step.
        try:
            move = Action(action)
        except ValueError:
            raise ValueError(f"Invalid input to step function: {action!r}") from None

        self.time_remaining -= 1

        d_row, d_col = self._MOVES[move]
        new_pos = (self.player[0] + d_row, self.player[1] + d_col)
        in_bounds = 0 <= new_pos[0] < self.size and 0 <= new_pos[1] < self.size

        # Default outcome: an ordinary (or edge-blocked) move costs a small penalty.
        reward = -0.1
        done = False
        if in_bounds and self.movePlayer(new_pos) == -1:
            # Reaching the goal wins even on the last allowed step.
            reward = 20
            done = True
        elif self.time_remaining <= 0:
            # `<=` keeps the episode terminated if step() is called again after the end.
            reward = -20
            done = True

        # placeholder for required return value
        info = {}

        return self.state, reward, done, info

    def movePlayer(self, new_pos):
        """Move the player marker to ``new_pos`` and return what that cell held before."""
        new_space_val = self.state[new_pos]
        self.state[self.player] = 0
        self.state[new_pos] = 1
        self.player = new_pos
        return new_space_val

    def render(self, mode='human'):
        """Print the current board array."""
        print(self.state)

    def reset(self):
        """Generate a fresh board, restore the step budget and return the new board."""
        self.state, self.player = self.createBoard()
        self.time_remaining = self.GAME_LENGTH
        return self.state

    def createBoard(self):
        """Build a new board with a random player cell and a sufficiently distant goal.

        The goal is drawn uniformly from the cells whose Euclidean distance to
        the player is at least ``size / 2``.  When no cell is that far away
        (e.g. a 3x3 board with the player in the centre) the farthest cells
        are used instead, so board creation always terminates.

        Returns:
            A tuple ``(board, player_pos)`` where ``board`` is the int32 array
            and ``player_pos`` is the player's ``(row, column)``.
        """
        board = np.zeros((self.size, self.size), dtype=np.int32)
        player_pos = (np.random.randint(self.size), np.random.randint(self.size))

        cells = [
            (row, col)
            for row in range(self.size)
            for col in range(self.size)
            if (row, col) != player_pos
        ]
        distances = [
            math.sqrt((player_pos[0] - row)**2 + (player_pos[1] - col)**2)
            for row, col in cells
        ]
        # Cap the threshold at the largest reachable distance so at least one cell qualifies.
        min_distance = min(self.size / 2, max(distances))
        candidates = [cell for cell, dist in zip(cells, distances) if dist >= min_distance]
        goal_pos = candidates[np.random.randint(len(candidates))]

        board[player_pos] = 1
        board[goal_pos] = -1
        return board, player_pos

### Test

In [77]:
# Smoke test: play a batch of episodes with uniformly random actions and
# report the total reward collected in each one.
env = GameEnv(5, 20)
episodes = 15
for episode in range(1, episodes + 1):
    state, score, done = env.reset(), 0, False
    while True:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break

    print(f'Episode:{episode} Score:{score}')

Episode:1 Score:-21.900000000000002
Episode:2 Score:-21.900000000000002
Episode:3 Score:-21.900000000000002
Episode:4 Score:19.4
Episode:5 Score:-21.900000000000002
Episode:6 Score:-21.900000000000002
Episode:7 Score:-21.900000000000002
Episode:8 Score:-21.900000000000002
Episode:9 Score:-21.900000000000002
Episode:10 Score:19.0
Episode:11 Score:-21.900000000000002
Episode:12 Score:-21.900000000000002
Episode:13 Score:-21.900000000000002
Episode:14 Score:-21.900000000000002
Episode:15 Score:19.5


In [78]:
# Play a single random episode, printing the board after every step.
state = env.reset()
done = False
score = 0
while not done:
    action = env.action_space.sample()
    n_state, reward, done, info = env.step(action)
    score += reward
    env.render()

# This cell plays exactly one episode, so it reports only the score instead of
# reusing an `episode` counter left over from another cell's loop.
print(f'Score:{score}')

[[ 0  0  0  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  0  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  1  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  0  0  0]
 [ 0  0  1  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  0  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  0  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]
[[ 0  0  0  0  0]
 [ 0  0  1  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0 -1]
 [ 0  0  0  0  0]]

# 2. Create Deep Learning Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [85]:
# Discard a model left over from an earlier run so it is rebuilt from scratch.
# Guarded because on a fresh kernel no model exists yet and a bare `del model`
# would raise NameError.
if 'model' in globals():
    del model

In [51]:
def build_model(states, actions):
    """Build the fully connected network that maps an observation to one value per action.

    Args:
        states: Shape tuple of a single observation
            (e.g. ``env.observation_space.shape``); any rank is accepted.
        actions: Number of discrete actions; the output layer has one linear
            unit per action.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # The leading 1 is the observation-window axis (the agent's memory is
    # created with window_length=1); unpacking `states` avoids assuming a
    # two-dimensional observation.
    model.add(Flatten(input_shape=(1, *states)))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [86]:
# Size the network from the environment's observation shape and action count.
states, actions = env.observation_space.shape, env.action_space.n
model = build_model(states, actions)

In [87]:
# Print the layer stack and parameter counts of the model built above.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_8 (Flatten)          (None, 25)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 100)               2600      
_________________________________________________________________
dense_25 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_26 (Dense)             (None, 4)                 404       
Total params: 13,104
Trainable params: 13,104
Non-trainable params: 0
_________________________________________________________________


# 3. Build Agent

In [10]:
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
#from tf_agents.environments import tf_py_environment

In [58]:
def build_agent(model, actions):
    """Wrap ``model`` in a DQN agent.

    Args:
        model: Keras model producing one output per action.
        actions: Number of discrete actions the agent can choose from.

    Returns:
        An uncompiled ``DQNAgent`` using a Boltzmann exploration policy and a
        100000-entry sequential replay memory with a window length of 1.
    """
    return DQNAgent(
        model=model,
        nb_actions=actions,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=100000, window_length=1),
        nb_steps_warmup=100,
        target_model_update=1e-2,
    )

In [88]:
# Build the agent around the current model and train it on the grid world.
dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias in tf.keras.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Keep the returned History in a variable so the cell does not end by echoing its repr.
history = dqn.fit(env, nb_steps=1000000, visualize=False, verbose=1)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
624 episodes - episode_reward: -8.938 [-21.900, 19.800] - loss: 9.020 - mae: 5.249 - mean_q: 2.662

Interval 2 (10000 steps performed)
789 episodes - episode_reward: 3.017 [-21.900, 19.800] - loss: 12.698 - mae: 6.969 - mean_q: 7.306

Interval 3 (20000 steps performed)
887 episodes - episode_reward: 8.690 [-21.900, 19.800] - loss: 13.369 - mae: 8.319 - mean_q: 10.708

Interval 4 (30000 steps performed)
939 episodes - episode_reward: 11.538 [-21.900, 19.800] - loss: 14.473 - mae: 9.174 - mean_q: 13.594

Interval 5 (40000 steps performed)
971 episodes - episode_reward: 12.891 [-21.900, 19.800] - loss: 14.468 - mae: 9.976 - mean_q: 15.093

Interval 6 (50000 steps performed)
1002 episodes - episode_reward: 13.194 [-21.900, 19.800] - loss: 13.581 - mae: 10.427 - mean_q: 15.327

Interval 7 (60000 steps performed)
954 episodes - episode_reward: 12.846 [-21.900, 19.800] - loss: 12.779 - mae: 10.652 - mean_q: 15.344

Interval 8 (7000

873 episodes - episode_reward: 11.486 [-21.900, 19.800] - loss: 10.757 - mae: 12.477 - mean_q: 17.404

Interval 40 (390000 steps performed)
886 episodes - episode_reward: 12.019 [-21.900, 19.800] - loss: 10.713 - mae: 12.411 - mean_q: 17.287

Interval 41 (400000 steps performed)
892 episodes - episode_reward: 12.163 [-21.900, 19.800] - loss: 10.656 - mae: 12.380 - mean_q: 17.277

Interval 42 (410000 steps performed)
888 episodes - episode_reward: 12.442 [-21.900, 19.800] - loss: 10.650 - mae: 12.386 - mean_q: 17.257

Interval 43 (420000 steps performed)
905 episodes - episode_reward: 11.968 [-21.900, 19.800] - loss: 10.650 - mae: 12.384 - mean_q: 17.252

Interval 44 (430000 steps performed)
862 episodes - episode_reward: 11.097 [-21.900, 19.800] - loss: 10.701 - mae: 12.456 - mean_q: 17.378

Interval 45 (440000 steps performed)
872 episodes - episode_reward: 11.751 [-21.900, 19.800] - loss: 10.731 - mae: 12.441 - mean_q: 17.350

Interval 46 (450000 steps performed)
873 episodes - episo

898 episodes - episode_reward: 12.706 [-21.900, 19.800] - loss: 10.276 - mae: 12.625 - mean_q: 17.562

Interval 78 (770000 steps performed)
901 episodes - episode_reward: 12.331 [-21.900, 19.800] - loss: 10.452 - mae: 12.487 - mean_q: 17.378

Interval 79 (780000 steps performed)
889 episodes - episode_reward: 12.227 [-21.900, 19.800] - loss: 10.115 - mae: 12.499 - mean_q: 17.371

Interval 80 (790000 steps performed)
880 episodes - episode_reward: 11.464 [-21.900, 19.800] - loss: 10.173 - mae: 12.504 - mean_q: 17.383

Interval 81 (800000 steps performed)
912 episodes - episode_reward: 12.161 [-21.900, 19.800] - loss: 10.417 - mae: 12.434 - mean_q: 17.304

Interval 82 (810000 steps performed)
900 episodes - episode_reward: 12.189 [-21.900, 19.800] - loss: 10.351 - mae: 12.481 - mean_q: 17.381

Interval 83 (820000 steps performed)
929 episodes - episode_reward: 12.436 [-21.900, 19.800] - loss: 10.257 - mae: 12.482 - mean_q: 17.374

Interval 84 (830000 steps performed)
903 episodes - episo

<tensorflow.python.keras.callbacks.History at 0x1105c93d080>

In [90]:
# Evaluate the trained agent over 15 episodes and report the mean episode reward.
scores = dqn.test(env, nb_episodes=15, visualize=False)
episode_rewards = scores.history['episode_reward']
print(np.mean(episode_rewards))

Testing for 15 episodes ...
Episode 1: reward: -21.900, steps: 20
Episode 2: reward: -21.900, steps: 20
Episode 3: reward: -21.900, steps: 20
Episode 4: reward: -21.900, steps: 20
Episode 5: reward: -21.900, steps: 20
Episode 6: reward: -21.900, steps: 20
Episode 7: reward: -21.900, steps: 20
Episode 8: reward: -21.900, steps: 20
Episode 9: reward: -21.900, steps: 20
Episode 10: reward: -21.900, steps: 20
Episode 11: reward: -21.900, steps: 20
Episode 12: reward: -21.900, steps: 20
Episode 13: reward: -21.900, steps: 20
Episode 14: reward: -21.900, steps: 20
Episode 15: reward: -21.900, steps: 20
-21.899999999999995


# 4. Saving and Reloading Model

In [65]:
# Persist the agent's weights, replacing any file already at this path.
weights_path = 'saved_models/78_perc_success.h5f'
dqn.save_weights(weights_path, overwrite=True)

In [None]:
# Drop the in-memory model, agent and environment so the following cells
# demonstrate a rebuild from scratch.
del model, dqn, env

In [None]:
# Recreate the environment, network and agent before loading saved weights.
# GameEnv requires the board size and episode length; these match the values
# used when the environment was first created in this notebook.
env = GameEnv(5, 20)
actions = env.action_space.n
states = env.observation_space.shape
model = build_model(states, actions)
dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias in tf.keras.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [None]:
# Load the weights file that the save cell of this notebook writes; the path
# used previously ('dqn_weights.h5f') is never produced here.
dqn.load_weights('saved_models/78_perc_success.h5f')

In [None]:
# Evaluate the reloaded agent for 15 episodes; assigning the result to `_`
# keeps the returned object from being echoed as the cell's output.
_ = dqn.test(env, nb_episodes=15, visualize=False)