In [1]:
import warnings
warnings.simplefilter('ignore')
import tensorflow as tf
# Ask TensorFlow to grow its GPU memory allocation on demand, and register the
# resulting session with the compat.v1 Keras backend.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)

# `tf.test.is_gpu_available()` is deprecated (it prints an "Instructions for
# updating" notice); listing the physical GPU devices is the suggested replacement.
print("Is GPU available?", bool(tf.config.list_physical_devices('GPU')))
print("TF version:", tf.__version__)
print("Keras version:", tf.keras.__version__)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available? True
TF version: 2.3.1
Keras version: 2.4.0


In [2]:
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Activation, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam

# Make channels-first the Keras-wide default image layout, matching the explicit
# data_format='channels_first' passed to the Conv2D/Flatten layers defined below.
tf.keras.backend.set_image_data_format('channels_first')

In [3]:
class NN(Model):
    """Dueling Q-network fed with a pair ``(state, vel)``.

    ``state`` goes through three channels-first convolutions and is flattened;
    ``vel`` goes through a small dense layer. Both are concatenated, passed
    through two dense layers, and then split into a scalar value head ``V`` and
    a per-action advantage head ``A``.

    Args:
        n_actions: Number of outputs of the advantage head.
        input_shape: Accepted for interface compatibility; it is not used here
            because the layers receive no explicit input shape.
    """

    def __init__(self, n_actions, input_shape):
        super(NN, self).__init__()

        # Dense layer applied to the velocity input. (The original also bound
        # this layer to `self.dense1`, which was overwritten a few lines later;
        # the alias is dropped so each attribute names exactly one layer.)
        self.vel1 = Dense(2, activation='relu')

        self.conv1 = Conv2D(filters=8, kernel_size=2, activation='relu', data_format='channels_first')
        self.conv2 = Conv2D(filters=16, kernel_size=2, activation='relu', data_format='channels_first')
        self.conv3 = Conv2D(filters=16, kernel_size=2, activation='relu', data_format='channels_first')
        self.flatten = Flatten(data_format='channels_first')
        self.dense1 = Dense(128, activation='relu')
        self.dense2 = Dense(128, activation='relu')
        self.V = Dense(1, activation=None)
        self.A = Dense(n_actions, activation=None)

    def _trunk(self, data):
        """Return the shared hidden features computed from ``(state, vel)``."""
        state, vel = data

        y = self.vel1(vel)

        x = self.conv1(state)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.flatten(x)
        # Join image features and velocity features along the feature axis.
        x = tf.concat([x, y], axis=1)
        x = self.dense1(x)
        return self.dense2(x)

    def call(self, data):
        """Return Q-values: ``V + (A - mean(A))`` with the mean taken over actions."""
        x = self._trunk(data)
        V = self.V(x)
        A = self.A(x)

        # Subtracting the per-sample mean advantage centres A around zero
        # before it is added to the value estimate.
        Q = (V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True)))

        return Q

    def advantage(self, data):
        """Return only the advantage head ``A`` for ``(state, vel)``.

        Q differs from A by a per-sample offset only, so the argmax over
        actions is the same for both.
        """
        return self.A(self._trunk(data))

In [4]:
class Agent():
    """Double-DQN agent with replay memory and epsilon-greedy exploration.

    ``Q_eval`` is the online network that is trained; ``Q_next`` is the target
    network whose weights are overwritten with those of ``Q_eval`` every
    ``replace_freq`` training steps.

    Args:
        input_shape: Forwarded to the networks and kept on the instance.
        n_actions: Number of discrete actions.
        mem_size: Maximum number of transitions kept in the replay memory.
        eps: Initial exploration probability.
        eps_min: Lower bound for the exploration probability.
        eps_dec: Amount subtracted from ``eps`` after every training step.
        gamma: Discount factor.
        q_eval_name: File name used by ``save`` for the online network weights.
        q_next_name: Stored on the instance; not used by the methods below.
        replace_freq: Number of training steps between target-network syncs.
        lr: Learning rate of the Adam optimizers.
    """

    # Divisor applied to both velocity components before they reach the networks.
    VEL_SCALE = 15.0

    def __init__(self, input_shape, n_actions, mem_size, eps, eps_min, eps_dec, gamma, q_eval_name, q_next_name, 
                 replace_freq, lr=0.0005):
        self.Q_eval = NN(n_actions, input_shape)
        self.Q_next = NN(n_actions, input_shape)
        # `learning_rate` is the canonical keyword; `lr` is only a legacy alias.
        self.Q_eval.compile(optimizer=Adam(learning_rate=lr), loss='mse')
        self.Q_next.compile(optimizer=Adam(learning_rate=lr), loss='mse')
        self.memory = deque(maxlen=mem_size)
        self.eps = eps
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.gamma = gamma
        self.replace = replace_freq
        self.action_space = [i for i in range(n_actions)]
        self.steps = 0
        self.input_shape = input_shape
        self.q_eval_name = q_eval_name
        self.q_next_name = q_next_name

    def _normalize_vel(self, vel):
        """Return a scaled float32 copy of ``vel`` with shape (1, 2).

        The caller's object is never modified. Scaling in place (as the
        original code did) altered the caller's velocity and scaled it twice
        whenever the same object was passed to both ``choose_action`` and
        ``store``.
        """
        scaled = np.asarray(vel, dtype=np.float64) / self.VEL_SCALE
        return scaled.astype(np.float32).reshape(1, 2)

    def store(self, state, action, reward, n_state, done, vel):
        """Append one transition to the replay memory.

        ``state`` and ``n_state`` get a leading batch axis of size 1; ``vel``
        is stored as a normalized (1, 2) copy.
        """
        pack = [np.expand_dims(state, axis=0), action, reward, np.expand_dims(n_state, axis=0), 
                done, self._normalize_vel(vel)]
        self.memory.append(pack)

    def take_data(self, batch_size):
        """Sample ``batch_size`` transitions without replacement.

        Returns:
            Six lists of length ``batch_size``:
            ``(states, actions, rewards, n_states, dones, vels)``.
        """
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, n_states, dones, vels = (
            [transition[field] for transition in batch] for field in range(6)
        )
        return states, actions, rewards, n_states, dones, vels

    def choose_action(self, state, vel):
        """Pick an action epsilon-greedily for a single observation.

        With probability ``eps`` a uniformly random action is returned;
        otherwise the action with the largest advantage under ``Q_eval``.
        ``vel`` is not modified.
        """
        if np.random.random() > self.eps:
            batch_state = np.expand_dims(state, axis=0)
            advantages = self.Q_eval.advantage([batch_state, self._normalize_vel(vel)])
            return np.argmax(advantages)
        return np.random.choice(self.action_space)

    def decay_eps(self): 
        """Lower ``eps`` by ``eps_dec`` without ever going below ``eps_min``."""
        self.eps = max(self.eps - self.eps_dec, self.eps_min)

    def replace_weights(self):
        """Copy the online weights into the target network every ``replace`` steps."""
        if not (self.steps % self.replace):
            self.Q_next.set_weights(self.Q_eval.get_weights())

    def upgrade(self, batch_size=64):
        """Run one double-DQN training step on a sampled mini-batch.

        Nothing happens until the memory holds at least ``4 * batch_size``
        transitions. Afterwards ``eps`` is decayed and ``steps`` incremented.
        """
        if batch_size < 1 or len(self.memory) < 4*batch_size:
            return

        states, actions, rewards, n_states, dones, vels = self.take_data(batch_size)

        # Every stored array already has a leading batch axis of size 1, so
        # concatenation yields (batch_size, ...) without hard-coding the
        # observation shape.
        states = np.concatenate(states, axis=0)
        n_states = np.concatenate(n_states, axis=0)
        vels = np.concatenate(vels, axis=0)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)
        rows = np.arange(batch_size)

        # NOTE: only one velocity is stored per transition, so it is paired
        # with both the state and the next state.
        next_inputs = [n_states, vels]

        # The online network selects the next action. Going through the full
        # forward pass also creates every variable of Q_eval, including the
        # value head that `advantage()` never touches.
        best_next = np.argmax(self.Q_eval(next_inputs).numpy(), axis=1)

        if self.steps % self.replace == 0:
            # A subclassed Keras model creates its variables on first call;
            # make sure the target network has them before weights are copied,
            # otherwise the very first sync copies nothing (or fails on a
            # weight-count mismatch).
            self.Q_next(next_inputs)
        self.replace_weights()

        # The target network evaluates the action chosen by the online network.
        q_next = self.Q_next(next_inputs).numpy()[rows, best_next]

        # Start from the current predictions so that only the taken action
        # contributes to the MSE loss.
        q_target = self.Q_eval([states, vels]).numpy().copy()
        q_target[rows, actions] = rewards + self.gamma*q_next*not_done

        self.Q_eval.train_on_batch([states, vels], q_target)

        self.decay_eps()
        self.steps += 1

    def save(self):
        """Write the online network's weights to ``q_eval_name``."""
        self.Q_eval.save_weights(self.q_eval_name)
        

In [5]:
# Project-local environment. The cells below use the following parts of it:
# reset() -> state, step(action) -> (n_state, reward, done), score,
# body_parts[0].vel, observation_space_shape and action_space_n.
from MySnakeEnv import Game

env = Game()

In [6]:
# Build the agent; observation shape and action count come from the environment.
agent = Agent(
    input_shape=env.observation_space_shape,
    n_actions=env.action_space_n,
    mem_size=10000,
    eps=1.0,
    eps_min=0.001,
    eps_dec=0.001,
    gamma=0.99,
    q_eval_name='Q_eval.h5',
    q_next_name='Q_next.h5',
    replace_freq=400,
)

In [None]:
n_episodes = 1000
scores = []        # final score of every episode
mean_scores = []   # running mean over (up to) the last 50 episode scores

for i in range(n_episodes):
    done = False
    state = env.reset()

    # The original nested a second loop that broke out every 100 steps, but the
    # outer `while not done` re-entered it immediately, so an episode always ran
    # until `done`. A single loop expresses the same behaviour.
    while not done:
        # Snapshot the head velocity before stepping, and give each agent call
        # its own copy: in-place scaling inside the agent can then neither
        # alter the environment's velocity nor leak from one call to the other.
        vel = np.array(env.body_parts[0].vel, dtype=float)
        action = agent.choose_action(state, vel.copy())
        n_state, reward, done = env.step(action)
        agent.store(state, action, reward, n_state, done, vel.copy())
        agent.upgrade()
        state = n_state
        #env.render()

    scores.append(env.score)
    mean = np.mean(scores[-50:])
    mean_scores.append(mean)

    print('Episode', i, 'Reward', env.score, 'AVG', mean)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [None]:
# Explicit Figure/Axes interface; the closing plt.show() keeps the cell from
# echoing the repr of the last call as its output.
fig, ax = plt.subplots()
ax.plot(mean_scores)
ax.grid()
ax.set_xlabel('Episodes')
ax.set_ylabel('AVG score')
ax.set_title('Mean score over the last 50 episodes')
plt.show()