In [1]:
### Ignore warnings and test if some gpu is available
import warnings
# Silence every Python warning from here on; this runs before TensorFlow is imported.
warnings.simplefilter('ignore')
import tensorflow as tf
# Last expression of the cell, so its boolean result is shown as the cell output.
tf.test.is_gpu_available()

True

In [2]:
import cv2
import gym
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam

In [3]:
class Memory():
    """Fixed-size circular replay buffer of transitions.

    Each stored entry is the list
    ``[state, action, reward, next_state, done]`` where ``state`` and
    ``next_state`` have been given an extra leading axis with
    ``np.expand_dims``. Once ``mem_size`` entries have been written, new
    entries overwrite the oldest ones.

    Attributes:
        mem_size: capacity of the buffer.
        memory: list of length ``mem_size``; unused slots hold ``None``.
        mem_pointer: index of the slot the next ``store`` call writes to.
        mem_count: number of slots currently holding a transition.
    """

    # Shape every sampled state / next state is reshaped to before it is returned.
    SAMPLE_SHAPE = (4, 1, 84, 84)

    def __init__(self, mem_size):
        self.mem_size = mem_size
        self.memory = [None] * self.mem_size
        self.mem_pointer = 0
        self.mem_count = 0

    def store(self, state, action, reward, n_state, done):
        """Write one transition at the current pointer and advance it (wrapping around)."""
        pack = [np.expand_dims(state, axis=0), action, reward, np.expand_dims(n_state, axis=0), done]
        self.memory[self.mem_pointer] = pack
        self.mem_pointer = (self.mem_pointer + 1) % self.mem_size
        self.mem_count = min(self.mem_count + 1, self.mem_size)

    def take_data(self, batch_size):
        """Sample ``batch_size`` distinct stored transitions uniformly at random.

        Only slots that actually hold a transition are candidates, so the
        method is safe to call before the buffer is full.

        Args:
            batch_size: number of transitions to draw.

        Returns:
            Five parallel lists ``(states, actions, rewards, n_states, dones)``;
            every state / next state is reshaped to ``SAMPLE_SHAPE``.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        states, actions, rewards, n_states, dones = [], [], [], [], []
        if batch_size <= 0:
            return states, actions, rewards, n_states, dones
        if batch_size > self.mem_count:
            raise ValueError(
                'cannot sample %d transitions: only %d stored' % (batch_size, self.mem_count)
            )

        # Filled slots are always the first `mem_count` ones: the pointer only
        # wraps around after every slot has been written once.
        indices = np.random.choice(self.mem_count, size=batch_size, replace=False)
        for i in indices:
            state, action, reward, n_state, done = self.memory[i]
            states.append(np.reshape(state, self.SAMPLE_SHAPE))
            actions.append(action)
            rewards.append(reward)
            n_states.append(np.reshape(n_state, self.SAMPLE_SHAPE))
            dones.append(done)

        return states, actions, rewards, n_states, dones

In [4]:
class NeuralNet():
    """Thin wrapper around a Keras ``Sequential`` convolutional Q-network.

    The network is three channels-first ``Conv2D`` layers followed by two
    hidden ``Dense`` layers and a linear output layer with one unit per
    action. It is compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        name: file path passed to ``Sequential.save`` by ``save()``.
        input_shape: channels-first shape of a single input sample.
        n_actions: number of output units.
        lr: learning rate handed to the Adam optimizer.
    """

    def __init__(self, name, input_shape, n_actions, lr=0.0001):
        self.name = name
        self.lr = lr
        self.n_actions = n_actions
        self.input_shape = input_shape
        self._build()

    def _build(self):
        """Create, compile and print a summary of the Keras model (stored in ``self.NN``)."""
        self.NN = Sequential()

        self.NN.add(Conv2D(32, 4, 2, data_format='channels_first', activation='relu', input_shape=(*self.input_shape, )))
        self.NN.add(Conv2D(64, 3, 2, data_format='channels_first', activation='relu'))
        self.NN.add(Conv2D(64, 2, 2, data_format='channels_first', activation='relu'))
        self.NN.add(Flatten(data_format='channels_first'))
        self.NN.add(Dense(units=512, activation='relu'))
        self.NN.add(Dense(units=256, activation='relu'))
        # No activation on the output layer: it emits one raw value per action.
        self.NN.add(Dense(units=self.n_actions))

        # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
        self.NN.compile(optimizer=Adam(learning_rate=self.lr), loss='mse')

        self.NN.summary()

    def train(self, states, target, batch_size=32):
        """Run one silent, unshuffled epoch of ``fit`` on ``states`` against ``target``."""
        self.NN.fit(x=states, y=target, batch_size=batch_size, verbose=0, epochs=1, shuffle=False)

    def predict(self, state):
        """Return the model's output for ``state`` (whatever ``Sequential.predict`` returns)."""
        return self.NN.predict(state)

    def save(self):
        """Save the model to the path given as ``name`` at construction time."""
        self.NN.save(self.name)

In [5]:
class Agent():
    """Epsilon-greedy DQN agent with an online network, a target network and replay memory.

    Args:
        input_shape: per-sample input shape forwarded to both networks.
        n_actions: number of discrete actions.
        mem_size: capacity of the replay memory.
        eps: initial exploration rate.
        eps_min: lower bound for the exploration rate.
        eps_dec: amount subtracted from ``eps`` after every training call.
        gamma: discount factor applied to the bootstrapped next-state value.
        q_eval_name: save path of the online network (``Q_eval``).
        q_next_name: save path of the target network (``Q_next``).
        replace_freq: the target network is synchronised with the online
            network whenever ``steps`` is a multiple of this value.

    ``steps`` is not advanced by the agent itself; the caller increments it.
    """

    def __init__(self, input_shape, n_actions, mem_size, eps, eps_min, eps_dec, gamma, q_eval_name, q_next_name, replace_freq):
        self.Q_eval = NeuralNet(q_eval_name, input_shape, n_actions)
        self.Q_next = NeuralNet(q_next_name, input_shape, n_actions)
        self.memory = Memory(mem_size)
        self.eps = eps
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.gamma = gamma
        self.replace = replace_freq
        self.action_space = [i for i in range(n_actions)]
        self.steps = 0
        self.q_eval_name = q_eval_name
        self.q_next_name = q_next_name

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: observation with an extra leading axis; ``state[0]`` is
                what gets fed to the network.

        Returns:
            A random element of ``action_space`` with probability ``eps``,
            otherwise the argmax of the first output row of ``Q_next``.
        """
        if np.random.random() > self.eps:
            # NOTE(review): the greedy action comes from the target network
            # (Q_next), not the online one — confirm this is intended.
            return np.argmax(self.Q_next.predict(state[0])[0])
        return np.random.choice(self.action_space)

    def decay_eps(self):
        """Subtract ``eps_dec`` from ``eps``, never letting it drop below ``eps_min``."""
        self.eps = max(self.eps - self.eps_dec, self.eps_min)

    def replace_weights(self):
        """Copy the online network's weights into the target network every ``replace`` steps."""
        if not (self.steps % self.replace):
            self.Q_next.NN.set_weights(self.Q_eval.NN.get_weights())

    def upgrade(self, batch_size=32):
        """Run one training pass over a sampled batch, then decay epsilon.

        Nothing happens until the last slot of the replay memory has been
        filled. Each sampled transition is trained on individually: the
        target for the taken action is ``reward + gamma * max Q_next(next)``,
        with the bootstrap term dropped for terminal transitions.
        """
        if self.memory.memory[-1] is None:
            return

        self.replace_weights()
        states, actions, rewards, n_states, dones = self.memory.take_data(batch_size)
        for state, action, reward, n_state, done in zip(states, actions, rewards, n_states, dones):
            # Use the sampled transition's own terminal flag, not a variable
            # that happens to exist in the surrounding notebook namespace.
            if done:
                q_next = 0.0
            else:
                q_next = np.amax(self.Q_next.predict(n_state)[0])
            q_target = reward + self.gamma * q_next

            y = self.Q_eval.predict(state)[0]
            y[action] = q_target
            # The sample is fed as a batch of frames, so the same target row
            # is repeated once per frame.
            y = np.tile(y, (len(state), 1))
            self.Q_eval.train(state, y)

        self.decay_eps()

    def save(self):
        """Save both networks to their configured paths."""
        self.Q_eval.save()
        self.Q_next.save()

In [None]:
def preprocess_frame(frame):
    """Turn a raw environment frame into a (1, 84, 84) grayscale array."""
    # NOTE(review): gym frames are presumably RGB, so BGR2GRAY swaps the red/blue
    # weights; kept as-is so inputs stay consistent with already-trained models.
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, (84, 84))
    return np.reshape(gray, (1, 84, 84))


FRAME_SKIP = 4  # each chosen action is repeated for this many environment steps

env = gym.make('Breakout-v0')
n_games = 250
agent = Agent((1, 84, 84), 4, 5000, 1.0, 0.01, 0.0005, 0.99, 'Q_eval.h5', 'Q_next.h5', 800)
best_score = -1000
scores = []
means = []
eps = []
for i in range(n_games):
    done = False
    first_frame = preprocess_frame(env.reset())
    # The initial state is the first frame repeated FRAME_SKIP times.
    state = [first_frame for _ in range(FRAME_SKIP)]
    score = 0
    while not done:
        action = agent.choose_action(np.expand_dims(state, axis=0))
        n_state, reward = [], 0
        for _ in range(FRAME_SKIP):
            frame, step_reward, done, _info = env.step(action)
            n_state.append(preprocess_frame(frame))
            reward += step_reward
            if done:
                # Do not keep stepping an environment that has already terminated.
                break
        # Pad a truncated frame stack with its last frame so the shape stays fixed.
        while len(n_state) < FRAME_SKIP:
            n_state.append(n_state[-1])
        score += reward
        agent.steps += 1
        agent.memory.store(state, action, reward, n_state, int(done))
        agent.upgrade()
        state = n_state
    scores.append(score)
    mean = np.mean(scores[-20:])
    means.append(mean)
    eps.append(agent.eps)
    # Checkpoint whenever the 20-episode running mean improves, so the saved
    # model files exist for later cells that load them.
    if mean > best_score:
        best_score = mean
        agent.save()
    print('episode: ', i+1, '   score: ', score, '   eps:  %.2f' %agent.eps)
env.close()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 32, 41, 41)        544       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 20, 20)        18496     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 10, 10)        16448     
_________________________________________________________________
flatten (Flatten)            (None, 6400)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               3277312   
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
____

In [None]:
# Score obtained in each training episode.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set(title='Score per episode', xlabel='Episode', ylabel='Score')
ax.grid()
plt.show()

In [None]:
# Running mean of the score over the most recent 20 episodes.
fig, ax = plt.subplots()
ax.plot(means)
ax.set(title='Mean score over the last 20 episodes', xlabel='Episode', ylabel='Mean score')
ax.grid()
plt.show()

In [None]:
# Exploration rate recorded at the end of each episode.
fig, ax = plt.subplots()
ax.plot(eps)
ax.set(title='Epsilon at the end of each episode', xlabel='Episode', ylabel='Epsilon')
ax.grid()
plt.show()

In [None]:
# Watch the saved target network play greedily for a few episodes.
# Requires 'Q_next.h5' to exist on disk (it is what Agent.save() writes for Q_next).
Q = load_model('Q_next.h5')
env = gym.make('Breakout-v0')


def to_network_input(frame):
    """Apply the same preprocessing the training loop uses: grayscale, 84x84, (1, 84, 84)."""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, (84, 84))
    return np.reshape(gray, (1, 84, 84))


for i in range(10):
    done = False
    first_frame = to_network_input(env.reset())
    state = [first_frame for _ in range(4)]
    while not done:
        env.render()
        # The model outputs one row of Q-values per frame; as in training,
        # the action is the argmax of the first row.
        q_values = Q.predict(np.asarray(state))[0]
        action = int(np.argmax(q_values))
        n_state = []
        # Repeat the action for 4 steps, mirroring the training loop.
        for _ in range(4):
            frame, reward, done, _info = env.step(action)
            n_state.append(to_network_input(frame))
            if done:
                break
        # Keep the frame stack at 4 entries if the episode ended mid-skip.
        while len(n_state) < 4:
            n_state.append(n_state[-1])
        state = n_state
env.close()