In [None]:
# Silence Python warnings and list the GPUs TensorFlow can see.
import warnings
# The filter is installed before TensorFlow is imported so that warnings
# emitted during the import itself are suppressed as well.
warnings.simplefilter('ignore')
import tensorflow as tf
# `tf.test.is_gpu_available()` is deprecated; the physical-device list is the
# supported replacement. An empty list means no GPU is visible.
tf.config.list_physical_devices('GPU')

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [None]:
class Memory():
    """Fixed-size circular replay buffer of transitions.

    Each stored transition is a list
    ``[state, action, reward, n_state, done]`` where ``state`` and ``n_state``
    have been given a leading axis of length 1 via ``np.expand_dims``.

    Attributes:
        mem_size: maximum number of transitions kept.
        memory: list of length ``mem_size``; slots not yet written hold ``None``.
        mem_pointer: index of the slot the next ``store`` call writes to.
        mem_count: number of slots currently holding a transition.
    """

    def __init__(self, mem_size):
        """Create an empty buffer with room for ``mem_size`` transitions."""
        self.mem_size = mem_size
        self.memory = [None] * self.mem_size
        self.mem_pointer = 0
        self.mem_count = 0

    def store(self, state, action, reward, n_state, done):
        """Write one transition at the current pointer, overwriting the oldest
        entry once the buffer has wrapped around."""
        pack = [np.expand_dims(state, axis=0), action, reward, np.expand_dims(n_state, axis=0), done]
        self.memory[self.mem_pointer] = pack
        self.mem_pointer += 1
        if self.mem_pointer >= self.mem_size:
            self.mem_pointer = 0
        # Slots fill in order 0..mem_size-1 before the pointer wraps, so the
        # filled slots are always exactly the first ``mem_count`` indices.
        self.mem_count = min(self.mem_count + 1, self.mem_size)

    def take_data(self, batch_size):
        """Sample ``batch_size`` distinct stored transitions uniformly at random.

        Args:
            batch_size: number of transitions to draw.

        Returns:
            Five lists ``(states, actions, rewards, n_states, dones)``, each of
            length ``batch_size`` (all empty when ``batch_size <= 0``).

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        if batch_size <= 0:
            return [], [], [], [], []
        if batch_size > self.mem_count:
            raise ValueError(
                'Cannot sample %d transitions: only %d stored'
                % (batch_size, self.mem_count)
            )

        # Single draw without replacement, restricted to filled slots so that
        # a partially filled buffer never yields a ``None`` entry.
        indices = np.random.choice(self.mem_count, size=batch_size, replace=False)
        batch = [self.memory[int(i)] for i in indices]

        states = [transition[0] for transition in batch]
        actions = [transition[1] for transition in batch]
        rewards = [transition[2] for transition in batch]
        n_states = [transition[3] for transition in batch]
        dones = [transition[4] for transition in batch]

        return states, actions, rewards, n_states, dones

In [None]:
class NeuralNet():
    """Small fully connected Keras network with one linear output per action.

    Attributes:
        name: file path passed to ``save``.
        lr: learning rate given to the Adam optimizer.
        n_actions: number of output units.
        input_shape: shape of a single input sample (without the batch axis).
        NN: the compiled ``Sequential`` model.
    """

    def __init__(self, name, input_shape, n_actions, lr=0.001):
        """Store the hyper-parameters and build the model immediately."""
        self.name = name
        self.lr = lr
        self.n_actions = n_actions
        self.input_shape = input_shape
        self._build()

    def _build(self):
        """Create and compile a 16-16-``n_actions`` dense network with MSE loss."""
        self.NN = Sequential()

        self.NN.add(Dense(units=16, activation='relu', input_shape=tuple(self.input_shape)))
        self.NN.add(Dense(units=16, activation='relu'))
        self.NN.add(Dense(units=self.n_actions, activation='linear'))

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases no longer accept.
        self.NN.compile(optimizer=Adam(learning_rate=self.lr), loss='mse')

        self.NN.summary()

    def train(self, states, target, batch_size=32):
        """Run a single silent, unshuffled epoch of ``fit`` on the given data."""
        self.NN.fit(x=states, y=target, batch_size=batch_size, verbose=0, epochs=1, shuffle=False)

    def predict(self, state):
        """Return the model output for ``state``.

        ``verbose=0`` matches ``train`` and keeps per-call progress bars out of
        the notebook output when this is called once per environment step.
        """
        return self.NN.predict(state, verbose=0)

    def save(self):
        """Save the model to the path stored in ``self.name``."""
        self.NN.save(self.name)

In [None]:
class Agent():
    """Epsilon-greedy DQN agent with an online network, a target network and
    a replay memory.

    Attributes:
        Q_eval: online network, trained in ``upgrade``.
        Q_next: target network, periodically overwritten with ``Q_eval``'s weights.
        memory: replay buffer the transitions are sampled from.
        eps, eps_min, eps_dec: current exploration rate, its floor, and the
            amount subtracted on every ``decay_eps`` call.
        gamma: discount factor applied to the bootstrapped next-state value.
        replace: number of steps between target-network weight copies.
        action_space: list ``[0, ..., n_actions - 1]``.
        steps: step counter; incremented by the caller, read by ``replace_weights``.
    """

    def __init__(self, input_shape, n_actions, mem_size, eps, eps_min, eps_dec, gamma, q_eval_name, q_next_name, replace_freq):
        """Build both networks and the replay memory and store the hyper-parameters."""
        self.Q_eval = NeuralNet(q_eval_name, input_shape, n_actions)
        self.Q_next = NeuralNet(q_next_name, input_shape, n_actions)
        self.memory = Memory(mem_size)
        self.eps = eps
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.gamma = gamma
        self.replace = replace_freq
        self.action_space = [i for i in range(n_actions)]
        self.steps = 0
        self.q_eval_name = q_eval_name
        self.q_next_name = q_next_name

    def choose_action(self, state):
        """Pick an action epsilon-greedily for a batched single ``state``.

        With probability ``eps`` a uniformly random action is returned;
        otherwise the arg-max of the target network's output.
        """
        # NOTE(review): the greedy branch queries the target network (Q_next),
        # not the online one (Q_eval) -- kept as is; confirm this is intended.
        if np.random.random() > self.eps:
            return np.argmax(self.Q_next.predict(state))
        else:
            return np.random.choice(self.action_space)

    def decay_eps(self):
        """Decrease ``eps`` linearly by ``eps_dec``, never going below ``eps_min``.

        (The former ``eps ** (1 / steps)`` rule moved ``eps`` towards 1 for any
        ``eps < 1`` and never used ``eps_dec``.)
        """
        self.eps = max(self.eps - self.eps_dec, self.eps_min)

    def replace_weights(self):
        """Copy the online weights into the target network every ``replace`` steps."""
        if not (self.steps % self.replace):
            self.Q_next.NN.set_weights(self.Q_eval.NN.get_weights())

    def upgrade(self, batch_size=32):
        """Run one learning step once the replay memory is completely filled.

        Samples ``batch_size`` transitions, builds the Q-learning target
        ``reward + gamma * max_a Q_next(n_state, a)`` (just ``reward`` for
        terminal transitions) and fits the online network on each sample in
        turn, then decays ``eps``. Does nothing while the last memory slot is
        still empty.
        """
        if self.memory.memory[-1] is None:
            return

        self.replace_weights()
        states, actions, rewards, n_states, dones = self.memory.take_data(batch_size)

        # The target network does not change inside the loop, so all
        # next-state values are computed with one batched forward pass.
        if n_states:
            next_q_max = np.amax(
                self.Q_next.predict(np.concatenate(n_states, axis=0)), axis=1
            )

        for i in range(batch_size):
            # Use the sampled transition's own terminal flag, not a global.
            q_next = 0.0 if dones[i] else next_q_max[i]
            q_target = rewards[i] + self.gamma * q_next
            # Keep the (1, n_actions) prediction shape instead of reshaping to
            # a hard-coded (1, 2), so any number of actions works.
            y = np.array(self.Q_eval.predict(states[i]))
            y[0, actions[i]] = q_target
            self.Q_eval.train(states[i], y)

        self.decay_eps()

    def save(self):
        """Save both networks to their configured file names."""
        self.Q_eval.save()
        self.Q_next.save()

In [None]:
# --- Training setup -------------------------------------------------------
env = gym.make('CartPole-v1')
n_games = 1000
agent = Agent(
    input_shape=(4,),
    n_actions=2,
    mem_size=2000,
    eps=0.99,
    eps_min=0.1,
    eps_dec=0.0001,
    gamma=0.99,
    q_eval_name='Q_eval.h5',
    q_next_name='Q_next.h5',
    replace_freq=800,
)
best_score = -100
# Per-episode history: raw score, mean of the last 20 scores, exploration rate.
scores, means, eps = [], [], []

# --- Training loop --------------------------------------------------------
for i in range(n_games):
    state = env.reset()
    score = 0
    done = False
    while not done:
        # The agent expects a leading batch axis on the observation.
        action = agent.choose_action(np.expand_dims(state, axis=0))
        n_state, reward, done, _ = env.step(action)
        # The step that ends the episode gets an extra penalty of 10.
        reward = reward - 10 if done else reward
        score += reward
        agent.steps += 1
        agent.memory.store(state, action, reward, n_state, int(done))
        agent.upgrade()
        state = n_state

    scores.append(score)
    mean = np.mean(scores[-20:])
    means.append(mean)
    eps.append(agent.eps)

    # Checkpoint both networks whenever the 20-episode mean improves.
    if mean > best_score:
        agent.save()
        best_score = mean
    print('episode: ', i+1, '   score: ', score, '   eps:  %.2f' %agent.eps)

In [None]:
# Score obtained in each training episode.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set(title='Score per episode', xlabel='Episode', ylabel='Score')
ax.grid()
plt.show()

In [None]:
# Running mean of the score over the most recent 20 episodes.
fig, ax = plt.subplots()
ax.plot(means)
ax.set(title='Mean score (last 20 episodes)', xlabel='Episode', ylabel='Mean score')
ax.grid()
plt.show()

In [None]:
# Exploration rate recorded at the end of each training episode.
fig, ax = plt.subplots()
ax.plot(eps)
ax.set(title='Exploration rate per episode', xlabel='Episode', ylabel='Epsilon')
ax.grid()
plt.show()

In [None]:
# Replay the saved target network greedily for 10 rendered episodes.
Q = load_model('Q_next.h5')
env = gym.make('CartPole-v1')
try:
    for i in range(10):
        done = False
        state = env.reset()
        while not done:
            env.render()
            # Add the batch axis the model expects, then act greedily.
            action = np.argmax(Q.predict(np.expand_dims(state, axis=0))[0])
            n_state, reward, done, _ = env.step(action)
            state = n_state
finally:
    # One environment is reused for every episode and always closed, rather
    # than re-created per episode with the last instance left open.
    env.close()