In [66]:
import gym
import numpy as np

import keras
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, InputLayer
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD
from keras.optimizers import Adam

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import collections
import os
#from random import random
import random

In [68]:
#The reward is based on how long the player survives.
#The player has two choices, push the right button or the left one.
#In the DQN algorithm, there are also two very important parts: the remember and replay methods.
#exploration_rate - At the beginning, the lack of experience of our agent makes us choose randomly an action and when the agent gets more experienced, we let it decide which action to undertake.
#exploration_decay - We want to decrease the number of explorations as it gets better and better at playing games.
#The loss is a value that indicates how far our prediction is from the actual target. 
#Our goal is to decrease the loss, which is the gap between the prediction and the target.
#fit is decreasing the loss or the difference between the target and the predicted value.
#memory stores the results of the rl predictions 

#https://medium.com/@gtnjuvin/my-journey-into-deep-q-learning-with-keras-and-gym-3e779cc12762

In [81]:
class Agent():
    def __init__(self, state_size, action_size):
        self.weight_backup      = "cartpole_weight.h5"
        self.state_size         = state_size
        self.action_size        = action_size
        self.memory             = collections.deque(maxlen=2000)
        self.learning_rate      = 0.001
        self.gamma              = 0.95
        self.exploration_rate   = 1.0
        self.exploration_min    = 0.01
        self.exploration_decay  = 0.995
        self.sample_batch_size  = 32
        self.brain              = self._build_model()
        
    def save_model(self):
        self.brain.save(self.weight_backup)        
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        if os.path.isfile(self.weight_backup):
            model.load_weights(self.weight_backup)
            self.exploration_rate = self.exploration_min
        return model
    
    def act(self, state):
        if np.random.rand() <= self.exploration_rate:
            return random.randrange(self.action_size)
        act_values = self.brain.predict(state)
        return np.argmax(act_values[0])
    
    def replay(self, sample_batch_size):
        if len(self.memory) < sample_batch_size:
            return
        sample_batch = random.sample(self.memory, sample_batch_size)
        for state, action, reward, next_state, done in sample_batch:
            target = reward
            if not done:
                  target = reward + self.gamma * np.amax(self.brain.predict(next_state)[0])

            target_f = self.brain.predict(state)
            target_f[0][action] = target
            self.brain.fit(state, target_f, epochs=1, verbose=0)
        if self.exploration_rate > self.exploration_min:
            self.exploration_rate *= self.exploration_decay
            
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

In [82]:
#if os.path.isfile(self.weight_backup):
#        model.load_weights(self.weight_backup)
#if        self.exploration_rate = self.exploration_min
#        return model



In [86]:
class CartPole:
    def __init__(self):
        self.sample_batch_size = 32
        #self.episodes          = 10000
        self.episodes          = 1000
        self.env               = gym.make('CartPole-v1')
        self.state_size        = self.env.observation_space.shape[0]
        self.action_size       = self.env.action_space.n
        self.agent             = Agent(self.state_size, self.action_size)

    def run(self):
        try:
            for index_episode in range(self.episodes):
                state = self.env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                index = 0
                while not done:
                     self.env.render()
                     action = self.agent.act(state)
                     next_state, reward, done, _ = self.env.step(action)
                     next_state = np.reshape(next_state, [1, self.state_size])
                     self.agent.remember(state, action, reward, next_state, done)
                     state = next_state
                     index += 1
                print("Episode {}# Score: {}".format(index_episode, index + 1))
                self.agent.replay(self.sample_batch_size)
        finally:
            self.agent.save_model()
            
if __name__ == "__main__":
    cartpole = CartPole()
    cartpole.run()

Episode 0# Score: 501
Episode 1# Score: 501
Episode 2# Score: 501
Episode 3# Score: 428
Episode 4# Score: 501
Episode 5# Score: 399
Episode 6# Score: 501


KeyboardInterrupt: 