In [1]:
%matplotlib inline

# Cartpole - DQN

Original environment: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py<br/>
Source:  https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288

###  Observation:

Type: Box(4)

| Num | Observation | Min | Max |
|:---:|:-----------:|:---:|:---:|
| 0   | Cart Position|-4.8|4.8|
| 1   | Cart Velocity|-Inf|Inf|
| 2   | Pole Angle   | -0.418 rad (-24 deg)|0.418 rad (24 deg)|
| 3   | Pole Angular Velocity|-Inf|Inf|

### Actions:

Type: Discrete(2)

| Num | Action                  |
|:---:|:-----------------------:|
|  0  |  Push cart to the left  |
|  1  |  Push cart to the right |

Note: The amount the velocity that is reduced or increased is not
fixed; it depends on the angle the pole is pointing. This is because
the center of gravity of the pole increases the amount of energy needed
to move the cart underneath it

### Reward:
        Reward is 1 for every step taken, including the termination step


In [2]:
%matplotlib inline

In [3]:
import os 
## Suppress TensorFlow Info and Warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from os.path import exists

import gym
import random
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, Adam

## With TF2 we might need this or otherwise it will be too slow
## In some hardware and library version configurations, it might
## be exactly the opposite.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

from math import exp,cos
import numpy as np
import matplotlib.pyplot as plt

from numpy.random import seed,randn

from collections import deque
from statistics import mean
import h5py

## Configuration

In [4]:
LEARNING_RATE = 1e-3
MAX_MEMORY = 100000
BATCH_SIZE = 32
GAMMA = 0.975
EXPLORATION_DECAY = 0.99
EXPLORATION_MIN = 0.01
EPISODES=100

In [5]:
class Network:

    def __init__(self, observation_space, action_space):

        self.action_space = action_space
        self.memory = deque(maxlen=MAX_MEMORY)
        self.exploration_rate = 1.0

        self.model = Sequential()
        self.model.add(Dense(32, input_shape=(observation_space,), activation='relu'))
        self.model.add(Dense(32, activation='relu'))
        self.model.add(Dense(self.action_space, activation='linear'))
        self.model.compile(loss='mse', optimizer=Adam(lr=LEARNING_RATE))

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def take_action(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(0, self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        else:
            minibatch = random.sample(self.memory, BATCH_SIZE)

            ## TODO:
            ## This loop trains one sample at a time the model, but we could
            ## use the whole minibatch at once

            for state, action, reward, state_next, done in minibatch:
                Q = reward
                if not done:
                    Q = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                Q_values = self.model.predict(state)
                Q_values[0][action] = Q
                self.model.fit(state, Q_values, verbose=0)
            self.exploration_rate *= EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

    def get_model(self):
        return self.model

    def load_model(self,model_name):
        self.model = load_model(model_name)

In [6]:
class TrainSolver:

    def __init__(self, max_episodes):
        self.max_episodes = max_episodes
        self.score_table = deque(maxlen=400)
        self.average_of_last_runs = None
        self.model = None
        self.play_episodes = 100
        env = gym.make('CartPole-v1')
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        self.solver = Network(observation_space, action_space)

    def train(self):
        
        env = gym.make('CartPole-v1')
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n

        print("---------------------------------")
        print("Solver starts")
        print("---------------------------------")

        self.model = self.solver.get_model()
           
        episode = 0
        while episode < self.max_episodes:

            episode += 1
            state = env.reset()

            ## Hack a more diverse initial random position
            x, x_dot, theta, theta_dot = env.state
            x = randn()*3;
            env.state = (x,x_dot,theta,theta_dot)

            state = np.reshape(np.array(env.state), [1, observation_space])
            
            step = 0
            while True:

                env.render()
                
                step += 1
                action = self.solver.take_action(state)
                state_next, reward, done, info = env.step(action)

                state_next = np.reshape(state_next, [1, observation_space])

                ## State is a vector with one observation
                ## Type: Box(4)
                ## Num  Observation                 Min         Max
                ## 0    Cart Position             -4.8            4.8
                ## 1    Cart Velocity             -Inf            Inf
                ## 2    Pole Angle                 -24 deg        24 deg
                ## 3    Pole Velocity At Tip      -Inf            Inf
                
                ## Prefer to be in the middle and vertical
                reward = exp(-0.5*abs((state_next[0][0]**2)/0.5)) * \
                         cos(state_next[0][2])
                
                ##if not done:
                ##    reward = reward
                ##else:
                ##    reward = exp(-0.5*abs((state_next[0][0]**2)/0.5)) - 0.2

                self.solver.add_to_memory(state, action, reward, state_next, done)
                state = state_next

                # print("  State: " + str(state) +
                #       ", reward: " + str(reward) +
                #       "               ",
                #       end='\r', flush=True)
                
                if done:
                    print("Run: " + str(episode) +
                          ", exploration: "+str(self.solver.exploration_rate) +
                          ", score: " + str(step) +
                          ", mem: " + str(len(self.solver.memory)))

                    break
                ## Train the network
                self.solver.experience_replay()

    def return_trained_model(self):
        return self.model

    def save_model(self):
        self.model.save('cartpole_model.h5')

    def load_model(self):
        filename = 'cartpole_model.h5'
        if os.path.exists(filename):
            self.solver.load_model(filename)
            self.model = self.solver.get_model()
        else:
            print("File '" + filename + "' does not exist. Ignoring")

In [None]:
RL=TrainSolver(EPISODES)
RL.load_model();
RL.train()
RL.save_model()

---------------------------------
Solver starts
---------------------------------

Run: 1, exploration: 1.0, score: 18, mem: 18

Run: 2, exploration: 1.0, score: 13, mem: 31





Run: 3, exploration: 0.8775210229989678, score: 14, mem: 45

Run: 4, exploration: 0.7249803359578534, score: 20, mem: 65

Run: 5, exploration: 0.5416850759668536, score: 30, mem: 95

Run: 6, exploration: 0.4801414565714212, score: 13, mem: 108

Run: 7, exploration: 0.4386175018099108, score: 10, mem: 118

Run: 8, exploration: 0.392711028357805, score: 12, mem: 130

Run: 9, exploration: 0.35516081470507305, score: 11, mem: 141

Run: 10, exploration: 0.22594815553398728, score: 46, mem: 187

Run: 11, exploration: 0.21059844619672854, score: 8, mem: 195

Run: 12, exploration: 0.18855684516737714, score: 12, mem: 207

Run: 13, exploration: 0.17224993019150142, score: 10, mem: 217

Run: 14, exploration: 0.15422195179384465, score: 12, mem: 229

Run: 15, exploration: 0.14374493715362485, score: 8, mem: 237

Run: 16, exploration: 0.13000034453500542, score: 11, mem: 248

Run: 17, exploration: 0.11875755691154315, score: 10, mem: 258

Run: 18, exploration: 0.10632818368521123, score: 12, mem: