# Module Five Assignment: Cartpole Problem
Review the code in this notebook and in the score_logger.py file in the *scores* folder (directory). Once you have reviewed the code, return to this notebook and select **Cell** and then **Run All** from the menu bar to run this code. The code takes several minutes to run.

In [11]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id passed to gym.make() in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# experience_replay() builds its training target.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001

# Capacity of the replay deque (maxlen); once full, the oldest transitions drop out.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped while the memory holds fewer than this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at MAX, is multiplied
# by DECAY at the end of every replay pass, and never goes below MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-learning agent: a small dense Q-network plus a replay memory.

    The agent chooses actions epsilon-greedily (``act``), stores transitions
    (``remember``) and trains on random samples of them
    (``experience_replay``).

    Every hyperparameter can be overridden per instance through keyword
    arguments of ``__init__``. Any argument left as ``None`` falls back to
    the matching module-level constant, so ``DQNSolver(obs, actions)``
    behaves exactly as before and experiments no longer require copying
    the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of one state vector (network input size).
            action_space: number of discrete actions (network output size).
            gamma: discount factor; defaults to GAMMA.
            learning_rate: Adam learning rate; defaults to LEARNING_RATE.
            memory_size: replay deque capacity; defaults to MEMORY_SIZE.
            batch_size: transitions sampled per replay; defaults to BATCH_SIZE.
            exploration_max: initial exploration rate; defaults to EXPLORATION_MAX.
            exploration_min: exploration rate floor; defaults to EXPLORATION_MIN.
            exploration_decay: per-replay multiplier; defaults to EXPLORATION_DECAY.
        """
        # `None` means "use the notebook-level constant"; the constant is only
        # looked up when it is actually needed.
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)

        self.action_space = action_space
        self.memory = deque(
            maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; the linear output layer yields
        # one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(
            loss="mse",
            optimizer=Adam(
                lr=LEARNING_RATE if learning_rate is None else learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: one-row batch, e.g. the (1, observation_space) array
                built in cartpole().

        Returns:
            int: action index in ``range(action_space)``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0 keeps per-call progress output out of the notebook on
        # Keras versions that would otherwise print it.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random sample of stored transitions, then decay epsilon.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually, so later
        targets in the same pass are predicted by the already-updated network.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Q-learning target: the reward alone for a terminal transition,
            # otherwise reward plus the discounted best next-state Q-value.
            q_update = reward
            if not terminal:
                q_update = (reward + self.gamma
                            * np.amax(self.model.predict(state_next, verbose=0)[0]))
            # Only the taken action's Q-value is replaced; the others keep the
            # network's own prediction and so contribute zero error.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_min, self.exploration_rate)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step the agent acts, the transition is stored and, unless the
    episode just ended, one experience-replay pass is run. An episode's score
    is the number of steps it lasted; it is printed and passed to the
    ScoreLogger.

    Args:
        max_runs: optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop has no exit
            condition of its own and runs until the kernel is interrupted or
            something called from here ends the process.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # States are handed to the solver as one-row batches: (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The step that ends the episode gets its reward negated.
                # NOTE(review): `terminal` presumably also covers the
                # environment's time-limit cut-off, in which case a
                # full-length episode is penalised too - confirm.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when training is interrupted.
        env.close()



In [None]:
# Start training. Called without arguments, cartpole() keeps running episodes
# until the kernel is interrupted (unless something it calls ends the process).
cartpole()

Run: 1, exploration: 0.990025, score: 22
Scores: (min: 22, avg: 22, max: 22)

Run: 2, exploration: 0.8822202429488013, score: 24
Scores: (min: 22, avg: 23, max: 24)

Run: 3, exploration: 0.810157377815473, score: 18
Scores: (min: 18, avg: 21.333333333333332, max: 24)

Run: 4, exploration: 0.7514768435208588, score: 16
Scores: (min: 16, avg: 20, max: 24)

Run: 5, exploration: 0.7076077347272662, score: 13
Scores: (min: 13, avg: 18.6, max: 24)

Run: 6, exploration: 0.6465587967553006, score: 19
Scores: (min: 13, avg: 18.666666666666668, max: 24)

Run: 7, exploration: 0.6118738784280476, score: 12
Scores: (min: 12, avg: 17.714285714285715, max: 24)

Run: 8, exploration: 0.567555222460375, score: 16
Scores: (min: 12, avg: 17.5, max: 24)

Run: 9, exploration: 0.5398075216808175, score: 11
Scores: (min: 11, avg: 16.77777777777778, max: 24)

Run: 10, exploration: 0.5057535983897912, score: 14
Scores: (min: 11, avg: 16.5, max: 24)

Run: 11, exploration: 0.483444593917636, score: 10
Scores: (mi

og code

In [1]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id passed to gym.make() in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# experience_replay() builds its training target.
# Experiment: this cell uses 0.05; the first cell of the notebook uses 0.95.
GAMMA = 0.05
# Learning rate handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001

# Capacity of the replay deque (maxlen); once full, the oldest transitions drop out.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped while the memory holds fewer than this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at MAX, is multiplied
# by DECAY at the end of every replay pass, and never goes below MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-learning agent: a small dense Q-network plus a replay memory.

    The agent chooses actions epsilon-greedily (``act``), stores transitions
    (``remember``) and trains on random samples of them
    (``experience_replay``).

    Every hyperparameter can be overridden per instance through keyword
    arguments of ``__init__``. Any argument left as ``None`` falls back to
    the matching module-level constant, so ``DQNSolver(obs, actions)``
    behaves exactly as before and experiments no longer require copying
    the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of one state vector (network input size).
            action_space: number of discrete actions (network output size).
            gamma: discount factor; defaults to GAMMA.
            learning_rate: Adam learning rate; defaults to LEARNING_RATE.
            memory_size: replay deque capacity; defaults to MEMORY_SIZE.
            batch_size: transitions sampled per replay; defaults to BATCH_SIZE.
            exploration_max: initial exploration rate; defaults to EXPLORATION_MAX.
            exploration_min: exploration rate floor; defaults to EXPLORATION_MIN.
            exploration_decay: per-replay multiplier; defaults to EXPLORATION_DECAY.
        """
        # `None` means "use the notebook-level constant"; the constant is only
        # looked up when it is actually needed.
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)

        self.action_space = action_space
        self.memory = deque(
            maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; the linear output layer yields
        # one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(
            loss="mse",
            optimizer=Adam(
                lr=LEARNING_RATE if learning_rate is None else learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: one-row batch, e.g. the (1, observation_space) array
                built in cartpole().

        Returns:
            int: action index in ``range(action_space)``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0 keeps per-call progress output out of the notebook on
        # Keras versions that would otherwise print it.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random sample of stored transitions, then decay epsilon.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually, so later
        targets in the same pass are predicted by the already-updated network.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Q-learning target: the reward alone for a terminal transition,
            # otherwise reward plus the discounted best next-state Q-value.
            q_update = reward
            if not terminal:
                q_update = (reward + self.gamma
                            * np.amax(self.model.predict(state_next, verbose=0)[0]))
            # Only the taken action's Q-value is replaced; the others keep the
            # network's own prediction and so contribute zero error.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_min, self.exploration_rate)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step the agent acts, the transition is stored and, unless the
    episode just ended, one experience-replay pass is run. An episode's score
    is the number of steps it lasted; it is printed and passed to the
    ScoreLogger.

    Args:
        max_runs: optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop has no exit
            condition of its own and runs until the kernel is interrupted or
            something called from here ends the process.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # States are handed to the solver as one-row batches: (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The step that ends the episode gets its reward negated.
                # NOTE(review): `terminal` presumably also covers the
                # environment's time-limit cut-off, in which case a
                # full-length episode is penalised too - confirm.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when training is interrupted.
        env.close()



Using TensorFlow backend.


In [2]:
# Start training. Called without arguments, cartpole() keeps running episodes
# until the kernel is interrupted (unless something it calls ends the process).
cartpole()

Run: 1, exploration: 0.9511101304657719, score: 30
Scores: (min: 30, avg: 30, max: 30)

Run: 2, exploration: 0.9137248860125932, score: 9
Scores: (min: 9, avg: 19.5, max: 30)

Run: 3, exploration: 0.653073201944699, score: 68
Scores: (min: 9, avg: 35.666666666666664, max: 68)

Run: 4, exploration: 0.4907693883854626, score: 58
Scores: (min: 9, avg: 41.25, max: 68)

Run: 5, exploration: 0.37627099809304654, score: 54
Scores: (min: 9, avg: 43.8, max: 68)

Run: 6, exploration: 0.32864265128599696, score: 28
Scores: (min: 9, avg: 41.166666666666664, max: 68)

Run: 7, exploration: 0.2913921604631864, score: 25
Scores: (min: 9, avg: 38.857142857142854, max: 68)

Run: 8, exploration: 0.2757603055760701, score: 12
Scores: (min: 9, avg: 35.5, max: 68)

Run: 9, exploration: 0.2532352299289372, score: 18
Scores: (min: 9, avg: 33.55555555555556, max: 68)

Run: 10, exploration: 0.2408545925762412, score: 11
Scores: (min: 9, avg: 31.3, max: 68)

Run: 11, exploration: 0.2290792429684691, score: 11
Sc

KeyboardInterrupt: 

Changed gamma to 0.05 and saw the max score reach about 50. Ran close to 1000 runs. Training was slow, with the scores going up and down.

In [4]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id passed to gym.make() in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# experience_replay() builds its training target.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer when the model is compiled.
# Experiment: this cell uses 0.1; the first cell of the notebook uses 0.001.
LEARNING_RATE = 0.1

# Capacity of the replay deque (maxlen); once full, the oldest transitions drop out.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped while the memory holds fewer than this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at MAX, is multiplied
# by DECAY at the end of every replay pass, and never goes below MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-learning agent: a small dense Q-network plus a replay memory.

    The agent chooses actions epsilon-greedily (``act``), stores transitions
    (``remember``) and trains on random samples of them
    (``experience_replay``).

    Every hyperparameter can be overridden per instance through keyword
    arguments of ``__init__``. Any argument left as ``None`` falls back to
    the matching module-level constant, so ``DQNSolver(obs, actions)``
    behaves exactly as before and experiments no longer require copying
    the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of one state vector (network input size).
            action_space: number of discrete actions (network output size).
            gamma: discount factor; defaults to GAMMA.
            learning_rate: Adam learning rate; defaults to LEARNING_RATE.
            memory_size: replay deque capacity; defaults to MEMORY_SIZE.
            batch_size: transitions sampled per replay; defaults to BATCH_SIZE.
            exploration_max: initial exploration rate; defaults to EXPLORATION_MAX.
            exploration_min: exploration rate floor; defaults to EXPLORATION_MIN.
            exploration_decay: per-replay multiplier; defaults to EXPLORATION_DECAY.
        """
        # `None` means "use the notebook-level constant"; the constant is only
        # looked up when it is actually needed.
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)

        self.action_space = action_space
        self.memory = deque(
            maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; the linear output layer yields
        # one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(
            loss="mse",
            optimizer=Adam(
                lr=LEARNING_RATE if learning_rate is None else learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: one-row batch, e.g. the (1, observation_space) array
                built in cartpole().

        Returns:
            int: action index in ``range(action_space)``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0 keeps per-call progress output out of the notebook on
        # Keras versions that would otherwise print it.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random sample of stored transitions, then decay epsilon.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually, so later
        targets in the same pass are predicted by the already-updated network.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Q-learning target: the reward alone for a terminal transition,
            # otherwise reward plus the discounted best next-state Q-value.
            q_update = reward
            if not terminal:
                q_update = (reward + self.gamma
                            * np.amax(self.model.predict(state_next, verbose=0)[0]))
            # Only the taken action's Q-value is replaced; the others keep the
            # network's own prediction and so contribute zero error.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_min, self.exploration_rate)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step the agent acts, the transition is stored and, unless the
    episode just ended, one experience-replay pass is run. An episode's score
    is the number of steps it lasted; it is printed and passed to the
    ScoreLogger.

    Args:
        max_runs: optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop has no exit
            condition of its own and runs until the kernel is interrupted or
            something called from here ends the process.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # States are handed to the solver as one-row batches: (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The step that ends the episode gets its reward negated.
                # NOTE(review): `terminal` presumably also covers the
                # environment's time-limit cut-off, in which case a
                # full-length episode is penalised too - confirm.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when training is interrupted.
        env.close()



In [5]:
# Start training. Called without arguments, cartpole() keeps running episodes
# until the kernel is interrupted (unless something it calls ends the process).
cartpole()

Run: 1, exploration: 1.0, score: 13
Scores: (min: 13, avg: 13, max: 13)

Run: 2, exploration: 0.851801859600347, score: 39
Scores: (min: 13, avg: 26, max: 39)

Run: 3, exploration: 0.7666961448653229, score: 22
Scores: (min: 13, avg: 24.666666666666668, max: 39)

Run: 4, exploration: 0.7292124703704616, score: 11
Scores: (min: 11, avg: 21.25, max: 39)

Run: 5, exploration: 0.6866430931872001, score: 13
Scores: (min: 11, avg: 19.6, max: 39)

Run: 6, exploration: 0.653073201944699, score: 11
Scores: (min: 11, avg: 18.166666666666668, max: 39)

Run: 7, exploration: 0.5848838636585911, score: 23
Scores: (min: 11, avg: 18.857142857142858, max: 39)

Run: 8, exploration: 0.531750826943791, score: 20
Scores: (min: 11, avg: 19, max: 39)

Run: 9, exploration: 0.4932355662165453, score: 16
Scores: (min: 11, avg: 18.666666666666668, max: 39)

Run: 10, exploration: 0.46912134373457726, score: 11
Scores: (min: 11, avg: 17.9, max: 39)

Run: 11, exploration: 0.43952667968844233, score: 14
Scores: (min

KeyboardInterrupt: 

Changed the learning rate to 0.1. Ran close to 2000 runs without solving the problem. The max score was 51. The score would increase slowly and then fall.

In [1]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id passed to gym.make() in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# experience_replay() builds its training target.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001

# Capacity of the replay deque (maxlen); once full, the oldest transitions drop out.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped while the memory holds fewer than this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at MAX, is multiplied
# by DECAY at the end of every replay pass, and never goes below MIN.
# Experiment: this cell uses a decay of 0.90; the first cell uses 0.995.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.90
  
  
class DQNSolver:
    """Deep Q-learning agent: a small dense Q-network plus a replay memory.

    The agent chooses actions epsilon-greedily (``act``), stores transitions
    (``remember``) and trains on random samples of them
    (``experience_replay``).

    Every hyperparameter can be overridden per instance through keyword
    arguments of ``__init__``. Any argument left as ``None`` falls back to
    the matching module-level constant, so ``DQNSolver(obs, actions)``
    behaves exactly as before and experiments no longer require copying
    the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of one state vector (network input size).
            action_space: number of discrete actions (network output size).
            gamma: discount factor; defaults to GAMMA.
            learning_rate: Adam learning rate; defaults to LEARNING_RATE.
            memory_size: replay deque capacity; defaults to MEMORY_SIZE.
            batch_size: transitions sampled per replay; defaults to BATCH_SIZE.
            exploration_max: initial exploration rate; defaults to EXPLORATION_MAX.
            exploration_min: exploration rate floor; defaults to EXPLORATION_MIN.
            exploration_decay: per-replay multiplier; defaults to EXPLORATION_DECAY.
        """
        # `None` means "use the notebook-level constant"; the constant is only
        # looked up when it is actually needed.
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)

        self.action_space = action_space
        self.memory = deque(
            maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; the linear output layer yields
        # one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(
            loss="mse",
            optimizer=Adam(
                lr=LEARNING_RATE if learning_rate is None else learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: one-row batch, e.g. the (1, observation_space) array
                built in cartpole().

        Returns:
            int: action index in ``range(action_space)``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0 keeps per-call progress output out of the notebook on
        # Keras versions that would otherwise print it.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random sample of stored transitions, then decay epsilon.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually, so later
        targets in the same pass are predicted by the already-updated network.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Q-learning target: the reward alone for a terminal transition,
            # otherwise reward plus the discounted best next-state Q-value.
            q_update = reward
            if not terminal:
                q_update = (reward + self.gamma
                            * np.amax(self.model.predict(state_next, verbose=0)[0]))
            # Only the taken action's Q-value is replaced; the others keep the
            # network's own prediction and so contribute zero error.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_min, self.exploration_rate)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step the agent acts, the transition is stored and, unless the
    episode just ended, one experience-replay pass is run. An episode's score
    is the number of steps it lasted; it is printed and passed to the
    ScoreLogger.

    Args:
        max_runs: optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop has no exit
            condition of its own and runs until the kernel is interrupted or
            something called from here ends the process.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # States are handed to the solver as one-row batches: (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The step that ends the episode gets its reward negated.
                # NOTE(review): `terminal` presumably also covers the
                # environment's time-limit cut-off, in which case a
                # full-length episode is penalised too - confirm.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when training is interrupted.
        env.close()



Using TensorFlow backend.


In [2]:
# Start training. Called without arguments, cartpole() keeps running episodes
# until the kernel is interrupted (unless something it calls ends the process).
cartpole()

Run: 1, exploration: 1.0, score: 19
Scores: (min: 19, avg: 19, max: 19)

Run: 2, exploration: 0.16677181699666577, score: 18
Scores: (min: 18, avg: 18.5, max: 19)

Run: 3, exploration: 0.07178979876918531, score: 9
Scores: (min: 9, avg: 15.333333333333334, max: 19)

Run: 4, exploration: 0.02781283894436938, score: 10
Scores: (min: 9, avg: 14, max: 19)

Run: 5, exploration: 0.01077526366430583, score: 10
Scores: (min: 9, avg: 13.2, max: 19)

Run: 6, exploration: 0.01, score: 9
Scores: (min: 9, avg: 12.5, max: 19)

Run: 7, exploration: 0.01, score: 10
Scores: (min: 9, avg: 12.142857142857142, max: 19)

Run: 8, exploration: 0.01, score: 8
Scores: (min: 8, avg: 11.625, max: 19)

Run: 9, exploration: 0.01, score: 9
Scores: (min: 8, avg: 11.333333333333334, max: 19)

Run: 10, exploration: 0.01, score: 9
Scores: (min: 8, avg: 11.1, max: 19)

Run: 11, exploration: 0.01, score: 43
Scores: (min: 8, avg: 14, max: 43)

Run: 12, exploration: 0.01, score: 82
Scores: (min: 8, avg: 19.666666666666668,

KeyboardInterrupt: 

Changed the exploration decay to 0.90 and got a max score of 387 within 90 runs. The score would rise and fall.

Goal: CartPole is a game in which a pole stands upright on top of a moving cart. The pole has to be kept balanced for as long as possible; by the end of training, the agent should have learned how to balance it.
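Different state values: the state describes the balance of the pole on top of the cart — in CartPole, the cart's position and velocity and the pole's angle and angular velocity.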
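Possible actions: at each step the agent can push the cart to the left or to the right. Separately, the person running the notebook can change hyperparameters such as the exploration max, the exploration decay, and gamma.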
Reinforcement learning: the agent learns by trial and error, with rewards reinforcing the actions that keep the pole balanced.
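How does experience replay work: the agent stores past transitions (state, action, reward, next state, done) in memory and trains on random samples of them; the more experience it has collected, the better its Q-value estimates, and in turn its scores, can become.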
Effect of the discount factor: gamma sets how much future rewards count relative to immediate ones, which in turn affects the min, average, and max scores.
Neural network: it is used for Q-learning — the network takes the state as input and outputs an estimated Q-value for each action.
Increasing or decreasing the learning rate: the learning rate in Q-learning has a massive effect on performance.