# Q learning to solve cartpole environment on OpenAI's gym 

In [1]:
import numpy as np
import gym
import time
import math

## Bellman's Equation

![Bellman's Equation](https://miro.medium.com/v2/resize:fit:750/format:webp/0*GBCjjZRYwpSvTu8U)

In [2]:
# Create the cart pole environment

env = gym.make("CartPole-v1")
print(env.action_space.n)

2


In [116]:
# Initialize the required variables

learning_rate = 0.1

discount = 0.95
episodes = 60000
total_time = 0
total_reward = 0
prior_episode_reward = 0

observation = [150,150,200,150]
np_array_win_size = np.array([0.25,0.25,0.01,0.1])

epsilon = 1

epsilon_decay = 0.99995
decay_from_episode = 10000

render_step = 1000 # render every n episodes

In [117]:
# Set up the Q table

q_table = np.random.uniform(low=0, high=1, size=(observation + [env.action_space.n]))
q_table.shape

(150, 150, 200, 150, 2)

In [118]:
# Get discreet state

def get_discreet_state(state):
    discreet_state = state[0]/np_array_win_size + np.array([15,10,10,19])
    return tuple(discreet_state.astype(int))

In [119]:
for episode in range(1,episodes+1):
    t_start = time.time() # start time
    discreet_state = get_discreet_state(env.reset()) # starting discreet state
    done = False # reset flag
    episode_reward = 0 # reset episode rewards to zero at start of each new episode

    if episode % render_step == 0: # render every 200th episode
        print(f"Episode: {episode}")
        mean_episode_time = total_time / render_step
        mean_episode_reward = total_reward / render_step
        print(f"Average episode reward: {mean_episode_reward}")
        print(f"Average episode time: {mean_episode_time}")
        if mean_episode_reward > 200 and epsilon < 0.4:
            break

    while not done: # play the episode
        ##### Taking Action ######
        if np.random.random() > epsilon:  # if late in to the number of episodes
            action = np.argmax(q_table[discreet_state]) # take action from mature Q table

        else:  # build the Q table in the early episodes by taking random action
            action = np.random.randint(0, env.action_space.n)

        # take step / next frame in episode
        new_state, reward, done, _, _ = env.step(action)

        episode_reward += 1

        new_discreet_state = get_discreet_state(new_state)

        if episode % render_step == 0:
            env.render()

        # update Q table
        if not done:

            max_future_q = np.max(q_table[discreet_state])

            current_q = q_table[discreet_state, action]

            # Bellman's Equation to get new Q value
            new_q = (1 - learning_rate) * current_q + learning_rate * (reward + discount * max_future_q)

            q_table[discreet_state, (action, )] = new_q


        discreet_state = new_discreet_state

    ####### End of Episode ###########
    t_end = time.time()
    episode_time = t_end - t_start
    total_time = total_time + episode_time

    total_reward += episode_reward


    
    # Decay the epsilon
    if epsilon > 0.05:
        if episode_reward > prior_episode_reward and episode > decay_from_episode:
            epsilon = math.pow(epsilon_decay, episode - decay_from_episode)

    prior_reward = episode_reward

env.close()

Episode: 1000
Average episode reward: 22.192
Average episode time: 0.03664329147338867
Episode: 2000
Average episode reward: 44.563
Average episode time: 0.0732880642414093
Episode: 3000
Average episode reward: 66.433
Average episode time: 0.10910961675643921
Episode: 4000
Average episode reward: 88.683
Average episode time: 0.14562257647514343
Episode: 5000
Average episode reward: 111.348
Average episode time: 0.1827344264984131
Episode: 6000
Average episode reward: 134.008
Average episode time: 0.21995615482330322
Episode: 7000
Average episode reward: 156.452
Average episode time: 0.25628769969940185
Episode: 8000
Average episode reward: 178.674
Average episode time: 0.29239729809761045
Episode: 9000
Average episode reward: 200.892
Average episode time: 0.32881074142456057
Episode: 10000
Average episode reward: 223.094
Average episode time: 0.364853901386261
Episode: 11000
Average episode reward: 245.643
Average episode time: 0.401360946893692
Episode: 12000
Average episode reward: 2

In [114]:
new_state

array([ 1.4809636 ,  2.180926  ,  0.10811789, -0.22904956], dtype=float32)

In [120]:
epsilon

0.38675117571691847

In [115]:
discreet_state

(20, 15, 153, 33)