In [116]:
import gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo
gymlogger.set_level(40) #error only
#import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

In [117]:
#Making the environment
env = gym.make("CartPole-v1")

## Initialize variables

In [118]:
#Defining Hypermeters
#LEARNING_RATE = 0.1
LEARNING_RATE = 0.1

DISCOUNT = 0.99 #instead of 0.95
EPISODES = 60000
total = 0
total_reward = 0

#0.25, 0.25, 0.01, 0.1
# Define observation and window size
Observation = [30, 30, 50, 50]
np_array_win_size = np.array([0.5, 0.5, 0.05, 0.5])

epsilon = 1
epsilon_decay = 0.995
epsilon_min = 0.05

epsilon_decay_rate = (1 - epsilon_min)/30000

prev_mean = 0
obs = env.reset()


new_env = env.step(0)[0]
print(new_env)
print(round(new_env[2], 2))

[-0.04256266 -0.22426063 -0.00425448  0.33368134]
-0.0


## Create Q table

In [119]:
#Creating the q table
q_table = np.zeros((Observation + [env.action_space.n]))
q_table.shape

(30, 30, 50, 50, 2)

## Getting discrete state

In [120]:
def get_discrete_state(state):
    discrete_state = []
    for i in range(len(state)):
        if state[i] <= observation[i][0]:
            discrete_state.append(0)
        elif state[i] >= observation[i][1]:
            discrete_state.append(29)
        else:
            discrete_state.append(int((state[i] - observation[i][0]) / win_size[i]))
    return tuple(np.array(discrete_state).astype(int))



print(get_discrete_state([4, -2, 0.418, 0.63751878]))

(29, 9, 29, 16)


## Running

In [122]:
#Training the agent
for episode in range(EPISODES):

    discrete_state = get_discrete_state(env.reset()[0])
    done = False
    episode_reward = 0
    episode_length = 0
    
    # update every 2000 episodes
    if episode % 2000 == 0:
        print("Episode: " + str(episode))

    while not done:
        
      
        if np.random.random() < epsilon:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)
        
        new_env = env.step(action)
        new_state = new_env[0]
        reward = new_env[1]
        done = new_env[2]
        episode_reward += reward
        episode_length += 1
        
        new_discrete_state = get_discrete_state(new_state)

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[discrete_state + (action,)] = new_q

        discrete_state = new_discrete_state
        
        # check termination conditions
        if abs(new_state[0]) > 2.4 or abs(new_state[2]) > np.radians(12) or episode_length > 500:
            done = True
            
    total_reward += episode_reward
    
    if episode % 1000 == 0:
        mean_reward = total_reward / 1000
        print("Mean Reward: " + str(mean_reward))
        
        prev_mean = mean_reward
        total_reward = 0

env.close()


Episode: 0
Mean Reward: 215.54
Mean Reward: 215.471
Episode: 2000
Mean Reward: 222.023
Mean Reward: 208.172
Episode: 4000
Mean Reward: 229.556
Mean Reward: 202.652
Episode: 6000
Mean Reward: 199.421
Mean Reward: 217.555
Episode: 8000
Mean Reward: 269.987
Mean Reward: 252.425
Episode: 10000
Mean Reward: 232.949
Mean Reward: 217.156
Episode: 12000
Mean Reward: 202.509
Mean Reward: 193.435
Episode: 14000
Mean Reward: 210.068
Mean Reward: 216.693
Episode: 16000
Mean Reward: 250.234
Mean Reward: 256.363
Episode: 18000
Mean Reward: 236.576
Mean Reward: 224.106
Episode: 20000
Mean Reward: 230.125
Mean Reward: 223.558
Episode: 22000
Mean Reward: 224.231
Mean Reward: 243.529
Episode: 24000
Mean Reward: 241.641
Mean Reward: 248.198
Episode: 26000
Mean Reward: 256.66
Mean Reward: 247.259
Episode: 28000
Mean Reward: 253.566
Mean Reward: 226.226
Episode: 30000
Mean Reward: 222.756
Mean Reward: 234.577
Episode: 32000
Mean Reward: 238.282
Mean Reward: 248.338
Episode: 34000
Mean Reward: 240.405
Mean 