In [55]:
import gym
import numpy as np

In [56]:
#Setting up the game and environment
env = gym.make('MountainCar-v0')
env.reset()

#Constant variables
DISCRETE_OS_SIZE = [5, 5]
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE
size=(DISCRETE_OS_SIZE +[env.action_space.n])

#Q-learning settings
LEARNING_RATE = 0.1     #Used to pick how fast the agent will learn/ update Q-values
DISCOUNT = 0.95         #Used to add a discount to put more weight on long-term goals than short-term
EPISODES = 2000         #Amount of episodes which the 'simulation' will run and learn
SHOW_EVERY = 500        #Used to render the simulation every 200 episodes

#Initialize a random q_table with correct size
q_table = np.random.uniform(low=-2, high=0, size=size)

#Agent(Mountain Car) has 3 actions{0:push left, 1:no push, 2:push right}
print(env.action_space.n)

#Agent has 2 variables: position and velocity
print(f"Position low: {env.observation_space.low[0]}, Position high: {env.observation_space.high[0]}")
print(f"Velocity low: {env.observation_space.low[1]}, Velocity high: {env.observation_space.high[1]}")


3
Position low: -1.2000000476837158, Position high: 0.6000000238418579
Velocity low: -0.07000000029802322, Velocity high: 0.07000000029802322


In [57]:
#Helper method to get discrete state of a state (break into bins)
#We break into discrete states because decimal places are too big to compute, will take long
def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    return tuple(discrete_state.astype(np.int))

In [2]:
for episode in range(EPISODES):
    done = False
    discrete_state = get_discrete_state(env.reset())

    
    #SHOW_EVERY
    if episode % SHOW_EVERY == 0:
        render = True
        print(episode)
    else:
        render = False
    while not done:

        #Get max Q-value of current state and chooses action
        action = np.argmax(q_table[discrete_state])
        new_state, reward, done, _ = env.step(action)

        #Grabs discrete state of new_state to normalize
        new_discrete_state = get_discrete_state(new_state)

        #Opens the window to visualize
        if episode % SHOW_EVERY == 0:
            env.render()
        
        #if simulation is not done - update Q table
        if not done:
            
            #Gets the maximum q-value for new_state
            max_future_q = np.max(q_table[new_discrete_state])
            
            #Get current Q value for current state and action
            current_q = q_table[discrete_state + (action,)]
            
            #Equation to get the new q-value
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            
            #Update Q table with the new Q-value
            q_table[discrete_state + (action,)] = new_q
        
        #else if, simulation ends or goal is reached
        elif new_state[0] >= env.goal_position:
            
            #q_table[discrete_state + (action,)] = reward
            #In this case, reward = 0 is 'reached goal'
            q_table[discrete_state + (action,)] = 0
        
        discrete_state = new_discrete_state
    
    env.close()       

    

NameError: name 'EPISODES' is not defined

In [3]:
(1,2) + (2,) = (1,2,2)

NameError: name 'discrete_state' is not defined