In [None]:
#Environment gym 
import numpy as np
import gym
from gym import wrappers
# Seed NumPy's global RNG so the np.random draws used for action selection are repeatable.
np.random.seed(0)


In [None]:
# --- Discretisation and training budget ---
n_states = 5      # bins per observation dimension
EPISODE = 10000   # number of training episodes
TIMESTEP = 1000   # upper bound on steps per episode

# --- Learning hyper-parameters ---
initial_lr = 1.0      # learning rate at the start of training
min_lr = 0.003        # floor the learning rate never drops below
discount_rate = 0.9   # discount applied to future rewards
eps = 0.02            # probability of picking a uniformly random action

# --- Environment ---
env = gym.make('MountainCar-v0')
env.seed(0)

# One Q-value per (position bin, velocity bin, action), initialised to zero.
q_table = np.zeros((n_states, n_states, env.action_space.n))
 

In [None]:
def train():
    """Run tabular Q-learning and update the global ``q_table`` in place.

    Plays ``EPISODE`` episodes of at most ``TIMESTEP`` steps each.  On every
    step, with probability ``eps`` a uniformly random action is taken;
    otherwise an action is sampled from a softmax over the Q-values of the
    current discretised state.  After each transition the visited Q-value is
    moved towards ``reward + discount_rate * max_a Q(next_state, a)``.

    The learning rate starts at ``initial_lr``, is multiplied by 0.85 every
    100 episodes, and never drops below ``min_lr``.

    Prints the undiscounted total reward of every episode.  Returns nothing.
    """
    n_actions = env.action_space.n

    for episode in range(EPISODE):
        obs = env.reset()
        episode_return = 0
        # Step-wise decay: constant within a block of 100 episodes.
        lr = max(min_lr, initial_lr * (0.85 ** (episode // 100)))

        for _ in range(TIMESTEP):
            state = obs_to_state(obs)

            # Action selection: rare uniform exploration, otherwise softmax sampling.
            explore = np.random.uniform(0, 1) < eps
            if explore:
                action = np.random.choice(n_actions)
            else:
                exp_q = np.exp(q_table[state])
                action = np.random.choice(n_actions, p=exp_q / np.sum(exp_q))

            obs, reward, done, info = env.step(action)
            episode_return += reward

            # Q-learning update for the (state, action) cell just visited.
            next_state = obs_to_state(obs)
            cell = state + (action,)
            td_target = reward + discount_rate * np.max(q_table[next_state])
            q_table[cell] = q_table[cell] + lr * (td_target - q_table[cell])

            if done:
                break

        print('Iteration #{} -- Total reward = {}.'.format(episode + 1, episode_return))


In [None]:
def obs_to_state(obs, low=None, high=None, bins=None):
    """Map a continuous observation to a pair of discrete bin indices.

    Each of the first two observation components is placed into one of
    ``bins`` equal-width intervals spanning ``[low, high]`` for that
    component.

    Args:
        obs: Indexable observation; ``obs[0]`` is treated as position and
            ``obs[1]`` as velocity.
        low: Per-component lower bounds. Defaults to
            ``env.observation_space.low``.
        high: Per-component upper bounds. Defaults to
            ``env.observation_space.high``.
        bins: Number of bins per component. Defaults to ``n_states``.

    Returns:
        Tuple ``(position_bin, velocity_bin)`` of Python ints, each in
        ``range(bins)``.
    """
    if low is None:
        low = env.observation_space.low
    if high is None:
        high = env.observation_space.high
    if bins is None:
        bins = n_states

    def _bin(value, lo, hi):
        width = (hi - lo) / bins
        raw = int((value - lo) / width)
        # A value equal to the upper bound would land on index `bins`, and a
        # value below the lower bound could go negative; clamp both so the
        # result is always a valid index into a table with `bins` entries.
        return min(max(raw, 0), bins - 1)

    a = _bin(obs[0], low[0], high[0])  # position
    b = _bin(obs[1], low[1], high[1])  # velocity
    return a, b

In [None]:
def run(policy=None, render=True):
    """Play a single episode and return its discounted return.

    Args:
        policy: Lookup indexed by the discrete state returned by
            ``obs_to_state`` that yields the action to take.  When ``None``,
            actions are sampled from ``env.action_space``.
        render: When true (the default, matching the previous behaviour),
            ``env.render()`` is called before every step.

    Returns:
        Sum over steps of ``discount_rate ** step_idx * reward``, where
        ``step_idx`` starts at 0.  The episode stops when the environment
        reports ``done`` or after ``TIMESTEP`` steps.
    """
    obs = env.reset()
    total_reward = 0
    for step_idx in range(TIMESTEP):
        if render:
            env.render()

        if policy is None:
            action = env.action_space.sample()
        else:
            action = policy[obs_to_state(obs)]

        obs, reward, done, info = env.step(action)
        # Fixed: this used the undefined name `discount_factor`; the discount
        # configured for this notebook is `discount_rate`.
        total_reward += discount_rate ** step_idx * reward

        if done:
            break

    return total_reward

In [None]:
if __name__ == '__main__':
    train()

    # Greedy policy: for every (position bin, velocity bin) pick the action
    # with the largest learned Q-value.
    solution_policy = np.argmax(q_table, axis=2)
    # Fixed: the "Solution policy" heading used to be followed by the raw
    # Q-table; show the policy under its heading and label the Q-table.
    print("Solution policy")
    print(solution_policy)
    print("Q-table")
    print(q_table)

    # Score the greedy policy over 100 episodes.
    solution_policy_scores = [run(policy=solution_policy) for _ in range(100)]
    print("Average score of solution = ", np.mean(solution_policy_scores))

    # Animate one more episode
    run(policy=solution_policy)