## Libraries

In [1]:
import numpy as np
import gym
from amalearn.agent import AgentBase
import matplotlib.pyplot as plt
import tiles3 as tc
from IPython.display import clear_output
import sys

import gym
# Register a MountainCar variant under a custom id so the episode step limit
# and reward threshold can be chosen here instead of using the stock settings.
gym.envs.register(
    id='MountainCarMyEasyVersion-v0',
    entry_point='gym.envs.classic_control:MountainCarEnv',
    max_episode_steps=250,      # MountainCar-v0 uses 200
    reward_threshold=-110.0,
)
# NOTE: `env` is rebound by later cells (the tile-coder cell and the run cell),
# so this instance only serves as a sanity check of the registration.
env = gym.make('MountainCarMyEasyVersion-v0')
# Alternative kept for reference: the stock environment without its wrapper.
# env = gym.make('MountainCar-v0').env
# Last expression of the cell: the initial observation is shown as cell output.
env.reset()

array([-0.5151765,  0.       ], dtype=float32)

## Mountain Car Tile Coder

In [2]:
# Module-level environment handle. MountainCarTileCoder.get_tiles reads the
# state bounds from whatever `env` is bound to at call time.
env = gym.make('MountainCar-v0').env


class MountainCarTileCoder:
    """Tile coder for the two-dimensional (position, velocity) MountainCar state.

    Wraps a ``tc.IHT`` index hash table and maps a state to the indices of its
    active tiles, one per tiling.
    """

    def __init__(self, iht_size, num_tilings, num_tiles):
        """Store the tiling layout and create the index hash table.

        Args:
            iht_size: size passed to ``tc.IHT``.
            num_tilings: number of tilings handed to ``tc.tiles``.
            num_tiles: number of tiles spanning each state dimension.
        """
        self.num_tiles = num_tiles
        self.num_tilings = num_tilings
        self.iht = tc.IHT(iht_size)

    def get_tiles(self, position, velocity):
        """Return the active tile indices for a state as a numpy array.

        Each coordinate is multiplied by ``num_tiles / (max - min)`` so that
        the full range of the dimension spans ``num_tiles`` unit-width tiles.
        The bounds are read from the module-level ``env``; velocity is treated
        as symmetric in ``[-env.max_speed, env.max_speed]``.
        """
        position_span = env.max_position - env.min_position
        velocity_span = env.max_speed - (-env.max_speed)
        scaled_state = [
            position * (self.num_tiles / position_span),
            velocity * (self.num_tiles / velocity_span),
        ]
        return np.array(tc.tiles(self.iht, self.num_tilings, scaled_state))

## Sarsa Agent

In [3]:
def epsilon_greedy_policy(w, epsilon, num_actions):
    """Build an epsilon-greedy policy over linear, tile-coded action values.

    Args:
        w: weights indexable as ``w[action][tiles]``. The returned function
            keeps a reference to it, so in-place updates are seen on later calls.
        epsilon: total probability mass spread uniformly over all actions.
        num_actions: number of actions (rows of ``w`` that are read).

    Returns:
        ``policy_fn(tiles) -> (probs, action_values)``, where ``action_values[a]``
        is the sum of ``w[a]`` over the active ``tiles`` and ``probs`` gives
        ``epsilon / num_actions`` to every action plus ``1 - epsilon`` to the
        greedy one (first index on ties, as chosen by ``np.argmax``).
    """
    def policy_fn(tiles):
        # One value per action: sum of that action's weights at the active tiles.
        action_values = np.array(
            [w[a][tiles].sum() for a in range(num_actions)], dtype=float
        )
        greedy_action = np.argmax(action_values)

        # Uniform exploration mass, then the remaining mass on the greedy action.
        probs = np.ones(num_actions) * epsilon / num_actions
        probs[greedy_action] += 1 - epsilon
        return probs, action_values
    return policy_fn


In [4]:
class Agent_Sarsa(AgentBase):
    """Episodic semi-gradient SARSA agent with linear, tile-coded action values.

    Every action owns one weight row of length ``iht_size``; q(s, a) is the sum
    of the weights of ``a`` at the tile indices active in ``s``. Actions are
    drawn from an epsilon-greedy policy whose epsilon starts at 0.1 and is
    multiplied by 0.995 at the start of every episode.

    The environment must follow the classic gym API used in ``run``:
    ``reset()`` returns a ``(position, velocity)`` observation and
    ``step(action)`` returns ``(observation, reward, done, info)``.
    """

    def __init__(self, id, environment, discount, alpha, iht_size, num_tilings, num_tiles):
        """Set up hyper-parameters, the tile coder and zero-initialised weights.

        Args:
            id: agent identifier, forwarded to ``AgentBase``.
            environment: environment to interact with (also forwarded to ``AgentBase``).
            discount: discount factor applied to the bootstrapped next value.
            alpha: step size of the weight update.
            iht_size: size of the tile coder's index hash table and of each weight row.
            num_tilings: number of tilings used by the tile coder.
            num_tiles: number of tiles per state dimension.
        """
        self.env = environment
        self.alpha = alpha
        self.epsilon = 0.1
        self.discount_factor = discount
        self.num_tilings = num_tilings
        self.num_tiles = num_tiles
        self.iht_size = iht_size
        self.num_actions = 3

        self.mct = MountainCarTileCoder(iht_size, num_tilings, num_tiles)

        self.initial_weights = np.zeros((self.num_actions, iht_size))
        # Independent copy so learning never mutates the stored initial weights.
        self.w = self.initial_weights.copy()

        super(Agent_Sarsa, self).__init__(id, environment)

    def _sample_action(self, policy, tiles):
        """Draw an action from ``policy`` at ``tiles``; return ``(action, q(s, action))``."""
        probs, q_vals = policy(tiles)
        action = np.random.choice(np.arange(len(probs)), p=probs)
        return action, q_vals[action]

    def _update_weights(self, action, tiles, target):
        """Move q(s, action) towards ``target`` by one semi-gradient step.

        For a linear tile-coded value function the gradient is 1 on the active
        tiles of ``action`` and 0 elsewhere, so only those entries are touched
        instead of building a dense gradient matrix. The update is in place,
        which keeps policy closures that reference ``self.w`` up to date.
        """
        delta = target - self.w[action][tiles].sum()
        self.w[action, tiles] += self.alpha * delta

    def run(self, trail, max_time, render=False):
        """Train for ``trail`` episodes of at most ``max_time`` steps each.

        Args:
            trail: number of episodes to run.
            max_time: maximum number of environment steps per episode.
            render: if True, render every step of the final episode. The
                default keeps rendering off; the previous render condition
                compared the episode number against ``trail + 1``, which the
                episode loop never reaches, so nothing was ever rendered.

        Returns:
            A list with exactly one entry per episode: the zero-based index of
            the step on which the environment reported ``done``, or
            ``max_time`` if the episode never terminated within the budget.
        """
        step_episode = []
        for i_episode in range(1, trail + 1):
            self.epsilon *= 0.995
            # Epsilon is fixed within an episode and self.w is only modified in
            # place, so a single policy closure per episode is sufficient.
            policy = epsilon_greedy_policy(self.w, self.epsilon, self.num_actions)

            position, velocity = self.env.reset()
            active_tiles = self.mct.get_tiles(position, velocity)
            action, _ = self._sample_action(policy, active_tiles)

            # Value recorded when the step budget runs out without `done`.
            last_step = max_time
            for t in range(max_time):
                if render and i_episode == trail:
                    self.env.render()

                # Take action A, observe R, S'
                next_state, reward, done, _ = self.env.step(action)
                next_position, next_velocity = next_state
                next_active_tiles = self.mct.get_tiles(next_position, next_velocity)

                if done:
                    # S' is terminal: no bootstrapping, the target is the reward.
                    # NOTE(review): a time-limit cutoff reported as `done` is
                    # treated as terminal here as well - confirm this is intended.
                    self._update_weights(action, active_tiles, reward)
                    last_step = t
                    break

                # Choose A' from q(S', ., w) before the weights change.
                next_action, next_q_val = self._sample_action(policy, next_active_tiles)
                target = reward + self.discount_factor * next_q_val
                self._update_weights(action, active_tiles, target)

                # S <- S', A <- A'; reuse the tiles already computed for S'.
                active_tiles = next_active_tiles
                action = next_action

            step_episode.append(last_step)
            print("\rEpisode {}/{} | steps: {} | Epsilon: {} ".format(i_episode, trail, last_step, self.epsilon), end="")
            sys.stdout.flush()
        return step_episode

## Run

In [5]:
# Experiment settings shared by every configuration.
discount = 0.9
trial, max_time = [100,10000]
num_runs = 20

# Register a MountainCar id whose episode limit equals the per-episode step
# budget handed to the agent (MountainCar-v0 uses 200).
gym.envs.register(
    id='MountainCarVersion-v1',
    entry_point='gym.envs.classic_control:MountainCarEnv',
    max_episode_steps=max_time,
    reward_threshold=-1,
)
env = gym.make('MountainCarVersion-v1')
env.reset()
steps_total = []

# One row per tile-coder setup:
# (iht_size, num_tilings, num_tiles, alpha, legend label).
# The position in this list is also the agent id.
configurations = [
    (4096, 2, 16, 0.1, "num_tiles: 16, num_tilings: 2"),
    (4096, 32, 4, 0.0005, "num_tiles: 4, num_tilings: 32"),
    (4096, 8, 8, 0.012, "num_tiles: 8, num_tilings: 8"),
]

for r in range(num_runs):
    # A fresh agent is trained for every configuration in every run.
    steps_per_episode = []
    for agent_id, (iht_size, num_tilings, num_tiles, alpha, _label) in enumerate(configurations):
        clear_output(wait=True)
        print('\nRun: ' , r, ' | Sarsa Agent | ','iht_size: ' ,iht_size, ' | num_tilings: ', num_tilings,' | num_tiles: ' ,num_tiles, '\n')
        agent_sarsa = Agent_Sarsa(agent_id, env, discount, alpha, iht_size, num_tilings, num_tiles)
        steps_per_episode.append(agent_sarsa.run(trial, max_time))
    steps_total.append(steps_per_episode)

# Average over runs: the result has one row per configuration and one column
# per episode.
mean_steps = np.mean(steps_total, axis=0)
for curve, (_, _, _, _, curve_label) in zip(mean_steps, configurations):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.title("SARSA Agent steps per episode")
plt.ylabel('Steps per Episode')
plt.xlabel('Episode')
plt.ylim(0, 800)
plt.show()


Run:  1  | Sarsa Agent |  iht_size:  4096  | num_tilings:  32  | num_tiles:  4 

Episode 1/100 | steps: 4481 | Epsilon: 0.0995 