In [1]:
import gym
import numpy as np

In [2]:
# --- Tile coding -----------------------------------------------------------
# Tiny offset added before flooring so that values sitting on a grid line are
# not pushed into the previous cell by floating-point rounding.
EPS = 1e-7
# A tile spans 1 / SPACE_TO_TILE of each state dimension's range.
SPACE_TO_TILE = 6
# Neighbouring tiles are offset by 1 / INTERSECTING_TILES of a tile, so this
# many tiles overlap at an interior point along each dimension.
INTERSECTING_TILES = 3

# --- Q-learning ------------------------------------------------------------
# Discount factor, used in the TD target and in the shaping term.
GAMMA = 0.95
# Step size applied to the TD error of every touched tile.
LEARNING_RATE = 0.005
# Probability of picking a uniformly random action during training.
EXPLORATION_PROBABILITY = 0.15
# Scale of the shaping bonus added to the environment reward.
REWARD_SHAPING_WEIGHT = 100

# --- Reporting -------------------------------------------------------------
# A greedy evaluation episode is run and printed every EVAL_PERIOD episodes.
EVAL_PERIOD = 100

class MCAgent():
    """Q-learning agent for gym's ``MountainCar-v0`` with a tile-coded Q-table.

    Every state dimension is covered by overlapping tiles: a tile spans
    ``1 / SPACE_TO_TILE`` of the dimension's range and neighbouring tiles are
    offset by ``1 / INTERSECTING_TILES`` of a tile, so an interior point lies
    in ``INTERSECTING_TILES`` tiles per dimension.  The Q-value of a
    state/action pair is the mean of the table entries of all tiles touching
    the state, and a learning step adds the same increment to each of them.

    The code uses the classic gym API: ``reset()`` returns the observation and
    ``step()`` returns ``(observation, reward, done, info)``.
    """

    def __init__(self):
        """Create the training environment and an all-zero Q-table."""
        self.env = gym.make('MountainCar-v0')
        self.state_space_low = self.env.observation_space.low
        self.state_space_high = self.env.observation_space.high
        self.state_space_sizes = self.state_space_high - self.state_space_low
        self.tile_sizes = self.state_space_sizes / SPACE_TO_TILE
        self.tile_shifts = self.tile_sizes / INTERSECTING_TILES
        # Number of tiles per dimension such that the last tile ends at the
        # upper bound; EPS keeps the floor division from rounding down a
        # quotient that is an integer up to floating-point error.
        self.tiling_dims = ((self.state_space_sizes - self.tile_sizes + self.tile_shifts + EPS) // self.tile_shifts
                           ).astype(np.int32)
        self.n_actions = self.env.action_space.n
        self.q_space_shape = tuple(self.tiling_dims) + (self.n_actions,)
        # Multiplying an offset from the lower bound by this converts it into
        # units of tile shifts ("cells").
        self.coord_multipliers = 1 / self.tile_shifts
        self.qs = np.zeros(self.q_space_shape)

    def _tile_touch_borders(self, coords):
        """Return the half-open tile index range touching ``coords``.

        Args:
            coords: array with one coordinate per state dimension.

        Returns:
            ``(lower_border, upper_border)`` integer arrays; along dimension
            ``d`` the touched tiles are ``lower_border[d]:upper_border[d]``.
            ``upper_border`` may exceed the number of tiles near the upper
            edge of the state space; slicing truncates it.
        """
        # Index of the tile_shift-wide cell that contains each coordinate.
        cell = ((coords - self.state_space_low + EPS) * self.coord_multipliers).astype(np.int32)
        # Keep the cell inside the grid.  Without this a coordinate sitting
        # exactly on the upper bound selects an empty tile range (whose mean
        # is NaN and would poison the Q-table), and a coordinate below the
        # lower bound yields a non-positive upper border that slicing would
        # interpret as an index counted from the end.
        last_cell = self.tiling_dims + INTERSECTING_TILES - 2
        cell = np.clip(cell, 0, last_cell)
        upper_border = cell + 1
        lower_border = np.maximum(upper_border - INTERSECTING_TILES, 0)
        return lower_border, upper_border

    def _touched_tiles_qs(self, state):
        """Return a view of the Q-table restricted to the tiles touching ``state``.

        The result is a basic slice of ``self.qs``, so in-place updates of it
        modify the table itself.
        """
        down, up = self._tile_touch_borders(state)
        return self.qs[down[0]:up[0], down[1]:up[1]]

    def _action_qs(self, state):
        """Return one Q-value per action: the mean over all touched tiles."""
        return self._touched_tiles_qs(state).mean(axis=(0, 1))

    def _choose_action(self, state, exploration_probability):
        """Pick an action epsilon-greedily.

        With probability ``exploration_probability`` a uniformly random action
        is returned, otherwise the action with the largest Q-value.
        """
        explore = np.random.binomial(1, exploration_probability)
        if explore:
            return np.random.randint(self.n_actions)
        return np.argmax(self._action_qs(state))

    def train(self, iterations):
        """Run ``iterations`` training episodes of Q-learning on ``self.env``.

        After every ``EVAL_PERIOD`` episodes one greedy evaluation episode is
        played via ``run()`` and its step count is printed.
        """
        for i in range(iterations):
            state = self.env.reset()
            done = False
            while not done:
                action = self._choose_action(state, EXPLORATION_PROBABILITY)
                new_state, reward, done, _ = self.env.step(action)
                # Shaping bonus built from state component 1 (the car's
                # velocity in MountainCar): discounted |v'| minus |v|.
                shaped_reward = reward + (GAMMA * abs(new_state[1]) - abs(state[1])) * REWARD_SHAPING_WEIGHT
                touched_tiles = self._touched_tiles_qs(state)
                # NOTE(review): the target bootstraps from new_state even when
                # `done` is True; for a genuinely terminal state the usual
                # Q-learning target drops that term -- confirm this is intended.
                q_diff = shaped_reward + GAMMA * self._action_qs(new_state).max() - self._action_qs(state)[action]
                # touched_tiles is a view, so this updates self.qs in place.
                touched_tiles[:, :, action] += LEARNING_RATE * q_diff
                state = new_state
            if (i + 1) % EVAL_PERIOD == 0:
                print(self.run())

    def run(self, show=False):
        """Play one greedy episode in a fresh environment.

        Args:
            show: when true, render the environment after reset and after
                every step.

        Returns:
            The number of steps taken until the environment reported ``done``.
        """
        env = gym.make('MountainCar-v0')
        try:
            done = False
            steps = 0
            state = env.reset()
            if show:
                env.render()
            while not done:
                action = self._choose_action(state, 0)
                state, _, done, _ = env.step(action)
                steps += 1
                if show:
                    env.render()
        finally:
            # Release the environment (and any render window) even if
            # stepping or rendering raised.
            env.close()
        return steps

In [3]:
# Build a fresh agent and train it; a greedy evaluation result (steps needed
# to finish an episode) is printed every EVAL_PERIOD episodes.
model = MCAgent()
model.train(iterations=2000)

200
200
200
200
200
200
200
200
200
200
163
200
200
183
128
162
157
168
152
180


In [4]:
# Play one rendered greedy episode; the cell displays the number of steps taken.
model.run(show=True)

181