![Logo](assets/logo.png)

Made by **Domonkos Nagy**

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Fortuz/rl_education/blob/main/9.%20On-policy%20Control/mountain_car.ipynb)

# Mountain Car

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import trange

In [2]:
env = gym.make('MountainCar-v0')  # Creating the environment

In [23]:
LOW = env.observation_space.low
HIGH = env.observation_space.high
N_ACTIONS = env.action_space.n

print(f'N_ACTIONS: {N_ACTIONS}\n')
print('\tPos.  Vel.')
print(f'LOW:\t{LOW}')
print(f'HIGH:\t{HIGH}')

N_ACTIONS: 3

	Pos.  Vel.
LOW:	[-1.2  -0.07]
HIGH:	[0.6  0.07]


In [61]:
class TiledQTable:
    def __init__(self, offsets, n_actions, n_bins):
        self.tilings = []
        self.n_tilings = len(offsets)
        self.n_actions = n_actions
        n_dims = len(offsets[0])

        # Create tilings
        for offset in offsets:
            tiling = [np.linspace(LOW[dim], HIGH[dim], n_bins + 1)[1:-1] + offset[dim] for dim in range(n_dims)]
            self.tilings.append(tiling)

        # Initialize q-table
        shape = (self.n_tilings, ) + (n_bins, ) * n_dims + (n_actions, )
        self.q_table = np.zeros(shape)

    def __getitem__(self, coords):
        # Coords is a state-action pair
        print(len(self.q_table.shape) - len(coords))
        if len(self.q_table.shape) - len(coords) == 0:
            val = 0
            state = coords[:-1]
            action = coords[-1]

            for i, tiling in enumerate(self.tilings):
                ind = [i]
                for dim in range(len(state)):
                    ind.append(np.searchsorted(tiling[dim], state[dim]))
                val += self.q_table[tuple(ind) + (action, )]

            return val

        # Coords is a state
        elif len(self.q_table.shape) - len(coords) == 1: 
            vals = np.zeros(self.n_actions)
            state = coords

            for i, tiling in enumerate(self.tilings):
                ind = [i]
                for dim in range(len(state)):
                    ind.append(np.searchsorted(tiling[dim], state[dim]))
                vals += self.q_table[tuple(ind)]

            return vals

    def __setitem__(self, coords, new):
        old = self.__getitem__(coords)
        state = coords[:-1]
        action = coords[-1]

        for i, tiling in enumerate(self.tilings):
            ind = [i]
            for dim in range(len(coords)):
                ind.append(np.searchsorted(tiling[dim], state[dim]))
            self.q_table[tuple(ind) + (action, )] += (new - old) / self.n_tilings

    def __call__(self, *args):
        shape = args[0].shape
        z = np.zeros(shape)

        for i in np.ndindex(shape):
            x = [arg[i] for arg in args]
            z[i] = self.__getitem__(x)

        return z

In [62]:
# Hyperparameters
N_TILINGS = 8  # number of tilings
N_BINS = 8  # number of bins per dimension per tiling
N_EPISODES = 100_000  # number of learning steps
ALPHA = 0.1  # learning rate
GAMMA = 1  # discount rate
EPSILON = 0.1  # exploration rate

In [63]:
# Set asymmetrical offsets to avoid artifacts in
# generalization; see Sutton & Barto pg. 218-220 for details
tile_width = (HIGH - LOW) / N_BINS
unit = tile_width / N_TILINGS
offsets = [(unit[0] * i, 3 * unit[1] * i) for i in range(N_TILINGS)]

# Initialize tiled q-table
tqt = TiledQTable(offsets, N_ACTIONS, N_BINS)

In [64]:
# epsilon-greedy action selection
def select_action(epsilon, state):
    if np.random.rand() > epsilon:
        return np.argmax(tqt[state])
    else:
        return env.action_space.sample()

In [65]:
# Training

for episode in trange(N_EPISODES):
    state, _ = env.reset()
    done = False
    action = select_action(EPSILON, state)

    while not done:
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        target = reward

        if not done:
            new_action = select_action(EPSILON, state)
            target = reward + GAMMA * tqt[(new_state, new_action)]
        tqt[(state, action)] += ALPHA * (target - tqt[(state, action)])

        state = new_state
        action = new_action

  0%|          | 0/100000 [00:00<?, ?it/s]

2
2


TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'