In [None]:
import gym
import numpy as np

# --- Environment ------------------------------------------------------------
env = gym.make("MountainCar-v0")

# --- Learning hyper-parameters ----------------------------------------------
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 2500
SHOW_EVERY = 100  # every SHOW_EVERY-th episode is rendered and logged

# --- State discretisation ---------------------------------------------------
# 20 buckets along every dimension of the observation vector.
DISCRETE_OS_SIZE = [20 for _ in env.observation_space.high]
# Width of one bucket along each observation dimension.
observation_span = env.observation_space.high - env.observation_space.low
discrete_os_win_size = observation_span / DISCRETE_OS_SIZE

# --- Exploration schedule ---------------------------------------------------
epsilon = 1  # not a constant: the training loop lowers it episode by episode
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
decay_span = END_EPSILON_DECAYING - START_EPSILON_DECAYING
epsilon_decay_value = epsilon / decay_span

# --- Q-table ----------------------------------------------------------------
# One entry per (bucket per observation dimension..., action), initialised
# uniformly at random in [-2, 0).
q_table_shape = DISCRETE_OS_SIZE + [env.action_space.n]
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)

def get_discrete_state(state, low=None, win_size=None, bins=None):
    """Convert a continuous observation into a tuple of Q-table bucket indices.

    Args:
        state: Continuous observation (array-like, one value per dimension).
        low: Lower bound of each observation dimension. Defaults to
            ``env.observation_space.low``.
        win_size: Bucket width for each dimension. Defaults to the
            module-level ``discrete_os_win_size``.
        bins: Number of buckets per dimension. Defaults to the module-level
            ``DISCRETE_OS_SIZE``.

    Returns:
        A tuple of Python ints, one per observation dimension, each in
        ``[0, bins[i] - 1]``. It is used to look up the Q values of the
        available actions in the q-table.
    """
    if low is None:
        low = env.observation_space.low
    if win_size is None:
        win_size = discrete_os_win_size
    if bins is None:
        bins = DISCRETE_OS_SIZE

    scaled = (np.asarray(state) - low) / win_size
    # `np.int` was deprecated in NumPy 1.20 and later removed; the built-in
    # `int` is the documented replacement.
    # Clip so that an observation sitting exactly on the upper bound (which
    # would otherwise produce index == bins) still lands in the last bucket,
    # and anything below the lower bound lands in the first one.
    indices = np.clip(scaled.astype(int), 0, np.asarray(bins) - 1)
    return tuple(int(i) for i in indices)

def choose_action(state_index):
    """Pick an action for the discretised state with an epsilon-greedy policy.

    With probability ``epsilon`` (read from module scope at call time) a
    uniformly random action is returned; otherwise the action with the
    highest Q value for ``state_index`` is returned.
    """
    if np.random.random() > epsilon:
        return np.argmax(q_table[state_index])
    return np.random.randint(0, env.action_space.n)


# Running sum of the rewards collected over ALL episodes so far (cumulative,
# not per-episode); this is the value shown in the progress line.
total_reward = 0

try:
    for episode in range(EPISODES):
        discrete_state = get_discrete_state(env.reset())
        done = False
        episode_reward = 0  # reward collected during this episode only

        # Render and log every SHOW_EVERY-th episode.
        render = episode % SHOW_EVERY == 0
        if render:
            print(f"Episode {episode}, Total Reward {total_reward}")

        # SARSA is on-policy: the first action is chosen before the step loop
        # and each subsequent action is chosen before the update that uses it.
        action = choose_action(discrete_state)

        while not done:
            new_state, reward, done, _ = env.step(action)
            episode_reward += reward
            new_discrete_state = get_discrete_state(new_state)

            if render:
                env.render()

            next_action = choose_action(new_discrete_state)

            # `done` is also raised when the episode hits the step limit, so
            # the goal check is needed to tell a real terminal state apart.
            # In MountainCar the first observation component is the position.
            reached_goal = done and new_state[0] >= env.unwrapped.goal_position

            # SARSA update: Q(s,a) += lr * (r + gamma * Q(s',a') - Q(s,a)).
            # A terminal state has no future return, so do not bootstrap
            # from it once the goal has been reached.
            current_q = q_table[discrete_state + (action,)]
            if reached_goal:
                next_q = 0.0
            else:
                next_q = q_table[new_discrete_state + (next_action,)]
            new_q = current_q + LEARNING_RATE * (reward + DISCOUNT * next_q - current_q)
            q_table[discrete_state + (action,)] = new_q

            discrete_state = new_discrete_state
            action = next_action

        total_reward += episode_reward

        # Decay epsilon once per episode while inside the decay window. The
        # window is inclusive on both ends, which is one step more than
        # epsilon_decay_value was sized for, so clamp at zero to keep epsilon
        # a valid probability.
        if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
            epsilon = max(0.0, epsilon - epsilon_decay_value)
finally:
    # Release the environment (and any render window) even if training is
    # interrupted or raises.
    env.close()


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return tuple(discrete_state.astype(np.int))  # we use this tuple to look up the 3 Q values for the available actions in the q-table


Episode 0, Total Reward 0
Episode 100, Total Reward -20000.0
Episode 200, Total Reward -40000.0
Episode 300, Total Reward -60000.0
Episode 400, Total Reward -80000.0
Episode 500, Total Reward -100000.0
Episode 600, Total Reward -120000.0
Episode 700, Total Reward -140000.0
Episode 800, Total Reward -160000.0
Episode 900, Total Reward -179936.0
Episode 1000, Total Reward -199874.0
Episode 1100, Total Reward -219628.0
Episode 1200, Total Reward -238559.0
Episode 1300, Total Reward -257777.0
Episode 1400, Total Reward -276326.0
Episode 1500, Total Reward -294332.0
Episode 1600, Total Reward -314095.0
Episode 1700, Total Reward -333883.0
Episode 1800, Total Reward -353002.0
Episode 1900, Total Reward -371373.0
Episode 2000, Total Reward -388745.0
Episode 2100, Total Reward -407438.0
Episode 2200, Total Reward -426646.0
Episode 2300, Total Reward -444533.0
Episode 2400, Total Reward -461412.0


In [None]:
# Print the Q-table filled in by the training cell above. NumPy abbreviates
# the middle of large arrays with "..." in the printed output.
print(q_table)

[[[-1.79023553e+00 -2.63200727e-01 -2.26075042e-01]
  [-9.92056427e-01 -1.93049391e+00 -1.39927523e+00]
  [-1.45153166e+01 -1.45782390e+01 -1.43685590e+01]
  ...
  [-1.08961471e+00 -7.12270694e-01 -1.23964780e+00]
  [-3.92735203e-01 -2.28641920e-02 -2.76197621e-01]
  [-8.53430660e-01 -1.57872010e+00 -7.53769633e-01]]

 [[-1.10832119e+00 -1.67320397e+00 -1.91294738e+00]
  [-6.63893989e+00 -6.30287324e+00 -6.10189696e+00]
  [-1.41670279e+01 -1.42733567e+01 -1.42889844e+01]
  ...
  [-1.41315442e+00 -1.86849934e+00 -2.92183000e-01]
  [-1.02457439e+00 -1.54569550e+00 -1.01094059e+00]
  [-1.35712736e+00 -5.37574525e-01 -3.52467070e-01]]

 [[-2.34053438e-01 -7.39302679e-01 -8.23071422e-01]
  [-9.98686421e+00 -1.04616794e+01 -1.04601871e+01]
  [-1.40549372e+01 -1.40738238e+01 -1.42524862e+01]
  ...
  [-1.65683197e-01 -5.95759243e-01 -5.99871868e-01]
  [-7.16426716e-01 -1.71989065e+00 -4.89258594e-01]
  [-2.41028839e-01 -9.05894769e-02 -9.20955869e-01]]

 ...

 [[-1.91994604e+00 -1.05093052e+00