In [None]:
!pip install gym
!pip install matplotlib



In [None]:
import gym
import numpy as np

env = gym.make("MountainCar-v0")

# --- Reproducibility ---------------------------------------------------------
# Seed NumPy's global RNG so that the random Q-table initialisation and the
# epsilon-greedy draws in the training loop are repeatable after a kernel restart.
# Only NumPy is seeded here; the environment's own RNG is left untouched.
SEED = 42
np.random.seed(SEED)

# --- Q-learning hyperparameters ----------------------------------------------
LEARNING_RATE = 0.1   # weight given to the new estimate in each Q update
DISCOUNT = 0.95       # weight of the best future Q-value in the update target
EPISODES = 2500       # number of training episodes
SHOW_EVERY = 100      # render and print progress every this many episodes

# --- State discretisation ----------------------------------------------------
# 20 buckets per observation dimension; discrete_os_win_size is the width of one
# bucket along each dimension.
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# --- Exploration settings ----------------------------------------------------
epsilon = 1  # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# One Q-value per (discretised state, action), initialised uniformly in [-2, 0).
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array with one entry per observation dimension.

    Returns:
        tuple: One integer bucket index per observation dimension. Indexing
        ``q_table`` with this tuple yields the Q-values of all actions for
        that discretised state.
    """
    buckets = (state - env.observation_space.low) / discrete_os_win_size
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; it was only an
    # alias of the builtin `int`, which is used here instead.
    indices = buckets.astype(int)
    # An observation sitting exactly on the upper bound divides to
    # DISCRETE_OS_SIZE, i.e. one past the last bucket. Clamp so the Q-table
    # lookup can never go out of range (or wrap around via a negative index).
    indices = np.clip(indices, 0, np.asarray(DISCRETE_OS_SIZE) - 1)
    return tuple(indices)

# Running sum of the rewards of *all* episodes finished so far (cumulative,
# not a per-episode figure); this is the value printed every SHOW_EVERY episodes.
total_reward = 0

# Read the goal once from the unwrapped environment so the lookup does not rely
# on wrappers forwarding attribute access.
goal_position = env.unwrapped.goal_position

try:
    for episode in range(EPISODES):
        # Classic gym returns just the observation from reset();
        # gym >= 0.26 returns an (observation, info) tuple.
        reset_result = env.reset()
        initial_state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        discrete_state = get_discrete_state(initial_state)
        done = False
        episode_reward = 0  # reward accumulated within this episode only

        # Render and report progress on every SHOW_EVERY-th episode.
        render = episode % SHOW_EVERY == 0
        if render:
            print(f"Episode {episode}, Total Reward {total_reward}")

        while not done:
            # Epsilon-greedy action selection.
            if np.random.random() > epsilon:
                # Exploit: best known action for the current state.
                action = np.argmax(q_table[discrete_state])
            else:
                # Explore: uniformly random action.
                action = np.random.randint(0, env.action_space.n)

            # Classic gym returns (obs, reward, done, info); gym >= 0.26 returns
            # (obs, reward, terminated, truncated, info).
            step_result = env.step(action)
            if len(step_result) == 5:
                new_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                new_state, reward, done, _ = step_result
            episode_reward += reward

            new_discrete_state = get_discrete_state(new_state)

            if render:
                env.render()

            if not done:
                # Standard Q-learning update towards reward + discounted best future value.
                max_future_q = np.max(q_table[new_discrete_state])
                current_q = q_table[discrete_state + (action,)]
                new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
                q_table[discrete_state + (action,)] = new_q
            elif new_state[0] >= goal_position:
                # Goal reached: there is no future value, so the Q-value of the
                # final transition is just the reward the environment returned.
                q_table[discrete_state + (action,)] = reward
            # Episodes that end without reaching the goal leave the table untouched.

            discrete_state = new_discrete_state

        total_reward += episode_reward

        # Decay epsilon once per episode inside the (inclusive) decay window.
        # The window contains one more episode than the divisor used for
        # epsilon_decay_value, so clamp at 0 to keep epsilon a valid probability.
        if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
            epsilon = max(0.0, epsilon - epsilon_decay_value)
finally:
    # Always release the environment (and any render window), even if training
    # is interrupted or raises.
    env.close()


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return tuple(discrete_state.astype(np.int))  # we use this tuple to look up the 3 Q values for the available actions in the q-table


Episode 0, Total Reward 0
Episode 100, Total Reward -20000.0
Episode 200, Total Reward -40000.0
Episode 300, Total Reward -60000.0
Episode 400, Total Reward -80000.0
Episode 500, Total Reward -100000.0
Episode 600, Total Reward -120000.0
Episode 700, Total Reward -140000.0
Episode 800, Total Reward -160000.0
Episode 900, Total Reward -179972.0
Episode 1000, Total Reward -199972.0
Episode 1100, Total Reward -219966.0
Episode 1200, Total Reward -239707.0
Episode 1300, Total Reward -259214.0
Episode 1400, Total Reward -277533.0
Episode 1500, Total Reward -296649.0
Episode 1600, Total Reward -316564.0
Episode 1700, Total Reward -336275.0
Episode 1800, Total Reward -355583.0
Episode 1900, Total Reward -374860.0
Episode 2000, Total Reward -394137.0
Episode 2100, Total Reward -413171.0
Episode 2200, Total Reward -432037.0
Episode 2300, Total Reward -449219.0
Episode 2400, Total Reward -467178.0


In [None]:
# Inspect the learned Q-table: one axis per discretised observation dimension plus
# a final axis holding the per-action Q-values. NumPy abbreviates the printout
# of a large array with "...".
print(q_table)

[[[-1.86453443e+00 -1.20199128e-01 -1.71365996e+00]
  [-1.08163019e+00 -3.85440654e-01 -7.11909801e-02]
  [-1.27468243e+01 -1.28780235e+01 -1.26667057e+01]
  ...
  [-6.72229030e-01 -1.42695717e+00 -6.72530775e-01]
  [-1.19145154e+00 -1.26830679e+00 -1.67767871e+00]
  [-1.82502446e+00 -1.87941135e+00 -1.51429879e+00]]

 [[-1.36748677e+00 -8.95473305e-01 -1.37185111e+00]
  [-5.60634602e+00 -5.26379366e+00 -5.47159423e+00]
  [-1.21985972e+01 -1.18176018e+01 -1.20296225e+01]
  ...
  [-7.24310457e-01 -1.90952085e+00 -6.60787796e-01]
  [-5.81838832e-01 -1.14494814e+00 -7.35246804e-01]
  [-5.09658313e-01 -6.88854538e-01 -5.69446030e-01]]

 [[-1.24391783e+00 -2.32475138e-01 -1.47074683e+00]
  [-6.16388867e+00 -6.07084297e+00 -5.69219857e+00]
  [-1.20075513e+01 -1.27431026e+01 -1.29177017e+01]
  ...
  [-3.20530831e-01 -1.33794092e+00 -7.96830244e-01]
  [-3.94104126e-01 -9.47768606e-01 -1.53602281e+00]
  [-1.15430249e+00 -7.55562327e-01 -7.15514340e-03]]

 ...

 [[-6.28450787e-01 -1.39055702e+00