In [10]:
pip install gym numpy



In [11]:
import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

In [12]:
import gym
import matplotlib.pyplot as plt

In [13]:
pip install --upgrade gym



In [14]:
env = gym.make('MountainCar-v0')

In [15]:
n_bins = (18, 14)  # position, velocity
obs_space_low = env.observation_space.low
obs_space_high = env.observation_space.high
obs_bin_width = (obs_space_high - obs_space_low) / n_bins

In [16]:
def discretize(obs):
    return tuple(((obs - obs_space_low) / obs_bin_width).astype(int))

In [17]:
alpha = 0.1      # Learning rate
gamma = 0.99     # Discount factor
epsilon = 1.0    # Initial exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
n_episodes = 10000
max_steps = 200

In [19]:
q_table = np.zeros(n_bins + (env.action_space.n,))

rewards = []

for episode in range(n_episodes):
    obs = env.reset()
    if isinstance(obs, tuple):
        obs = obs[0]

    state = discretize(obs)
    total_reward = 0

    for step in range(max_steps):
        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        next_state = discretize(next_obs)

        # Q-Learning update
        best_next_action = np.argmax(q_table[next_state])
        td_target = reward + gamma * q_table[next_state][best_next_action]
        td_error = td_target - q_table[state][action]
        q_table[state][action] += alpha * td_error

        state = next_state
        total_reward += reward

        if done:
            break

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    rewards.append(total_reward)

    if (episode + 1) % 1000 == 0:
        print(f"Episode {episode + 1}: Average Reward: {np.mean(rewards[-1000:])}")

Episode 1000: Average Reward: -197.483
Episode 2000: Average Reward: -189.302
Episode 3000: Average Reward: -172.82
Episode 4000: Average Reward: -146.123
Episode 5000: Average Reward: -145.52
Episode 6000: Average Reward: -148.158
Episode 7000: Average Reward: -143.713
Episode 8000: Average Reward: -142.166
Episode 9000: Average Reward: -158.246
Episode 10000: Average Reward: -142.983
