In [7]:
import copy
import gymnasium as gym
import numpy as np
from datetime import datetime, timedelta

In [8]:
class Config:
    """Settings shared by the notebook cells: environment, table size, schedule."""

    def __init__(self):
        # Environment id handed to gym.make and the number of Q-table columns.
        self.env_name = "CartPole-v1"
        self.num_action = 2

        # Bins per state dimension; the Q-table has n_bins ** 4 rows.
        self.n_bins = 10

        # Number of training episodes, discount factor and learning rate.
        self.total_episode = 10000
        self.gamma = 0.99
        self.lr = 0.1

        # Exploration schedule: start value, floor, and the number of decay
        # applications used to go from one to the other (half of the run).
        self.epsilon = 1.0
        self.eps_min = 0.05
        self.decay_step = int(self.total_episode * 0.5)


config = Config()

In [9]:
import numpy as np

class Discretizer:
    """Turn a continuous 4-component state into integer bin indices.

    The state is unpacked as (cart_pos, cart_vel, pole_angle, pole_vel). Each
    component is split into ``n_bins`` equal-width bins over a fixed range;
    values below or above that range land in the first or last bin. Every
    index therefore lies in ``[0, n_bins - 1]``.
    """

    def __init__(self, n_bins=6):
        """Precompute the bin edges for each state component.

        Args:
            n_bins: Number of bins per component.
        """
        self.n_bins = n_bins
        self.cart_pos_bins = self._interior_edges(-4.8, 4.8, n_bins)
        self.cart_vel_bins = self._interior_edges(-3.0, 3.0, n_bins)
        self.pole_angle_bins = self._interior_edges(-0.419, 0.419, n_bins)
        self.pole_vel_bins = self._interior_edges(-3.0, 3.0, n_bins)

    @staticmethod
    def _interior_edges(low, high, n_bins):
        """Return the ``n_bins - 1`` edges splitting [low, high] into n_bins equal bins."""
        # Dropping the two outer grid points keeps `low` and `high` inside the
        # first and last bin. Placing edges on the limits themselves would
        # reserve those two bins for out-of-range values only.
        return np.linspace(low, high, n_bins + 1)[1:-1]

    def discretize(self, state):
        """Map a state to a tuple of four bin indices.

        Args:
            state: Iterable of four numbers
                (cart_pos, cart_vel, pole_angle, pole_vel).

        Returns:
            Tuple ``(d_cart_pos, d_cart_vel, d_pole_angle, d_pole_vel)`` with
            each entry in ``[0, n_bins - 1]``.
        """
        cart_pos, cart_vel, pole_angle, pole_vel = state

        d_cart_pos = np.digitize(cart_pos, self.cart_pos_bins)
        d_cart_vel = np.digitize(cart_vel, self.cart_vel_bins)
        d_pole_angle = np.digitize(pole_angle, self.pole_angle_bins)
        d_pole_vel = np.digitize(pole_vel, self.pole_vel_bins)

        return (d_cart_pos, d_cart_vel, d_pole_angle, d_pole_vel)

    def get_state_index(self, discrete_state):
        """Flatten four bin indices into one integer in ``[0, n_bins**4 - 1]``.

        The indices are read as the digits of a base-``n_bins`` number, with
        the first component as the most significant digit.
        """
        d = discrete_state
        return ((d[0] * self.n_bins + d[1]) * self.n_bins + d[2]) * self.n_bins + d[3]

In [10]:
def select_action(Q_Table, state, epsilon):
    """Pick an action epsilon-greedily from one row of the Q-table.

    Args:
        Q_Table: 2-D array indexed as ``[state, action]``.
        state: Row index of the current state.
        epsilon: Probability of choosing a uniformly random action.

    Returns:
        A random column index with probability ``epsilon``, otherwise the
        column holding the largest value in ``Q_Table[state]``.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        return np.argmax(Q_Table[state])
    n_actions = Q_Table.shape[1]
    return np.random.randint(n_actions)

In [11]:
def decay_epsilon(epsilon, decay_per_step, eps_min=None):
    """Lower epsilon by one linear step without going below a floor.

    Args:
        epsilon: Current exploration rate.
        decay_per_step: Amount subtracted on each call.
        eps_min: Lower bound for the result. When omitted, the module-level
            ``config.eps_min`` is used, so existing two-argument calls keep
            their behaviour.

    Returns:
        ``max(epsilon - decay_per_step, eps_min)``.
    """
    if eps_min is None:
        # Only touch the global settings when the caller gave no floor.
        eps_min = config.eps_min
    return max(epsilon - decay_per_step, eps_min)

In [None]:
# SARSA training loop over the discretized state space.
discretizer = Discretizer(config.n_bins)
# One row per flattened discrete state, one column per action.
Q_Table = np.zeros((config.n_bins**4, config.num_action))
episode_rewards = []
epsilon = config.epsilon
# Epsilon is lowered once per episode (end of the loop body), so it reaches
# config.eps_min after config.decay_step episodes.
decay_per_step = (epsilon - config.eps_min) / config.decay_step

# Create the environment once and reuse it: reset() starts each new episode.
# Building a fresh one per episode never released the previous instance.
env = gym.make(config.env_name)
try:
    for episode in range(config.total_episode):
        state, _ = env.reset()
        state = discretizer.get_state_index(discretizer.discretize(state))
        done = False
        action = select_action(Q_Table, state, epsilon)

        episode_reward = 0.

        while not done:
            next_state, reward, terminated, truncated, info = env.step(action)
            next_state = discretizer.get_state_index(discretizer.discretize(next_state))
            episode_reward += reward
            done = terminated or truncated
            if not terminated:
                # Bootstrap from the next state-action pair. This also covers
                # truncation: a time-limit cutoff ends the episode but is not
                # a terminal state, so its value must not be treated as zero.
                next_action = select_action(Q_Table, next_state, epsilon)
                target = reward + config.gamma * Q_Table[next_state, next_action]
            else:
                # True terminal state: no future return to bootstrap from.
                next_action = -1
                target = reward
            Q_Table[state, action] += config.lr * (target - Q_Table[state, action])
            state = next_state
            action = next_action

        episode_rewards.append(episode_reward)
        epsilon = decay_epsilon(epsilon, decay_per_step)

        if episode % 100 == 0:
            # A [-100:] slice already copes with fewer than 100 entries.
            mean_reward = np.mean(episode_rewards[-100:])
            print(f"Episode: {episode} Rewards: {episode_reward} Mean_Rewards: {mean_reward:.4f} Epsilon: {epsilon:.4f}")
finally:
    # Release the environment even if training is interrupted.
    env.close()