In [None]:
import gym
import numpy as np
import math
import random
import matplotlib.pyplot as plt

In [None]:
# Create the CartPole-v1 environment used by every cell below.
# render_mode="human" asks gym to draw each step on screen (presumably much
# slower than headless training -- drop the argument to train without a window).
env = gym.make("CartPole-v1", render_mode="human")

In [None]:
# Number of buckets per observation dimension. The Q-table has one axis per
# dimension plus a trailing axis indexed by action.
discretization_segments = (2, 2, 2, 2)
NUM_ACTIONS = env.action_space.n
q_table = np.zeros((*discretization_segments, NUM_ACTIONS))

# Hand-picked, symmetric clipping ranges for the two velocity components.
cart_vel_max = 1.25
cart_vel_min = -cart_vel_max
pole_ang_vel_max = np.radians(45)
pole_ang_vel_min = -pole_ang_vel_max

# Start from the environment's own (low, high) pair for every dimension, then
# replace index 1 (cart velocity) and index 3 (pole angular velocity) with the
# ranges above -- presumably because the environment's limits for those two
# are too wide to bucket usefully; verify against the environment docs.
state_bounds = [
    (lower, upper)
    for lower, upper in zip(env.observation_space.low, env.observation_space.high)
]
state_bounds[1] = [cart_vel_min, cart_vel_max]
state_bounds[3] = [pole_ang_vel_min, pole_ang_vel_max]

# Floors used by get_learning_rate / get_explore_rate.
LEARNING_RATE_MIN = 0.1
EXPLORE_RATE_MIN = 0.01

In [None]:
def get_explore_rate(t, min_rate=None, max_rate=1, decay=25):
    """Return the exploration probability for index ``t``.

    The schedule is ``1 - log10((t + 1) / decay)`` clamped to the interval
    ``[min_rate, max_rate]``, so it decreases logarithmically as ``t`` grows.

    Args:
        t: Non-negative schedule index (the training loop passes the episode
            number).
        min_rate: Lower clamp. ``None`` (the default) uses the module-level
            ``EXPLORE_RATE_MIN``.
        max_rate: Upper clamp. Defaults to 1.
        decay: Positive time scale of the schedule; larger values keep the
            rate high for longer. Defaults to 25.

    Returns:
        The clamped exploration rate.

    Raises:
        ValueError: If ``decay`` is not positive.
    """
    if decay <= 0:
        raise ValueError("decay must be positive")
    if min_rate is None:
        # Resolved at call time so the module constant can be changed later.
        min_rate = EXPLORE_RATE_MIN
    return max(min_rate, min(max_rate, 1.0 - math.log10((t + 1) / decay)))

In [None]:
def get_learning_rate(t, min_rate=None, max_rate=0.5, decay=25):
    """Return the Q-learning step size for index ``t``.

    The schedule is ``1 - log10((t + 1) / decay)`` clamped to the interval
    ``[min_rate, max_rate]``, so it decreases logarithmically as ``t`` grows.

    Args:
        t: Non-negative schedule index (the training loop passes the episode
            number).
        min_rate: Lower clamp. ``None`` (the default) uses the module-level
            ``LEARNING_RATE_MIN``.
        max_rate: Upper clamp. Defaults to 0.5.
        decay: Positive time scale of the schedule; larger values keep the
            rate high for longer. Defaults to 25.

    Returns:
        The clamped learning rate.

    Raises:
        ValueError: If ``decay`` is not positive.
    """
    if decay <= 0:
        raise ValueError("decay must be positive")
    if min_rate is None:
        # Resolved at call time so the module constant can be changed later.
        min_rate = LEARNING_RATE_MIN
    return max(min_rate, min(max_rate, 1.0 - math.log10((t + 1) / decay)))

In [None]:
def select_action(state, explore_rate):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Tuple of bucket indices used to index the module-level
            ``q_table``.
        explore_rate: Probability of taking a random action instead of the
            greedy one.

    Returns:
        A sample from ``env.action_space`` when exploring, otherwise the index
        of the largest Q-value stored for ``state``.
    """
    exploring = random.random() < explore_rate
    if exploring:
        return env.action_space.sample()
    # Greedy choice: best known action for this discretized state.
    return np.argmax(q_table[state])

In [None]:
def state_to_bucket(state):
    """Convert a continuous observation into a tuple of bucket indices.

    Component ``i`` is compared against ``state_bounds[i]``: values at or
    below the lower bound map to bucket 0, values at or above the upper bound
    map to the last bucket, and anything in between is scaled linearly onto
    ``0 .. discretization_segments[i] - 1`` and rounded to the nearest index.

    Args:
        state: Sequence of observation components, one per entry of
            ``state_bounds``.

    Returns:
        Tuple of integer bucket indices, usable as an index into ``q_table``.
    """
    buckets = []

    for i, value in enumerate(state):
        lower, upper = state_bounds[i][0], state_bounds[i][1]

        if value <= lower:
            idx = 0
        elif value >= upper:
            idx = discretization_segments[i] - 1
        else:
            last = discretization_segments[i] - 1
            span = upper - lower
            # Linear map lower -> 0 and upper -> last, then round.
            shift = last * lower / span
            factor = last / span
            idx = int(round(factor * value - shift))

        buckets.append(idx)

    return tuple(buckets)

In [None]:
def simulate(num_episodes=100, max_steps=250, discount_factor=0.99,
             solved_steps=200, streak_to_end=120):
    """Train the module-level ``q_table`` on ``env`` with tabular Q-learning.

    Relies on the module-level ``env``, ``q_table``, ``state_to_bucket``,
    ``select_action``, ``get_explore_rate`` and ``get_learning_rate``.
    ``q_table`` is updated in place.

    Args:
        num_episodes: Maximum number of episodes to play.
        max_steps: Step cap per episode.
        discount_factor: Discount applied to the bootstrapped next-state value.
        solved_steps: An episode lasting at least this many steps extends the
            success streak; a shorter one resets it.
        streak_to_end: Training stops early once the streak exceeds this value.

    Returns:
        list: Total reward of every episode played, in order. Episodes that
        hit the step cap are included.
    """
    num_streaks = 0
    reward_sum_per_episode = []

    for episode in range(num_episodes):
        # Both rates depend only on the episode index, so compute them once
        # per episode rather than on every step.
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

        episode_rewards = []
        (observ, _) = env.reset()
        state_0 = state_to_bucket(observ)
        steps_taken = 0

        for t in range(max_steps):
            env.render()

            action = select_action(state_0, explore_rate)
            (observ, reward, terminated, truncated, _) = env.step(action)
            episode_rewards.append(reward)
            state = state_to_bucket(observ)
            steps_taken = t + 1

            # A terminated episode has no future return, so do not bootstrap
            # from the next state. A time-limit truncation still bootstraps.
            best_q = 0.0 if terminated else np.amax(q_table[state])
            td_target = reward + discount_factor * best_q
            q_table[state_0 + (action,)] += learning_rate * (
                td_target - q_table[state_0 + (action,)]
            )

            state_0 = state

            if terminated or truncated:
                break

        # Bookkeeping runs for every episode, including those that reached
        # max_steps without the environment signalling an end.
        print("Episode %d finished after %d time steps" % (episode, steps_taken))
        reward_sum_per_episode.append(np.sum(episode_rewards))

        if steps_taken >= solved_steps:
            num_streaks += 1
        else:
            num_streaks = 0

        # Stop training (not just the current episode) after a long streak.
        if num_streaks > streak_to_end:
            break

    return reward_sum_per_episode

In [None]:
# Run training. `reward` receives simulate()'s list of per-episode reward
# totals, which the plotting cell at the end of the notebook reads.
reward = simulate()

In [None]:
# Training is finished: shut the environment down.
env.close()

In [None]:
# Show the learned Q-values; the array has shape
# discretization_segments + (NUM_ACTIONS,).
print(q_table)

In [None]:
# Learning curve: total reward collected in each training episode.
# The explicit fig/ax interface keeps the figure self-contained, and the
# title lets the plot stand alone when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(reward, color="green", linewidth=1)
ax.set_title("CartPole-v1 Q-learning: reward per episode")
ax.set_xlabel("Episode")
ax.set_ylabel("Sum of Rewards in Episode")
plt.show()