In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Training environment; pass render_mode="human" to gym.make to watch it train.
env = gym.make("CartPole-v1")

# Q-learning hyper-parameters: step size, discount factor, exploration rate.
alpha, dis_factor, epsilon = 0.995, 0.9, 0.05
max_number_episodes = 5000

# Number of bins for each of the four observation components.
discretization_segments = (7,) * 4

# Hand-picked limits that replace the observation-space bounds of
# components 1 and 3 below.
cart_vel_min, cart_vel_max = -1.25, 1.25
pole_ang_vel_max = np.radians(45)
pole_ang_vel_min = -pole_ang_vel_max

# One (low, high) pair per observation component.
state_bounds = [
    bounds
    for bounds in zip(env.observation_space.low, env.observation_space.high)
]
state_bounds[1] = [cart_vel_min, cart_vel_max]
state_bounds[3] = [pole_ang_vel_min, pole_ang_vel_max]

In [None]:
class QLearning:
    """Tabular Q-learning agent for an environment with a continuous state.

    Every observation component is mapped onto a fixed number of bins so that
    action values can be stored in a dense NumPy table indexed by
    ``(bin_0, ..., bin_k, action)``.

    Attributes:
        q_table: Action values, shape ``discretization_segments + (n_actions,)``,
            initialised uniformly at random in [0, 1).
        reward_sum_per_episode: Total reward of each training episode.
        best_action_value_per_episode: Largest greedy next-state value seen
            during each training episode (read at the time of the visit).
        mean_reward: Running mean of the episode reward after each training
            episode.
    """

    def __init__(
        self,
        env,
        alpha,
        dis_factor,
        epsilon,
        max_episodes,
        discretization_segments,
        state_bounds,
    ):
        """Store the hyper-parameters and allocate the Q-table.

        Args:
            env: Environment exposing ``reset``, ``step``, ``close`` and an
                ``action_space`` with ``n`` and ``sample()``.
            alpha: Step size applied to the TD error.
            dis_factor: Discount factor applied to the next-state value.
            epsilon: Probability of picking a random action.
            max_episodes: Number of training episodes run by ``train``.
            discretization_segments: Number of bins per observation component.
            state_bounds: ``(low, high)`` pair per observation component;
                values outside the pair fall into the first/last bin.
        """
        self.env = env
        self.alpha = alpha
        self.dis_factor = dis_factor
        self.epsilon = epsilon
        self.max_episodes = max_episodes
        # Normalised to a tuple so it can be concatenated with the action axis
        # even when the caller passes a list.
        self.discretization_segments = tuple(discretization_segments)
        self.state_bounds = state_bounds
        self.action = self.env.action_space.n
        self.reward_sum_per_episode = []
        self.best_action_value_per_episode = []
        self.mean_reward = []
        self.q_table = np.random.uniform(
            low=0, high=1, size=(self.discretization_segments + (self.action,))
        )
        # If you want the initial q table to be zero, comment the line above this one
        # and uncomment the line below
        # self.q_table = np.zeros(self.discretization_segments + (self.action,))

    def select_action(self, state):
        """Return an epsilon-greedy action for a discretized state.

        A random action is sampled with probability ``epsilon`` and also when
        every action value of ``state`` is identical (no preference yet);
        otherwise the action with the largest value is returned.

        Args:
            state: Tuple of bin indices as returned by ``discretize_state``.
        """
        q_values = self.q_table[state]
        # The random draw comes first so the RNG is consumed on every call.
        if np.random.random() < self.epsilon or all(
            value == q_values[0] for value in q_values
        ):
            return self.env.action_space.sample()
        return int(np.argmax(q_values))

    def discretize_state(self, state):
        """Map a continuous observation onto a tuple of bin indices.

        Components at or below their lower bound go to bin 0, components at or
        above their upper bound go to the last bin, and everything in between
        is scaled linearly onto ``0 .. segments - 1`` and rounded.

        Args:
            state: Sequence with one value per observation component.

        Returns:
            Tuple of ints usable as an index into ``q_table``.
        """
        state_indices = []
        for i, value in enumerate(state):
            low, high = self.state_bounds[i]
            last_bin = self.discretization_segments[i] - 1
            if value <= low:
                indexed_state = 0
            elif value >= high:
                indexed_state = last_bin
            else:
                indexed_state = int(round(last_bin * (value - low) / (high - low)))
            state_indices.append(indexed_state)

        return tuple(state_indices)

    def train(self, max_steps=750):
        """Run ``max_episodes`` episodes of Q-learning on ``self.env``.

        Updates ``q_table`` in place, appends one entry per episode to
        ``reward_sum_per_episode``, ``best_action_value_per_episode`` and
        ``mean_reward``, prints progress, and closes the environment at the end.

        Args:
            max_steps: Upper bound on the number of steps in one episode.
        """
        total_reward = 0
        for episode in range(self.max_episodes):
            episode_rewards = 0
            episode_best_value = -np.inf
            state, _ = self.env.reset()
            indexed_state = self.discretize_state(state)

            print("\nEpisode = %d" % episode)

            # No explicit render() call: an environment created with
            # render_mode="human" draws itself during step()/reset().
            for t in range(max_steps):
                action = self.select_action(indexed_state)

                next_state, reward, terminal_state, truncated, _ = self.env.step(
                    action
                )
                episode_rewards += reward
                total_reward += reward
                next_indexed_state = self.discretize_state(next_state)

                next_best_action = np.amax(self.q_table[next_indexed_state])
                # Track a scalar snapshot rather than a view into q_table so
                # later updates cannot change what was recorded.
                episode_best_value = max(episode_best_value, next_best_action)

                # A terminated episode has no future return, so only the
                # immediate reward counts. A truncated one (time limit) still
                # bootstraps from the next state.
                if terminal_state:
                    td_target = reward
                else:
                    td_target = reward + self.dis_factor * next_best_action

                td_error = td_target - self.q_table[indexed_state + (action,)]
                self.q_table[indexed_state + (action,)] += self.alpha * td_error

                indexed_state = next_indexed_state

                if terminal_state or truncated:
                    # t is zero-based, so t + 1 steps have been taken.
                    print("Episode %d finished after %d time steps" % (episode, t + 1))
                    print("Reward: %f" % episode_rewards)
                    break

            # Recorded for every episode, including one cut off by max_steps,
            # so the three history lists always stay aligned.
            self.reward_sum_per_episode.append(episode_rewards)
            self.best_action_value_per_episode.append(episode_best_value)
            # episode is zero-based: episode + 1 episodes have completed.
            self.mean_reward.append(total_reward / (episode + 1))

        if self.max_episodes > 0:
            print("Average Reward: {}".format(total_reward / self.max_episodes))
        self.env.close()

    def simulate(self, simulation_iterations, max_steps=500):
        """Run the learned policy in a rendered CartPole-v1 environment.

        Actions come from ``select_action`` (so ``epsilon`` still applies) and
        the Q-table is not modified.

        Args:
            simulation_iterations: Number of episodes to play.
            max_steps: Upper bound on the number of steps in one episode.

        Returns:
            List with the running mean episode reward after each episode.
        """
        env2 = gym.make("CartPole-v1", render_mode="human")
        total_reward = 0
        mean = []
        try:
            for episode in range(simulation_iterations):
                state, _ = env2.reset()
                indexed_state = self.discretize_state(state)
                # render_mode="human" draws each frame during step()/reset().
                for _step in range(max_steps):
                    action = self.select_action(indexed_state)

                    next_state, reward, terminal_state, truncated, _ = env2.step(
                        action
                    )
                    total_reward += reward
                    indexed_state = self.discretize_state(next_state)

                    if terminal_state or truncated:
                        break
                # episode is zero-based: episode + 1 episodes have completed.
                mean.append(total_reward / (episode + 1))
            if simulation_iterations > 0:
                print(
                    "Average Reward: {}".format(total_reward / simulation_iterations)
                )
        finally:
            # Release the window even if an episode raised.
            env2.close()
        return mean

In [None]:
# Build the agent from the configuration cell and run the training loop.
Q = QLearning(
    env=env,
    alpha=alpha,
    dis_factor=dis_factor,
    epsilon=epsilon,
    max_episodes=max_number_episodes,
    discretization_segments=discretization_segments,
    state_bounds=state_bounds,
)
Q.train()

In [None]:
# Total reward collected in each training episode.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(Q.reward_sum_per_episode, color="green", linewidth=1)
ax.set_xlabel("Episode")
ax.set_ylabel("Sum of Rewards in Episode While Training")
plt.show()

In [None]:
# Running mean of the episode reward over the course of training.
plt.figure(figsize=(12, 5))
# The stray trailing comma after this call was removed; it only wrapped the
# result in a throwaway tuple.
plt.plot(Q.mean_reward, color="black", linewidth=1)
plt.xlabel("Episode")
plt.ylabel("Mean Reward While Training")
plt.show()

In [None]:
# Largest action value recorded in each training episode.
# The stray trailing comma after plt.figure(...) was removed; it only wrapped
# the figure in a throwaway tuple.
plt.figure(figsize=(12, 5))
plt.plot(Q.best_action_value_per_episode, color="blue", linewidth=1)
plt.xlabel("Episode")
plt.ylabel("Best Action Value in Each Episode While Training")
plt.show()

In [None]:
# Show the learned table followed by its largest entry, one per line.
print(Q.q_table, np.max(Q.q_table), sep="\n")

In [None]:
# Play 100 rendered episodes with the trained agent and keep the running mean reward.
mean = Q.simulate(simulation_iterations=100)

In [None]:
# Running mean of the episode reward over the simulation episodes.
plt.figure(figsize=(12, 5))
# The stray trailing comma after this call was removed; it only wrapped the
# result in a throwaway tuple.
plt.plot(mean, color="black", linewidth=1)
plt.xlabel("Episode")
plt.ylabel("Mean reward During Simulation")
plt.show()