In [2]:
import gymnasium
import numpy as np
import matplotlib.pyplot as plt

In [3]:
EPSILONS = [0.1, 0.5, 0.9]
GAMMAS = [0.5, 0.9]
LEARNING_RATES = [0.1, 0.5, 0.9]
EPISODES = 50000
CHECKPOINT_EVERY = 50
EXPERIMENT_TRIES = 50
MAX_STEPS = 1000

In [5]:
def epsilon_greedy(env, s, q, epsilon):
    prob = np.random.rand()
    if prob < epsilon:
        # print('Exploration')
        return env.action_space.sample()
    else:
        # print("Exploitation")
        return np.argmax([q[s, a] for a in range(env.action_space.n)])


def cosine_annealing(episode, total_episodes, epsilon_max=1.0, epsilon_min=0.01):
    return epsilon_min + 0.5 * (epsilon_max - epsilon_min) * (1 + np.cos(np.pi * episode / total_episodes))



def q_learning(env, gamma, epsilon, learning_rate):
    states = env.observation_space.n
    actions = env.action_space.n

    q = np.zeros((states, actions))

    print(q)
    print(q.shape)
    average_rewards = []

    for episode in range(EPISODES):
        print('Episode:', episode)
        curr_epsilon = cosine_annealing(episode, EPISODES, epsilon)
        state, _ = env.reset()
        terminated = False
        cur_steps = 0
        while not terminated and cur_steps < MAX_STEPS:
            action = epsilon_greedy(env, state, q, curr_epsilon)
            next_state, reward, terminated, _, _= env.step(action)
            q[state, action] = q[state, action] + learning_rate * (reward + gamma * max(q[next_state, :]) - q[state, action])
            state = next_state
            cur_steps += 1

        if episode % CHECKPOINT_EVERY == 0:
            print(f"Current epsilon: {curr_epsilon}")
            sum_rewards = 0
            policy = np.zeros(states)
            for s in range(states):
                policy[s] = np.argmax(q[s, :])

            for run in range(EXPERIMENT_TRIES):
                # print(f"Start run: {run}")
                run_state, _ = env.reset()
                terminated_run = False
                run_steps = 0
                while not terminated_run and run_steps < MAX_STEPS:
                    run_action = policy[run_state]
                    next_run_state, run_reward, terminated_run, _, _ = env.step(run_action)
                    sum_rewards += run_reward
                    run_state = next_run_state
                    run_steps += 1
                # print(f"Finish run: {run}")

            average_reward = sum_rewards / EXPERIMENT_TRIES
            average_rewards.append(average_reward)

        print(f"Done episode {episode}")
    pi = np.zeros(states)
    for s in range(states):
        pi[s] = np.argmax(q[s, :])
    return pi, average_rewards

def sarsa(env, gamma, epsilon, learning_rate):
    states = env.observation_space.n
    actions = env.action_space.n

    q = np.zeros((states, actions))
    print(q)
    print(q.shape)
    average_rewards = []
    for episode in range(EPISODES):
        print('Episode:', episode)
        curr_epsilon = cosine_annealing(episode, EPISODES, epsilon)
        state, _ = env.reset()
        terminated = False
        curr_steps = 0
        action = epsilon_greedy(env, state, q, curr_epsilon)
        while not terminated and curr_steps < MAX_STEPS:
            next_state, reward, terminated, _, _ = env.step(action)
            action_prime = epsilon_greedy(env, next_state, q, curr_epsilon)
            q[state, action] = q[state, action] + learning_rate * (reward + gamma * q[next_state, action_prime] - q[state, action])
            state = next_state
            action = action_prime
            curr_steps += 1

        if episode % CHECKPOINT_EVERY == 0:
            print(f"Current epsilon: {curr_epsilon}")
            sum_rewards = 0
            policy = np.zeros(states)
            for s in range(states):
                policy[s] = np.argmax(q[s, :])

            for run in range(EXPERIMENT_TRIES):
                run_state, _ = env.reset()
                terminated_run = False
                run_steps = 0
                while not terminated_run and run_steps < MAX_STEPS:
                    run_action = policy[run_state]
                    next_run_state, run_reward, terminated_run, _, _ = env.step(run_action)
                    sum_rewards += run_reward
                    run_state = next_run_state
                    run_steps += 1

            average_reward = sum_rewards / EXPERIMENT_TRIES
            average_rewards.append(average_reward)
        print(f"Done episode {episode}")

    pi = np.zeros(states)
    for s in range(states):
        pi[s] = np.argmax(q[s, :])

    return pi, average_rewards

def compute_data():
    envs = [gymnasium.make('Taxi-v3'), gymnasium.make('FrozenLake-v1')]
    results = {}
    for env in envs:
        for gamma in GAMMAS:
            for learning_rate in LEARNING_RATES:
                for epsilon in EPSILONS:
                    q_policy, q_rewards = q_learning(env, gamma, epsilon, learning_rate)
                    s_policy, s_rewards = sarsa(env, gamma, epsilon, learning_rate)
                    key = (env.spec.id, gamma, learning_rate, epsilon)

                    results[key] = {
                        'q_rewards': q_rewards,
                        's_rewards': s_rewards,
                        'q_policy': q_policy,
                        's_policy': s_policy,
                        'q_max_reward': np.max(q_rewards),
                        's_max_reward': np.max(s_rewards)
                    }
    return results


def run_experiments(data):
    envs = [gymnasium.make('Taxi-v3'), gymnasium.make('FrozenLake-v1')]
    for env in envs:
        for gamma in GAMMAS:
            for learning_rate in LEARNING_RATES:
                plt.figure()
                for epsilon in EPSILONS:
                    key = (env.spec.id, gamma, learning_rate, epsilon)
                    q_rewards = data[key]['q_rewards']
                    s_rewards = data[key]['s_rewards']
                    max_q = data[key]['q_max_reward']
                    max_s = data[key]['s_max_reward']
                    plt.plot(q_rewards, label=f'Q-Learning Rewards for Epsilon = {epsilon}')
                    plt.plot(s_rewards, label=f'SARSA Rewards for Epsilon = {epsilon}')
                    plt.scatter(q_rewards.index(max_q), max_q, color='blue', marker='o', label=f'Max Q-Learning Reward for Epsilon = {epsilon}')
                    plt.scatter(s_rewards.index(max_s), max_s, color='red', marker='o', label=f'Max SARSA Reward for Epsilon = {epsilon}')
                plt.xlabel('Checkpoint')
                plt.ylabel('Reward')
                plt.title(f"γ={gamma}, α={learning_rate}")
                plt.legend()
                plt.savefig(f"../results/{key[0]}/G{GAMMAS.index(gamma)}_LR{LEARNING_RATES.index(learning_rate)}.png")
                plt.close()

    for env in envs:
        for epsilon in EPSILONS:
            for learning_rate in LEARNING_RATES:
                plt.figure()
                for gamma in GAMMAS:
                    key = (env.spec.id, gamma, learning_rate, epsilon)
                    q_rewards = data[key]['q_rewards']
                    s_rewards = data[key]['s_rewards']
                    max_q = data[key]['q_max_reward']
                    max_s = data[key]['s_max_reward']
                    plt.plot(q_rewards, label=f'Q-Learning Rewards for Gamma = {gamma}')
                    plt.plot(s_rewards, label=f'SARSA Rewards for Gamma = {gamma}')
                    plt.scatter(q_rewards.index(max_q), max_q, color='blue', marker='o', label=f'Max Q-Learning Reward for Gamma = {gamma}')
                    plt.scatter(s_rewards.index(max_s), max_s, color='red', marker='o', label=f'Max SARSA Reward for Gamma = {gamma}')
                plt.xlabel('Checkpoint')
                plt.ylabel('Reward')
                plt.title(f"ε={epsilon}, α={learning_rate}")
                plt.legend()
                plt.savefig(f"../results/{key[0]}/E{EPSILONS.index(epsilon)}_LR{LEARNING_RATES.index(learning_rate)}.png")
                plt.close()

    for env in envs:
        for epsilon in EPSILONS:
            for gamma in GAMMAS:
                plt.figure()
                for learning_rate in LEARNING_RATES:
                    key = (env.spec.id, gamma, learning_rate, epsilon)
                    q_rewards = data[key]['q_rewards']
                    s_rewards = data[key]['s_rewards']
                    max_q = data[key]['q_max_reward']
                    max_s = data[key]['s_max_reward']
                    plt.plot(q_rewards, label=f'Q-Learning Rewards for Learning Rate = {learning_rate}')
                    plt.plot(s_rewards, label=f'SARSA Rewards for Learning Rate = {learning_rate}')
                    plt.scatter(q_rewards.index(max_q), max_q, color='blue', marker='o', label=f'Max Q-Learning Reward for Learning Rate = {learning_rate}')
                    plt.scatter(s_rewards.index(max_s), max_s, color='red', marker='o', label=f'Max SARSA Reward for Learning Rate = {learning_rate}')
                plt.xlabel('Checkpoint')
                plt.ylabel('Reward')
                plt.title(f"ε={epsilon}, γ={gamma}")
                plt.legend()
                plt.savefig(f"../results/{key[0]}/E{EPSILONS.index(epsilon)}_G{GAMMAS.index(gamma)}.png")
                plt.close()


In [None]:
data = compute_data()

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
(500, 6)
Episode: 0
Current epsilon: 0.1
Done episode 0
Episode: 1
Done episode 1
Episode: 2
Done episode 2
Episode: 3
Done episode 3
Episode: 4
Done episode 4
Episode: 5
Done episode 5
Episode: 6
Done episode 6
Episode: 7
Done episode 7
Episode: 8
Done episode 8
Episode: 9
Done episode 9
Episode: 10
Done episode 10
Episode: 11
Done episode 11
Episode: 12
Done episode 12
Episode: 13
Done episode 13
Episode: 14
Done episode 14
Episode: 15
Done episode 15
Episode: 16
Done episode 16
Episode: 17
Done episode 17
Episode: 18
Done episode 18
Episode: 19
Done episode 19
Episode: 20
Done episode 20
Episode: 21
Done episode 21
Episode: 22
Done episode 22
Episode: 23
Done episode 23
Episode: 24
Done episode 24
Episode: 25
Done episode 25
Episode: 26
Done episode 26
Episode: 27
Done episode 27
Episode: 28
Done episode 28
Episode: 29
Done episode 29
Episode: 30
Done e

In [None]:
run_experiments(data)