In [None]:
# ! pip install gym

In [None]:
import os
from typing import Optional, Union, Tuple, List
import gym
import gym.envs.registration
import gym.spaces
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import plotly.express as px
import plotly.graph_objects as go

# True only when this file is run as the top-level script/notebook.
MAIN = (__name__ == "__main__")
# Step budget handed to the gym registration further down.
max_episode_steps = 1000
# Any non-empty IS_CI environment variable selects the short benchmark.
IS_CI = os.getenv("IS_CI")
N_RUNS = 5 if IS_CI else 200

In [None]:
# Type aliases used in the environment/agent signatures below:
# observations and actions are both plain integers.
ObsType = int
ActType = int

class MultiArmedBandit(gym.Env):
    '''
    A multi-armed bandit testbed exposed as a gym environment.

    There is a single dummy observation (always 0). Each action pulls one arm and
    yields a reward drawn from a unit-variance normal centred on that arm's mean.

    If `stationary` is True the arm means are drawn once per `reset` from a
    standard normal. Otherwise every mean starts at 0 and takes a small Gaussian
    random-walk step on each call to `step`.
    '''
    action_space: gym.spaces.Discrete
    observation_space: gym.spaces.Discrete
    num_arms: int
    stationary: bool
    arm_reward_means: np.ndarray
    best_arm: int

    def __init__(self, num_arms=10, stationary=True):
        '''
        num_arms: number of arms (size of the discrete action space).
        stationary: whether the arm means stay fixed during an episode.
        '''
        super().__init__()
        self.num_arms = num_arms
        self.stationary = stationary
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(num_arms)
        # Draw the initial arm means so the environment is usable straight away.
        self.reset()

    def step(self, arm: ActType) -> Tuple[ObsType, float, bool, bool, dict]:
        '''
        Pull `arm` and return (obs, reward, done, truncated, info).

        `done` and `truncated` are always False here, so the environment never ends
        an episode by itself. `info["best_arm"]` is the index of the arm with the
        highest mean at the time the reward was drawn.
        '''
        assert self.action_space.contains(arm)
        if not self.stationary:
            # Random-walk every arm mean, then recompute which arm is currently best.
            q_drift = self.np_random.normal(loc=0.0, scale=0.01, size=self.num_arms)
            self.arm_reward_means += q_drift
            self.best_arm = int(np.argmax(self.arm_reward_means))
        reward = self.np_random.normal(loc=self.arm_reward_means[arm], scale=1.0)
        obs = 0
        done = False
        truncated = False
        info = dict(best_arm=self.best_arm)
        return (obs, reward, done, truncated, info)

    def reset(
        self, seed: Optional[int] = None, return_info=False, options=None
    ) -> Union[ObsType, Tuple[ObsType, dict]]:
        '''
        Re-draw the arm means and return the (constant) initial observation.

        seed: forwarded to `gym.Env.reset`.
        return_info: if True, return `(obs, info)` instead of just `obs`.
        options: accepted for signature compatibility; not used.

        NOTE(review): `step` returns the 5-tuple API while `reset` still follows the
        `return_info` convention - confirm this matches the installed gym version.
        '''
        super().reset(seed=seed)
        if self.stationary:
            self.arm_reward_means = self.np_random.normal(loc=0.0, scale=1.0, size=self.num_arms)
        else:
            self.arm_reward_means = np.zeros(shape=[self.num_arms])
        self.best_arm = int(np.argmax(self.arm_reward_means))
        if return_info:
            return (0, dict())
        return 0

    def render(self, mode="human"):
        '''
        Show a violin plot of 1000 sampled rewards per arm.

        Samples come from the global NumPy RNG, so rendering does not consume
        numbers from `self.np_random`.
        '''
        assert mode == "human", f"Mode {mode} not supported!"
        bandit_samples = [
            np.random.normal(loc=self.arm_reward_means[arm], scale=1.0, size=1000)
            for arm in range(self.action_space.n)
        ]
        plt.violinplot(bandit_samples, showmeans=True)
        plt.xlabel("Bandit Arm")
        plt.ylabel("Reward Distribution")
        plt.show()

In [None]:
# Register the bandit under a string id so it can be built with gym.make.
# The kwargs are the default constructor arguments for MultiArmedBandit;
# max_episode_steps comes from the module-level constant defined above.
gym.envs.registration.register(
    id="ArmedBanditTestbed-v0",
    entry_point=MultiArmedBandit,
    max_episode_steps=max_episode_steps,
    nondeterministic=True,
    reward_threshold=1.0,
    kwargs={"num_arms": 10, "stationary": True},
)
if MAIN:
    # Smoke test: build the registered environment and print what gym.make returned.
    env = gym.make("ArmedBanditTestbed-v0")
    print("Our env inside its wrappers looks like: ", env)

In [None]:
class Agent:
    '''
    Common interface for bandit agents.

    Subclasses override `get_action` (required) and usually `observe` and `reset`.
    Construction stores `num_arms` and immediately calls `reset(seed)`.
    '''

    rng: np.random.Generator

    def __init__(self, num_arms: int, seed: int):
        self.num_arms = num_arms
        # reset() creates the RNG (and, in subclasses, any per-episode state).
        self.reset(seed)

    def get_action(self) -> ActType:
        '''Return the arm to pull next; must be implemented by subclasses.'''
        raise NotImplementedError()

    def observe(self, action: ActType, reward: float, info: dict) -> None:
        '''Receive the outcome of the last action; the base class ignores it.'''
        return None

    def reset(self, seed: int) -> None:
        '''Start a fresh episode with a new random generator seeded by `seed`.'''
        fresh_generator = np.random.default_rng(seed)
        self.rng = fresh_generator

def run_episode(env: gym.Env, agent: Agent, seed: int):
    '''
    Play one episode of `agent` against `env`, both reset with the same `seed`.

    Returns a pair of 1-D arrays with one entry per step:
    the rewards received (float) and a 0/1 flag for whether the chosen arm
    matched `info["best_arm"]` (int).
    '''
    env.reset(seed=seed)
    agent.reset(seed=seed)
    reward_log = []
    best_pick_log = []
    while True:
        action = agent.get_action()
        (_, reward, terminated, truncated, info) = env.step(action)
        agent.observe(action, reward, info)
        reward_log.append(reward)
        best_pick_log.append(1 if action == info["best_arm"] else 0)
        # The episode ends as soon as the environment reports either flag.
        if terminated or truncated:
            break
    return (np.array(reward_log, dtype=float), np.array(best_pick_log, dtype=int))

def test_agent(env: gym.Env, agent: Agent, n_runs=200, base_seed=0):
    '''
    Run `n_runs` episodes with seeds base_seed, base_seed + 1, ... and stack the results.

    Returns (all_rewards, all_was_bests): two arrays with one row per run, built
    from the per-step arrays returned by `run_episode`.
    '''
    episodes = [
        run_episode(env, agent, run_index + base_seed)
        for run_index in tqdm(range(n_runs), total=n_runs)
    ]
    stacked_rewards = np.array([episode_rewards for (episode_rewards, _) in episodes])
    stacked_was_best = np.array([episode_was_best for (_, episode_was_best) in episodes])
    return (stacked_rewards, stacked_was_best)

class RandomAgent(Agent):
    '''Baseline agent that picks an arm uniformly at random on every step.'''

    def get_action(self) -> ActType:
        # Cast so the action is a plain Python int, matching ActType.
        return int(self.rng.integers(self.num_arms))

    def __repr__(self) -> str:
        # Readable label when the agent is passed through str() for plot legends.
        return "RandomAgent"

if MAIN:
    # Sanity check: run the random baseline on a stationary 10-armed bandit.
    # The returned reward arrays are discarded here.
    num_arms = 10
    stationary = True
    env = gym.make("ArmedBanditTestbed-v0", num_arms=num_arms, stationary=stationary)
    random_agent = RandomAgent(num_arms=num_arms, seed=0)
    test_agent(env=env, agent=random_agent)

In [None]:
def moving_avg(a, n):
    '''
    Trailing moving average of `a` with window length `n`.

    Only full windows are returned, so the result has len(a) - n + 1 entries
    (empty when `a` is shorter than the window).
    '''
    running_total = np.cumsum(a, dtype=float)
    # Turn prefix sums into window sums: subtract the prefix that ended n steps earlier.
    running_total[n:] -= running_total[:-n]
    full_window_sums = running_total[n - 1:]
    return full_window_sums / n

def plot_rewards(
    all_rewards: List[np.ndarray],
    names: List[str],
    moving_avg_window: Optional[int] = 15,
):
    '''
    Plot one line per agent showing its reward averaged over runs.

    all_rewards: one array per agent; each is averaged over axis 0.
    names: legend label for each entry of `all_rewards`.
    moving_avg_window: window for smoothing the averaged curve, or None to plot it raw.
    '''
    figure_layout = dict(template="simple_white", title_text="Mean reward over all runs")
    fig = go.Figure(layout=figure_layout)
    for label, run_rewards in zip(names, all_rewards):
        curve = run_rewards.mean(axis=0)
        if moving_avg_window is not None:
            curve = moving_avg(curve, moving_avg_window)
        trace = go.Scatter(y=curve, mode="lines", name=label)
        fig.add_trace(trace)
    fig.show()
    

class RewardAveraging(Agent):
    '''
    Epsilon-greedy agent with sample-average value estimates.

    epsilon: probability of picking a uniformly random arm instead of the greedy one.
    optimism: initial value estimate given to every arm (optimistic initial values);
        each arm's estimate is replaced by real data from its first sample onward.
    '''

    def __init__(self, num_arms: int, seed: int, epsilon: float, optimism: float):
        # Hyperparameters must exist before Agent.__init__ runs, because it calls
        # reset(), which reads self.optimism.
        self.epsilon = epsilon
        self.optimism = optimism
        super().__init__(num_arms=num_arms, seed=seed)

    def get_action(self) -> ActType:
        '''Explore with probability epsilon, otherwise pull the arm with the highest estimate.'''
        if self.rng.random() < self.epsilon:
            return int(self.rng.integers(self.num_arms))
        return int(np.argmax(self.average_award_by_action))

    def observe(self, action: ActType, reward, info):
        '''Fold `reward` into the running average for `action`.'''
        self.sample_count_by_action[action] += 1
        # Incremental mean: Q <- Q + (r - Q) / N
        error = reward - self.average_award_by_action[action]
        self.average_award_by_action[action] += error / self.sample_count_by_action[action]

    def reset(self, seed: int):
        '''Re-seed the RNG and restore the initial estimates and zero sample counts.'''
        super().reset(seed=seed)
        # Use self.num_arms rather than a notebook-level global, and put the optimism
        # into the value estimates instead of the sample counts.
        self.average_award_by_action = np.full(self.num_arms, fill_value=self.optimism, dtype=np.float64)
        self.sample_count_by_action = np.zeros(self.num_arms, dtype=np.int64)

    def __repr__(self) -> str:
        # Readable label when the agent is passed through str() for plot legends.
        return f"RewardAveraging(eps={self.epsilon}, optimism={self.optimism})"


if MAIN:
    # Compare epsilon-greedy reward averaging for two optimism settings (0 and 5)
    # on a stationary 10-armed bandit.
    num_arms = 10
    stationary = True
    names, all_rewards = [], []
    env = gym.make("ArmedBanditTestbed-v0", num_arms=num_arms, stationary=stationary)

    for optimism in [0, 5]:
        agent = RewardAveraging(num_arms, 0, epsilon=0.1, optimism=optimism)
        (rewards, num_correct) = test_agent(env, agent, n_runs=N_RUNS, base_seed=1)
        # The legend label is the agent's string representation.
        names.append(str(agent))
        all_rewards.append(rewards)
        print(agent)
        # Both means are taken over every step of every run.
        print(f" -> Frequency of correct arm: {num_correct.mean():.4f}")
        print(f" -> Average reward: {rewards.mean():.4f}")

    plot_rewards(all_rewards, names, moving_avg_window=15)

In [None]:
class CheatyMcCheater(Agent):
    '''
    Agent that reads the best arm out of the `info` dict and always pulls it.

    It has to guess on the first step of an episode (it pulls arm 0), because no
    `info` has been observed yet.
    '''

    def __init__(self, num_arms: int, seed: int):
        # Agent.__init__ calls reset(), which initialises self.best_arm.
        super().__init__(num_arms=num_arms, seed=seed)

    def get_action(self) -> ActType:
        return self.best_arm

    def observe(self, action, reward, info):
        '''Remember the best arm reported by the environment for the next action.'''
        self.best_arm = info["best_arm"]

    def reset(self, seed: int):
        '''Re-seed the RNG and forget the best arm seen in the previous episode.'''
        super().reset(seed=seed)
        self.best_arm = 0

    def __repr__(self) -> str:
        # Readable label when the agent is passed through str() for plot legends.
        return "Cheater"

    def repr(self):
        # Kept so existing references to `.repr()` keep working; mirrors __repr__.
        return repr(self)

if MAIN:
    # Compare the cheating agent against reward averaging and the random baseline.
    # Relies on `num_arms` and `env` created by an earlier cell.
    cheater = CheatyMcCheater(num_arms, 0)
    reward_averaging = RewardAveraging(num_arms, 0, epsilon=0.1, optimism=0)
    # `random` here is a RandomAgent instance, not the standard-library module.
    random = RandomAgent(num_arms, 0)

    names = []
    all_rewards = []

    for agent in [cheater, reward_averaging, random]:
        # No base_seed is passed, so test_agent's default (0) is used.
        (rewards, num_correct) = test_agent(env, agent, n_runs=N_RUNS)
        # The legend label is the agent's string representation.
        names.append(str(agent))
        all_rewards.append(rewards)

    plot_rewards(all_rewards, names)

## Upper-Confidence-Bound Action Selection

In [None]:
class UCBActionSelection(Agent):
    '''
    Upper-confidence-bound action selection (Sutton & Barto, section 2.7).

    Each step picks argmax_a [ Q(a) + c * sqrt(ln t / N(a)) ], where Q is the
    sample-average reward, N the number of pulls and t the current time step
    (starting at 1). Arms that have never been pulled (N(a) == 0) are treated as
    maximizing and are tried first.

    c: exploration coefficient; larger values favour rarely-tried arms.
    '''

    def __init__(self, num_arms: int, seed: int, c: float):
        self.c = c  # confidence
        # Agent.__init__ calls reset(), which initialises t, the estimates and the counts.
        super().__init__(num_arms=num_arms, seed=seed)

    def get_action(self):
        # An untried arm has an unbounded confidence interval, so pull it before
        # comparing bounds (this also avoids dividing by a zero count).
        untried = np.flatnonzero(self.sample_count_by_action == 0)
        if untried.size > 0:
            return int(untried[0])
        exploration_bonus = self.c * np.sqrt(np.log(self.t) / self.sample_count_by_action)
        return int(np.argmax(self.average_award_by_action + exploration_bonus))

    def observe(self, action, reward, info):
        '''Advance the time step and fold `reward` into the running average for `action`.'''
        self.t += 1
        self.sample_count_by_action[action] += 1
        # Incremental mean: Q <- Q + (r - Q) / N
        error = reward - self.average_award_by_action[action]
        self.average_award_by_action[action] += error / self.sample_count_by_action[action]

    def reset(self, seed: int):
        '''Re-seed the RNG and clear the time step, estimates and counts for a new episode.'''
        super().reset(seed=seed)
        # t is per-episode state; starting at 1 keeps log(t) finite.
        self.t = 1
        # Use self.num_arms rather than a notebook-level global. Counts start at 0
        # so the first real reward is not averaged with a phantom zero sample.
        self.average_award_by_action = np.zeros(self.num_arms, dtype=np.float64)
        self.sample_count_by_action = np.zeros(self.num_arms, dtype=np.int64)

    def __repr__(self) -> str:
        # Readable label when the agent is passed through str() for plot legends.
        return f"UCB(c={self.c})"

if MAIN:
    # Compare all agents defined so far, including UCB with c=2.0.
    # Relies on `num_arms` and `env` created by an earlier cell.
    cheater = CheatyMcCheater(num_arms, 0)
    reward_averaging = RewardAveraging(num_arms, 0, epsilon=0.1, optimism=0)
    reward_averaging_optimism = RewardAveraging(num_arms, 0, epsilon=0.1, optimism=5)
    ucb = UCBActionSelection(num_arms, 0, c=2.0)
    # `random` here is a RandomAgent instance, not the standard-library module.
    random = RandomAgent(num_arms, 0)

    names = []
    all_rewards = []

    for agent in [cheater, reward_averaging, reward_averaging_optimism, ucb, random]:
        (rewards, num_correct) = test_agent(env, agent, n_runs=N_RUNS, base_seed=1)
        # The legend label is the agent's string representation.
        names.append(str(agent))
        all_rewards.append(rewards)

    plot_rewards(all_rewards, names, moving_avg_window=15)

## Sutton book

Q: What is the Markov property?

A property of environments and their state signals: the state signal must contain all relevant information, compactly summarizing past sensations in addition to the immediate ones.
The reward and state at $t+1$ can be predicted from the state and action at $t$ alone; including information from earlier timesteps gives no improvement.

Q: What is a Markov decision process?

A reinforcement learning task that satisfies the Markov property is called a
Markov decision process.

Q: Implicitly, we have assumed that the agent need only be Markovian as well. Is this a reasonable assumption?

A Markovian agent's actions depend only on the current state. In a Markovian environment this is sufficient. In a non-Markovian environment, by contrast, the agent would need information about past states that is not contained in the current state in order to act.

Q: Why discount?

While it is fine to omit the discount factor in environments with a finite number of timesteps, it becomes a problem in others. In some environments there is no terminal step, so the sum of rewards could grow towards infinity, and infinite sums are mathematically awkward to work with. The discount factor $\gamma$ also means that the agent prefers rewards sooner rather than later, which is often a desirable property: a "lazy" agent that waits around for a long time before taking an action that leads to a reward is worse than an active agent that gets the same reward straight away. Discounting also speeds up the training process, and it allows us to approximate things by truncating episodes to a maximum length.

proof the recursive function (reward spelt rewward)

∀s∈S.V colon instead of period

Q: Is policy $\pi_L$ better than $\pi_R$?

With $\gamma = 1$, $\pi_R$ earns double the reward, but that reward is shifted by one timestep. Only if the discount factor is below 0.5 does $\pi_L$ win.

Q: Should the agent go clockwise or anticlockwise around the wall? 

For small movement penalties the agent should go clockwise, since there is a positive reward after 5 steps; the agent is also less likely to slip into other cells. For large movement penalties the agent can instead choose to go to the negative ($-1$) terminal state in 4 steps. The tipping point should be close to a movement penalty of $-1$.