# Imports

In [None]:
!pip install tqdm

In [None]:
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

<font size=6>__Define Classes__</font>

## The first step is to define a `KArmedBandit` class. It needs to keep track of:
### 1] `k` : Integer $\rightarrow$ the number of arms that the bandit has

### 2] `means` == $\mu_a$ : List of floats $\rightarrow$ The mean reward for pulling each arm 
><font size=3>Begin by choosing these means from a normal distribution.</font>
### 3] `stdevs` == $\sigma_a$ : List of floats $\rightarrow$ The standard deviations of rewards for pulling each arm
><font size=3>To begin with, this can be set to 1 for all arms, but feel free to experiment later!</font>

## The `KArmedBandit` should also have:
### 1] A function that pulls a specified arm and returns a reward drawn from the correct distribution
### 2] A function that returns the index of the optimal arm to pull

In [None]:
class KArmedBandit:
    """A k-armed bandit whose arms pay out normally distributed rewards.

    Attributes:
        k: Number of arms.
        means: Array of length k holding the mean reward of each arm. The means are
            themselves sampled once, at construction, from a normal distribution
            with mean 0 and standard deviation 2. The array index identifies the arm.
        stdevs: List of length k holding the standard deviation of the rewards of
            each arm. Every arm starts with a standard deviation of 1.
    """

    def __init__(self, k):
        # Remember how many arms this bandit has.
        self.k = k

        # Draw one true mean per arm.
        self.means = np.random.normal(0, 2, k)

        # Give every arm the same unit-width reward distribution to begin with.
        self.stdevs = [1 for _ in range(k)]

    def pull_arm(self, arm):
        """
        Samples a reward for the arm with index `arm` from a normal distribution that uses
        that arm's entries in self.means and self.stdevs.
        """
        centre = self.means[arm]
        spread = self.stdevs[arm]
        return np.random.normal(centre, spread)

    def optimal_arm(self):
        """
        Returns the index of the arm with the largest entry in self.means.
        """
        best_index = np.argmax(self.means)
        return best_index

## Next, define an `Agent` class. The agent should keep track of:
### 1] `epsilon` : float $\rightarrow$ Its exploration rate
### 2] `Q` == $Q_t(a)$ : list of floats  $\rightarrow$ The agent's current estimate of the true action value function  
### 3] `n` == $n_a$ : list of integers $\rightarrow$ The number of times the agent has pulled each arm
### 4] `num_optimal_pulls` : integer $\rightarrow$ The number of times the agent has pulled the optimal arm
### 5] `reward_history` : list of floats $\rightarrow$  A list which is initially empty but which is appended to at each timestep, tracking the reward received with each arm pull
### 6] `optimal_history` : list of floats $\rightarrow$ A list which is initially empty but which is appended to at each timestep. After every pull it records the proportion of all pulls so far on which the agent chose the optimal arm (the arm with the highest mean reward). For example, if the optimal arm is 7, and the agent's first three pulls are [1,  7,  2] , then `optimal_history` should be [0, 0.5, 0.333...] after those three pulls.

## The `Agent` should also have:

### 1] A function to choose an arm to pull $\epsilon$-greedily
### 2] An `act` function, which pulls an arm on the bandit, receives a reward, and updates tracking of rewards and optimal pull %
### 3] An `update_Q` function, which updates our estimated action-value function $Q(a)$ given a reward and the index of which arm was pulled
><font size=4>The simplest way to do this is to keep track of the rewards assigned to each arm, but there is a more elegant solution</font>
### 4] A `run_trial` function, which performs `act` $n_{steps}$ times



In [None]:
class Agent():
    """Epsilon-greedy agent that learns the action values of a k-armed bandit.

    Attributes:
        bandit: The bandit being played. It must expose `k`, `pull_arm(arm)` and
            `optimal_arm()`.
        epsilon: Exploration rate, i.e. the probability of pulling a random arm.
        n: List with the number of times each arm has been pulled.
        num_optimal_pulls: Number of pulls so far that hit the optimal arm.
        reward_history: Reward received at every time step, in order.
        optimal_history: After every time step, the proportion of pulls so far
            that hit the optimal arm.
        Q: List with the current estimate of the mean reward of each arm.
    """

    def __init__(self, bandit, epsilon):

        # We're going to initialise the agent so that it acts on a specific bandit.
        self.bandit = bandit

        # We will also initialise epsilon to the value passed into the class constructor.
        self.epsilon = epsilon

        # n tracks the number of times we have pulled each lever, so we initialise this to be a list of zeros with the same
        # length as the number of arms that the specified bandit has.
        self.n = [0]*self.bandit.k

        # When we initialise the agent we have not made any arm pulls yet so we also haven't yet pulled the optimal arm!
        self.num_optimal_pulls = 0

        # These lists track the histories that we want to track so we initialise them as empty lists
        self.reward_history = []
        self.optimal_history = []

        # We know nothing about the rewards before the first pull, so every arm starts with the same
        # neutral estimate of zero.
        self.Q = [0.0] * self.bandit.k

    def choose_e_greedy_action(self):
        """
        Chooses an arm to pull epsilon-greedily. Returns the index of the selected arm as an integer.
        """
        # Sample a number uniformly from [0, 1). If it is less than self.epsilon we explore by picking
        # an arm uniformly at random; otherwise we exploit by picking the arm with the highest current
        # estimate Q(a).
        if np.random.uniform() < self.epsilon:
            selected_arm = np.random.randint(self.bandit.k)
        else:
            # np.argmax returns the first index when several arms share the highest estimate.
            selected_arm = np.argmax(self.Q)

        return int(selected_arm)

    def act(self):
        """
        Performs one time step: chooses an arm, pulls it, records the reward and the running
        proportion of optimal pulls, and updates the estimate for the pulled arm.
        """
        # Choose an action e-greedily
        arm = self.choose_e_greedy_action()

        # Update the array keeping track of how many times each arm has been pulled
        self.n[arm] += 1

        # Ask the bandit for the reward of the chosen arm.
        reward = self.bandit.pull_arm(arm)

        # Did the agent pull the optimal arm? If it did, we should update the variable tracking the number of times
        # that we have pulled the optimal arm.
        if arm == self.bandit.optimal_arm():
            self.num_optimal_pulls += 1

        # Each time we pull an arm, we want to update our histories =>

        # Append the reward that we just received
        self.reward_history.append(reward)

        # Compute the proportion of times that the agent has so far pulled the optimal arm and append this to
        # the list. reward_history has exactly one entry per pull made so far.
        self.optimal_history.append(self.num_optimal_pulls / len(self.reward_history))

        # Update your Q estimate
        self.update_Q(arm, reward)

    def update_Q(self, arm, reward):
        """
        Updates the estimate for `arm` so that it remains the mean of all rewards received from it.

        Uses the incremental form Q <- Q + (reward - Q) / n, which avoids storing every past
        reward. self.n[arm] must already include the pull that produced `reward` (act() increments
        it before calling this method); it must not be zero.
        """
        self.Q[arm] += (reward - self.Q[arm]) / self.n[arm]

    def run_trial(self, n_steps):
        """
        Runs the agent for `n_steps` consecutive time steps.
        """
        for step in range(n_steps):
            self.act()

# Now let's run some trials!

## Define Hyperparameters

In [None]:
# Number of arms on the bandit, and number of arm pulls in one trial.
k, num_steps = 10, 1_000

## Define Bandit & Agent

In [None]:
# Build the bandit that the single-trial agent will play against.
bandit = KArmedBandit(k)

In [None]:
# An agent that explores on 10% of its pulls and exploits on the rest.
agent = Agent(bandit=bandit, epsilon=0.1)

## Run a single trial and plot the results!

In [None]:
# Let the agent act num_steps times; its histories and Q estimates are filled in place.
agent.run_trial(n_steps=num_steps)

In [None]:
# Reward received at each step of the single trial.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax.plot(agent.reward_history)
ax.set_xlabel(xlabel="Steps", fontsize=16)
ax.set_ylabel(ylabel="Reward received", fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

In [None]:
# Running proportion of pulls that hit the optimal arm during the single trial.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax.plot(agent.optimal_history)
ax.set_xlabel(xlabel="Steps", fontsize=16)
ax.set_ylabel(ylabel="Proportion of optimal arm pulls", fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

## Compare the bandit means to your best estimates!

In [None]:
# Print, arm by arm, the bandit's true mean next to the agent's estimate of it.
# The true means are read from agent.bandit (the bandit this agent actually played) rather than
# from the global name `bandit`, so the comparison stays valid even if `bandit` has been rebound
# since the agent was created, e.g. when cells are re-run out of order.
for idx, (true_mean, estimated_mean) in enumerate(zip(agent.bandit.means, agent.Q)):
    print(f"Arm {idx}:")
    print(f"True mean: {true_mean}")
    print(f"Agent estimated mean: {estimated_mean}")
    print("")

# Compare runs with different values of epsilon

In [None]:
# Exploration rates to compare, and how many independent trials are averaged for each of them.
epsilons_to_test = [0.01, 0.1, 1]
num_trials = 1_000

In [None]:
# For every epsilon, run num_trials independent trials (each on a freshly drawn bandit) and
# average the reward history and the optimal-pull history over the trials, step by step.
# Entry i of each result list belongs to epsilons_to_test[i].
mean_reward_history_array = []
mean_optimal_history_array = []

for eps in epsilons_to_test:

    print(f"Testing epsilon = {eps}")
    # Initialise containers for histories
    reward_history_array = []
    optimal_history_array = []

    # Run num_trials trials and average the results
    for trial in tqdm(range(num_trials)):
        # Use a dedicated name for the per-trial bandit so that the notebook-level `bandit`
        # (the one the single-trial `agent` was trained on) is not overwritten.
        trial_bandit = KArmedBandit(k=k)
        ag = Agent(trial_bandit, eps)

        ag.run_trial(num_steps)

        # After each trial, add the reward and optimal % history to an array
        reward_history_array.append(ag.reward_history)
        optimal_history_array.append(ag.optimal_history)

    # After running num_trials trials, take the mean of the histories and store them
    # in an array. axis=0 averages across trials, leaving one value per time step.
    mean_reward_history_array.append(
        np.mean(np.array(reward_history_array), axis=0)
    )
    mean_optimal_history_array.append(
        np.mean(np.array(optimal_history_array), axis=0)
    )

## Plot your results

In [None]:
# Trial-averaged reward per step, one curve per tested epsilon.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
# Curves are drawn in the order of epsilons_to_test so that they line up with the legend entries.
for eps, mean_rewards in zip(epsilons_to_test, mean_reward_history_array):
    ax.plot(mean_rewards)
ax.legend(list(map(str, epsilons_to_test)), fontsize=16)
ax.set_xlabel(xlabel="Steps", fontsize=16)
ax.set_ylabel(ylabel="Mean reward", fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

In [None]:
# Trial-averaged proportion of optimal-arm pulls per step, one curve per tested epsilon.
fig, ax = plt.subplots(figsize=(12, 8))
for idx, eps in enumerate(epsilons_to_test):
    ax.plot(mean_optimal_history_array[idx])
ax.legend([str(e) for e in epsilons_to_test], fontsize=16)
ax.set_xlabel("Steps", fontsize=16)
# The plotted values are proportions between 0 and 1, not percentages, so label them as such.
ax.set_ylabel("Proportion of optimal arm pulls", fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

# Extensions:

### 1. How do your estimates Q_t of the true action-value function q* converge over time? As a function of epsilon?

### 2. Optimistic initialisation: what happens when you initialise your Q values at 5 instead of 0? why?

### 3. Non-stationary q*(a): try adding a function that slightly modifies your bandit means after every time the agent acts. What happens now? (Hint: your Q_n update function should weight recent rewards more heavily than older ones. How can we achieve this?)