# Exercise 2.5:
![Exercise 2.5](images/Exercise2.5-Sutton.jpeg)

In [52]:
import numpy as np

class Bandit:
    """Non-stationary k-armed bandit.

    Every arm starts with the same true value ``mu``. After each successful
    pull, all true values q*(a) take an independent normally distributed
    step (mean 0, standard deviation 0.01), so the best arm drifts over time.
    """

    def __init__(self, mu = 1, k = 10):
        """
        By default creates an array of size k and mean of 1 which represents the q*(a).
        """
        self.Qstars = np.ones(k) * mu

    def __step(self):
        """Apply one random-walk increment to every true action value."""
        # One vectorised draw instead of a Python-level loop over the arms.
        self.Qstars += np.random.normal(0, 0.01, size=self.Qstars.shape)

    def pull_a_bandit(self, lever):
        """Pull one arm and return its noisy reward.

        The reward is drawn from a normal distribution centred on the current
        true value of ``lever`` with standard deviation 1. The true values of
        all arms are then perturbed by one random-walk step.

        Args:
            lever: Index of the arm to pull; must satisfy
                ``0 <= lever < number of arms``.

        Returns:
            The sampled reward.

        Raises:
            IndexError: If ``lever`` is outside the valid range. No random
                walk step is taken in that case.
        """
        if not 0 <= lever < len(self.Qstars):
            raise IndexError(f"Number {lever} out of range")
        # Sample the reward before the walk so it reflects the value the
        # arm had at the moment it was pulled.
        reward = np.random.normal(self.Qstars[lever], 1)
        self.__step()
        return reward

    def print_bandit_content(self):
        """Print the current true action values q*(a)."""
        print(self.Qstars)

In [53]:
class Bandit_Agent:
    """Sample-average action-value agent for a k-armed bandit.

    The agent keeps an estimate Q(a) and a pull count N(a) per arm and
    updates the estimate of the chosen arm with the incremental
    sample-average rule ``Q <- Q + (R - Q) / N``.

    Rewards are obtained by calling the module-level function
    ``ten_armed_bandit(action_index)``, which must be defined before either
    update method is called.
    """

    def __init__(self, k=10):
        """Create an agent with all estimates and counts set to zero.

        Args:
            k: Number of arms. Defaults to 10.
        """
        self.action_values = np.zeros(k) # Q(a)
        self.counts = np.zeros(k) # N(a)
        self.iteration_number = 0
        self.total_reward = 0

    def _average_reward(self):
        """Return total reward per iteration, or NaN before any iteration."""
        if self.iteration_number == 0:
            return float("nan")
        return self.total_reward / self.iteration_number

    def _play(self, action_index):
        """Pull ``action_index``, fold the reward into the estimates, return the running average."""
        self.iteration_number += 1
        reward = ten_armed_bandit(action_index) # Get the reward from the bandit
        self.total_reward += reward # Add the reward to the accumulated reward
        self.counts[action_index] += 1 # Increment the number of that action
        # Incremental sample average: move Q(a) toward the reward by 1/N(a).
        error = reward - self.action_values[action_index]
        self.action_values[action_index] += error / self.counts[action_index]
        return self._average_reward()

    def greedy_update(self):
        """Play one step choosing the arm with the highest estimate.

        Ties are resolved by ``np.argmax``, i.e. the lowest index wins.

        Returns:
            The average reward per iteration accumulated so far.
        """
        action_index = np.argmax(self.action_values) # Choose the best action
        return self._play(action_index)

    def epsilon_greedy_update(self, epsilon=0.01):
        """Play one step, exploring a uniformly random arm with probability ~epsilon.

        Args:
            epsilon: Exploration probability; the greedy arm is chosen when a
                uniform draw from [0, 1) is strictly greater than ``epsilon``.

        Returns:
            The average reward per iteration accumulated so far.
        """
        if np.random.rand() > epsilon:
            action_index = np.argmax(self.action_values) # Choose the best action
        else:
            # Choose a random action among all arms.
            action_index = np.random.randint(0, len(self.action_values))
        return self._play(action_index)

    def obj_print(self):
        """Print the estimates, the average reward so far, and the best estimate.

        Safe to call before any update: the average is printed as ``nan``
        instead of raising ``ZeroDivisionError``.
        """
        print(f"{self.action_values},avg_reward={self._average_reward()},best_action={self.best_action()}")

    def best_action(self):
        """Return the largest current action-value estimate.

        NOTE(review): despite the name, this returns the estimate's value
        (``np.max``), not the index of the arm; confirm which one callers
        expect before changing it.
        """
        return np.max(self.action_values)

In [55]:
# Smoke test of the environment: build a default bandit and repeatedly pull
# the same lever, showing each sampled reward followed by the drifting q*(a).
bandit = Bandit()

for _ in range(1000):
    x = bandit.pull_a_bandit(7)  # lever 7 is fixed; only the rewards vary
    print("x = ",x)
    bandit.print_bandit_content()

x =  0.9603662992170114
[1.00539413 0.97745887 1.00232099 0.98810408 1.0071583  0.9943158
 1.00336097 0.99995996 0.99830949 1.01057432]
x =  -0.21676749670123718
[0.99144805 0.97762723 0.99577508 0.98827366 1.00930801 0.99393345
 0.99487648 0.9886227  0.99945395 1.0243175 ]
x =  0.4479066015067975
[0.99284652 0.97326084 1.00230085 0.98945641 1.01218221 0.9991408
 0.99508942 0.97716153 0.99350535 1.0168041 ]
x =  2.2247248800657142
[0.99082094 0.9717055  0.9919284  0.99151722 1.0070828  0.99980767
 1.01345338 0.96450602 0.99990487 1.0264456 ]
x =  0.10619755647144913
[0.98060321 0.99755165 1.00124638 1.01265552 0.99803962 0.99622726
 1.02564302 0.94963816 1.00814547 1.01563082]
x =  0.0018115128731891472
[0.97613362 0.97551386 0.98968477 1.01804893 1.00371924 1.01480385
 1.03002099 0.93415553 0.99802085 1.00915781]
x =  0.7930362853476657
[0.97931166 0.97823339 0.97559746 1.01634624 1.00755652 1.00782993
 1.02889883 0.91892471 0.99013621 1.01519779]
x =  1.9104425184047993
[0.98800352 0

 0.86198795 0.87592097 0.99355275 0.93485898]
x =  1.219726657442463
[1.09171507 0.82462833 1.13165385 0.84898882 0.84000375 0.90792029
 0.85935115 0.86537576 0.97306759 0.94110592]
x =  -1.3608267250948543
[1.10289812 0.82310051 1.14482556 0.85053793 0.81953998 0.9004794
 0.85414714 0.87604583 0.9837768  0.93036707]
x =  1.0540968723614768
[1.09992723 0.81238524 1.1454483  0.84471551 0.8137211  0.90236207
 0.87201022 0.873311   0.98850601 0.93482742]
x =  0.16675005538815246
[1.09510701 0.80614957 1.1442747  0.8502001  0.80749035 0.8878476
 0.89321641 0.88585976 0.99566303 0.92398738]
x =  1.3073995875408244
[1.09549834 0.8032763  1.14559507 0.83654785 0.8222739  0.86515062
 0.89899809 0.86679897 0.98552935 0.92760801]
x =  0.5795178282713715
[1.11041957 0.80309367 1.15207662 0.8326961  0.81356114 0.85682187
 0.89100096 0.87460784 0.97007377 0.93213976]
x =  1.3028891336396546
[1.10680543 0.80708166 1.13282305 0.82971327 0.80897544 0.87435101
 0.88801512 0.87144463 0.97863442 0.920934

 1.01121227 0.65059062 1.29539118 1.12011263]
x =  1.5245217392130397
[0.81981719 0.84558437 1.33645302 0.96392118 0.51551677 1.08299652
 1.00978731 0.63983212 1.30388798 1.11082289]
x =  1.0851626795533948
[0.80902193 0.84914075 1.35691797 0.98459056 0.51947096 1.07227242
 1.00901844 0.61604069 1.29352133 1.10875391]
x =  0.172200539646098
[0.81858987 0.84913567 1.34653188 0.9915861  0.52382124 1.05944126
 1.00767982 0.59932749 1.27894759 1.11317491]
x =  1.86074283809528
[0.81333307 0.8638302  1.33442297 0.98777051 0.50330124 1.05934613
 1.01689834 0.59100185 1.27457499 1.10882053]
x =  0.9764886184687782
[0.8273053  0.87039256 1.32910999 0.98019551 0.48715368 1.05501416
 1.03409163 0.58271307 1.28613492 1.11028781]
x =  1.2823098013593057
[0.81862731 0.86784779 1.32402215 0.97985554 0.49548455 1.07384592
 1.04211844 0.59092907 1.27934069 1.11765494]
x =  -0.3354030396475134
[0.81219335 0.86210184 1.32749636 0.98147971 0.48868633 1.08321458
 1.02361152 0.57572517 1.28634894 1.0947593