## Ex 1: another version

In [1]:
import numpy as np
import time
from matplotlib import pyplot as plt
plt.style.use('ggplot')

In [2]:
class SimpleBandit:
    '''
    The bandit class you will use in this homework. DO NOT modify
    '''
    def __init__(self):
        self._mu = np.array([-1.,-2.,1.5,0.5,-0.25,.75,.1,1.8,-3])
        self._p = 1 / (1 + np.exp(-self._mu))
        self.num_arms = len(self._mu)
        self.total_rewards = np.zeros(len(self._mu))
        
    def pull(self,arms):
        self.current_rewards = np.random.binomial(1,self._p)
        self.total_rewards += self.current_rewards
        return self.current_rewards[arms]

In [3]:
class RandomPolicy:
    """
    Random policy, pure exploration. DO NOT modify
    """
    def __init__(self, num_arms):
        self.num_arms = num_arms
        self.current_arm = None
        
    def select_arm(self):
        """
        choose which arm to pull
        """
        self.current_arm = np.random.randint(self.num_arms)
        return self.current_arm
    
    def update_reward(self, reward):
        """
        enter observed reward
        """
        return None

In [4]:
def bak_run_trajectory(bandit, policies, T):
    """
    Run T steps of bandit pulling each policy in each time step
    
    Arguments: 
    bandit: 
        a fresh instance of a Bandit class, 
        in this homework you will be always using an instance from RandomPolicy class
    
    policies:
        a list like [policy1, policy2, policy3 ...]
        each of the policy will have select_arm method and update_reward method
    
    Output: 
        regret of each policy in list, like
        [regret1, regret2, ...]
    """
    regrets_list = []  # per policy
    i = 0  # policy index 
    for policy in policies:  # iterate all policies 
        bandit = SimpleBandit()  # reset the simulation
        num_arms = policy.num_arms  # arms space in each policy 
        selected_arms = [policy.select_arm() for _ in range(T)]  # select random arm T times under each policy
        print(f"The {i}th policy selected these arms: {selected_arms} under {T} time steps")
        arms_current_rewards = [bandit.pull(arm) for arm in selected_arms]  # for a specific time step
        actual_rewards = sum(arms_current_rewards)
        print(f"The current binary rewards for {T} arms: {arms_current_rewards}")
        print(f"The actually rewards received from the {i}th pilicy: {actual_rewards}")
        print(f"The potential total rewards under the {i}th policy for each arm: {bandit.total_rewards}")
        regret_i = np.amax(bandit.total_rewards) - actual_rewards
        print(f"The regrets under the {i}th policy: {regret_i}")
        regrets_list.append(regret_i)
        i += 1
    return regrets_list

In [6]:
""" DEMO run_trajectory
"""
bandit = SimpleBandit()
p1 = RandomPolicy(3)
p2 = RandomPolicy(4)
p3 = RandomPolicy(5)
regrets = [bak_run_trajectory(SimpleBandit(), [p1, p2, p3], 15)] #for _ in range(10)]
print(regrets)
np.mean(regrets)

The 0th policy selected these arms: [2, 2, 2, 1, 1, 1, 2, 2, 1, 0, 0, 2, 1, 2, 2] under 15 time steps
The current binary rewards for 15 arms: [1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0]
The actually rewards received from the 0th pilicy: 7
The potential total rewards under the 0th policy for each arm: [ 3.  0. 12.  6.  4. 13. 10. 14.  1.]
The regrets under the 0th policy: 7.0
The 1th policy selected these arms: [3, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 0, 3, 3] under 15 time steps
The current binary rewards for 15 arms: [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1]
The actually rewards received from the 1th pilicy: 11
The potential total rewards under the 1th policy for each arm: [ 4.  2. 14. 11.  5.  8.  9. 14.  1.]
The regrets under the 1th policy: 3.0
The 2th policy selected these arms: [3, 0, 3, 4, 4, 4, 3, 3, 4, 1, 1, 4, 2, 1, 2] under 15 time steps
The current binary rewards for 15 arms: [1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
The actually rewards received from the 2th pilicy: 

6.666666666666667