In [1]:

import numpy as np


class DeepContextualBandit:
    """Softmax policy over arms produced by a one-hidden-layer ReLU network.

    The network maps a context row vector to one logit per action. `update`
    performs a single policy-gradient step on the loss
    ``-r * log(softmax(logits)[a])`` for one (context, action, reward) sample.
    """

    def __init__(self, num_actions, context_size, hidden_size, learning_rate):
        """
        Args:
            num_actions (int): number of arms (output logits).
            context_size (int): length of a context vector.
            hidden_size (int): width of the hidden ReLU layer.
            learning_rate (float): step size handed to the Adam optimizer.
        """
        self.num_actions = num_actions
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

        # initialize weights for the neural network (scaled by 1/sqrt(fan_in))
        self.W1 = np.random.randn(context_size, hidden_size) / np.sqrt(context_size)
        self.b1 = np.zeros((1, hidden_size))

        self.W2 = np.random.randn(hidden_size, num_actions) / np.sqrt(hidden_size)
        self.b2 = np.zeros((1, num_actions))

        # initialize optimizer (Adam is defined further down in this cell; the
        # name is only resolved when an instance is created)
        self.optimizer = Adam(learning_rate)

    def predict(self, x):
        """Forward pass: return the (1, num_actions) logits for context `x`."""
        h1 = np.maximum(0, np.dot(x, self.W1) + self.b1)
        out = np.dot(h1, self.W2) + self.b2
        return out

    def update(self, x, a, r):
        """Take one policy-gradient step for a single observed interaction.

        Args:
            x (array-like): one context, shape (context_size,) or (1, context_size).
            a (int): index of the action that was played.
            r (float): reward received for that action.

        Returns:
            float: the loss ``-r * log(pi(a | x))`` evaluated before the step.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim == 1:
            # x.T is a no-op on 1-D input, which would break the W1 gradient below.
            x = x[None, :]

        # forward pass (hidden activations are reused for back-propagation)
        h1 = np.maximum(0, np.dot(x, self.W1) + self.b1)
        logits = np.dot(h1, self.W2) + self.b2
        probs = self.softmax(logits)

        # The loss uses the *probability* of the chosen action. The small
        # constant keeps log() finite if that probability underflows to 0.
        loss = -r * np.log(probs[0, a] + 1e-12)

        # d(loss)/d(logits) = r * (softmax(logits) - one_hot(a)).
        # softmax_grad already subtracts the one-hot, so it must not be
        # subtracted a second time here.
        grad_wrt_out = r * self.softmax_grad(logits, a)

        # back-propagate through the two layers
        grad_wrt_h1 = np.dot(grad_wrt_out, self.W2.T)
        grad_wrt_h1[h1 <= 0] = 0  # ReLU passes no gradient where it was inactive
        grad_wrt_W2 = np.dot(h1.T, grad_wrt_out)
        grad_wrt_b2 = np.sum(grad_wrt_out, axis=0, keepdims=True)
        grad_wrt_W1 = np.dot(x.T, grad_wrt_h1)
        grad_wrt_b1 = np.sum(grad_wrt_h1, axis=0, keepdims=True)

        # update the weights using Adam optimizer
        self.W1, self.b1, self.W2, self.b2 = self.optimizer.update(
            self.W1, self.b1, self.W2, self.b2,
            grad_wrt_W1, grad_wrt_b1, grad_wrt_W2, grad_wrt_b2
        )
        return float(loss)

    def softmax(self, x):
        """Numerically stable softmax along the last axis of `x`."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def softmax_grad(self, out, a):
        """Return ``softmax(out) - one_hot(a)`` for a single row of logits.

        This is the gradient of ``-log(softmax(out)[a])`` with respect to
        `out`; `out` is expected to have shape (1, num_actions).
        """
        grad = self.softmax(out)
        grad[0,a] -= 1
        return grad


class Adam:
    """Adam optimizer for exactly four parameter arrays (W1, b1, W2, b2).

    Keeps one first-moment and one second-moment estimate per parameter and
    applies the bias-corrected Adam step to the arrays in place.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Args:
            learning_rate (float): step size.
            beta1 (float): decay rate of the first-moment estimate.
            beta2 (float): decay rate of the second-moment estimate.
            epsilon (float): constant added to the denominator of the step.
        """
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Moment estimates are created lazily on the first update() call.
        self.m_t = None
        self.v_t = None
        self.t = 0

    def update(self, W1, b1, W2, b2, grad_wrt_W1, grad_wrt_b1, grad_wrt_W2, grad_wrt_b2):
        """Apply one Adam step and return the updated (W1, b1, W2, b2) tuple."""
        params = (W1, b1, W2, b2)
        grads = (grad_wrt_W1, grad_wrt_b1, grad_wrt_W2, grad_wrt_b2)

        self.t += 1

        if self.m_t is None:
            self.m_t = [np.zeros_like(p) for p in params]
            self.v_t = [np.zeros_like(p) for p in params]

        # Exponential moving averages of the gradient and the squared gradient.
        self.m_t[:] = [
            self.beta1 * m_prev + (1 - self.beta1) * g
            for m_prev, g in zip(self.m_t, grads)
        ]
        self.v_t[:] = [
            self.beta2 * v_prev + (1 - self.beta2) * (g ** 2)
            for v_prev, g in zip(self.v_t, grads)
        ]

        # Bias-correction denominators are shared by all four parameters.
        m_correction = 1 - self.beta1 ** self.t
        v_correction = 1 - self.beta2 ** self.t

        updated = []
        for param, m, v in zip(params, self.m_t, self.v_t):
            m_hat = m / m_correction
            v_hat = v / v_correction
            # Augmented assignment updates NumPy arrays in place.
            param -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
            updated.append(param)

        return tuple(updated)

In [None]:
# Hyper-parameters for the DeepContextualBandit experiment.
num_actions = 4
context_size = 10
hidden_size = 20
learning_rate = 0.01

# NOTE(review): ContextualBandit is not defined in the visible part of this
# notebook -- confirm it is defined or imported before this cell runs.
env = ContextualBandit(num_actions, context_size)
agent = DeepContextualBandit(num_actions, context_size, hidden_size, learning_rate)

for i in range(10000):
    # get a context from the environment
    context = env.get_context()
    # TODO: the loop stops here -- it still needs to choose an action, observe
    # its reward from `env`, and call agent.update(context, action, reward).

In [None]:
import numpy as np


class LinUCB:
    """
    LinUCB algorithm implementation.

    Keeps, for every arm, a design matrix ``A``, its inverse ``A_inv`` and a
    reward-weighted feature sum ``b``.
    """

    def __init__(self, obs_dim,action_n,alpha, context="user"):
        """
        Parameters
        ----------
        obs_dim : int
            length of the user (observation) feature vector
        action_n : int
            number of arms
        alpha : number
            LinUCB parameter; rounded to one decimal before use
        context: string
            'user' or 'both'(item+user): what to use as a feature vector.
            With 'both' the feature length is ``2 * obs_dim``, i.e. item
            feature vectors must also have length ``obs_dim``.

        Raises
        ------
        ValueError
            if `context` is neither 'user' nor 'both'
        """
        self.n_features =obs_dim
        if context == "user":
            self.context = 1
        elif context == "both":
            self.context = 2
            self.n_features *= 2
        else:
            # Without this, self.context would never be set and the first
            # choose_arm()/update() call would fail with an AttributeError.
            raise ValueError(
                "context must be 'user' or 'both', got " + repr(context)
            )

        self.A = np.array([np.identity(self.n_features)] * action_n)
        self.A_inv = np.array([np.identity(self.n_features)] * action_n)
        self.b = np.zeros((action_n, self.n_features, 1))
        self.alpha = round(alpha, 1)
        self.algorithm = "LinUCB (α=" + str(self.alpha) + ", context:" + context + ")"

    def choose_arm(self, context, pool_idx=None, features=None):
        """
        Returns the best arm's index relative to the pool
        Parameters
        ----------
        context : array
            user features, length ``obs_dim``
        pool_idx : array of indexes, optional
            arms that may be chosen; defaults to every arm, in which case the
            returned index is the arm index itself
        features : array, optional
            per-arm item features, indexed by arm; required when the model
            was built with context='both'

        Raises
        ------
        ValueError
            if the pool is empty, or `features` is missing in 'both' mode
        """
        if pool_idx is None:
            pool_idx = np.arange(self.A.shape[0])
        pool_idx = np.asarray(pool_idx, dtype=int)

        n_pool = len(pool_idx)
        if n_pool == 0:
            raise ValueError("pool_idx must contain at least one arm")

        A_inv = self.A_inv[pool_idx]
        b = self.b[pool_idx]

        # Same user vector repeated once per candidate arm.
        user = np.array([np.asarray(context, dtype=float).ravel()] * n_pool)
        if self.context == 1:
            x = user
        else:
            if features is None:
                raise ValueError("features are required when context='both'")
            x = np.hstack((user, np.asarray(features)[pool_idx]))

        x = x.reshape(n_pool, self.n_features, 1)

        theta = A_inv @ b

        # Upper confidence bound: estimated reward + alpha * exploration width.
        p = np.transpose(theta, (0, 2, 1)) @ x + self.alpha * np.sqrt(
            np.transpose(x, (0, 2, 1)) @ A_inv @ x
        )
        return int(np.argmax(p))

    def update(self, context, action, reward,features=None):
        """
        Updates algorithm's parameters(matrices) : A,b
        Parameters
        ----------
        context : array
            user features, length ``obs_dim``
        action : index
            index of the arm that was played (indexes A, b and `features`)
        reward : number
            observed reward for that arm
        features : array, optional
            per-arm item features; required when context='both'

        Raises
        ------
        ValueError
            if `features` is missing in 'both' mode
        """

        a = action # displayed article's index
        user = np.asarray(context, dtype=float).ravel()
        if self.context == 1:
            x = user
        else:
            if features is None:
                raise ValueError("features are required when context='both'")
            x = np.hstack((user, np.asarray(features)[a]))

        x = x.reshape((self.n_features, 1))

        self.A[a] += x @ x.T
        self.b[a] += reward * x
        # Keep the cached inverse in sync with A for the arm just updated.
        self.A_inv[a] = np.linalg.inv(self.A[a])





In [59]:
'''
Basic LinUCB implementation.
'''

# Python imports.
import numpy as np
from collections import defaultdict

# Other imports.
#from simple_rl.agents.AgentClass import Agent
''' AgentClass.py: Class for a basic RL Agent '''

# Python imports.
from collections import defaultdict

class Agent(object):
    ''' Abstract Agent class. '''

    def __init__(self, name, actions, gamma=0.99):
        '''
        Args:
            name (str): label returned by get_name() and str().
            actions (iterable): available actions; stored as a list.
            gamma (float): stored on the instance; not used by this base class.
        '''
        self.name = name
        self.actions = list(actions) # Just in case we're given a numpy array (like from Atari).
        self.gamma = gamma
        self.episode_number = 0
        # reset() assigns step_number, so define it up front as well; otherwise
        # the attribute does not exist until reset() has been called once.
        self.step_number = 0
        self.prev_state = None
        self.prev_action = None

    def get_parameters(self):
        '''
        Returns:
            (dict) key=param_name (str) --> val=param_val (object).
        '''
        return {}

    def act(self, state, reward):
        '''
        Args:
            state (State): see StateClass.py
            reward (float): the reward associated with arriving in state @state.

        Returns:
            (str): action. This base implementation returns None; subclasses
            are expected to override it.
        '''
        pass

    def policy(self, state):
        ''' Returns act(state, 0), i.e. the action for @state with zero reward. '''
        return self.act(state, 0)

    def reset(self):
        '''
        Summary:
            Resets the agent back to its tabula rasa config
            (prior pointers cleared, step counter back to zero).
        '''
        self.prev_state = None
        self.prev_action = None
        self.step_number = 0

    def end_of_episode(self):
        '''
        Summary:
            Resets the agents prior pointers and advances the episode counter.
        '''
        self.prev_state = None
        self.prev_action = None
        self.episode_number += 1

    def set_name(self, name):
        self.name = name

    def get_name(self):
        return self.name

    def __str__(self):
        return str(self.name)


class LinUCBAgent(Agent):
    '''
    From:
        Lihong Li, et al. "A Contextual-Bandit Approach to Personalized
        News Article Recommendation." In Proceedings of the 19th
        International Conference on World Wide Web (WWW), 2010.
    '''

    def __init__(self, actions, name="LinUCB", rand_init=True, context_size=1, alpha=1.5):
        '''
        Args:
            actions (list): Contains a string for each action.
            name (str)
            rand_init (bool): If True, theta starts at uniform random values
                instead of zeros.
            context_size (int)
            alpha (float): Uncertainty parameter.
        '''
        Agent.__init__(self, name, actions)
        self.alpha = alpha
        self.context_size = context_size
        self.prev_context = None
        self.step_number = 0
        self.rand_init = rand_init
        self._init_action_model(rand_init)


    def get_parameters(self):
        '''
        Returns:
            (dict) key=param_name (str) --> val=param_val (object).
        '''
        param_dict = defaultdict(int)

        param_dict["rand_init"] = self.rand_init
        param_dict["context_size"] = self.context_size
        param_dict["alpha"] = self.alpha

        return param_dict

    def _init_action_model(self, rand_init=True):
        '''
        Summary:
            Initializes model parameters: for every action index an identity
            design matrix ('act'), its inverse ('act_inv'), a coefficient
            vector ('theta') and a zero reward-weighted context sum ('b').
        '''
        self.model = {'act': {}, 'act_inv': {}, 'theta': {}, 'b': {}}
        for action_id in range(len(self.actions)):
            self.model['act'][action_id] = np.identity(self.context_size)
            self.model['act_inv'][action_id] = np.identity(self.context_size)
            if rand_init:
                self.model['theta'][action_id] = np.random.random((self.context_size, 1))
            else:
                self.model['theta'][action_id] = np.zeros((self.context_size, 1))
            self.model['b'][action_id] = np.zeros((self.context_size,1))

    def _compute_score(self, context):
        '''
        Args:
            context: indexable by action index; context[i] is the feature
                vector for action i.

        Returns:
            (dict):
                K (int): action index
                V (float): score (estimated reward + alpha * uncertainty)
        '''

        a_inv = self.model['act_inv']
        theta = self.model['theta']

        score_dict = {}
        for action_id in range(len(self.actions)):
            action_context = np.reshape(context[action_id], (-1, 1))
            # Both products are (1, 1) arrays. Use .item() to extract the
            # scalar: float() on an array with ndim > 0 is deprecated in NumPy.
            estimated_reward = theta[action_id].T.dot(action_context).item()
            variance = action_context.T.dot(a_inv[action_id]).dot(action_context).item()
            uncertainty = float(self.alpha * np.sqrt(variance))
            score_dict[action_id] = estimated_reward + uncertainty

        return score_dict

    def update(self, reward):
        '''
        Args:
            reward (float)

        Summary:
            Updates self.model according to self.prev_context, self.prev_action, @reward.

        Raises:
            ValueError: if no action has been taken yet, so there is nothing
                to attribute @reward to.
        '''
        if self.prev_action is None or self.prev_context is None:
            raise ValueError("update() called before any action was taken; call act() first.")

        action_id = self.actions.index(self.prev_action)
        action_context = np.reshape(self.prev_context[action_id], (-1, 1))
        self.model['act'][action_id] += action_context.dot(action_context.T)
        self.model['act_inv'][action_id] = np.linalg.inv(self.model['act'][action_id])
        self.model['b'][action_id] += reward * action_context
        self.model['theta'][action_id] = self.model['act_inv'][action_id].dot(self.model['b'][action_id])

    def act(self, context, reward):
        '''
        Args:
            context (iterable): State-like object or plain sequence; either a
                single feature vector shared by all actions or one feature
                vector per action.
            reward (float): reward for the action returned by the previous
                call; ignored on the first call.

        Returns:
            (str): action.
        '''

        # Update previous context-action pair.
        if self.prev_action is not None:
            self.update(reward)

        # Compute score.
        context = self._pre_process_context(context)
        score = self._compute_score(context)

        # Compute best action. The random draw is only kept when no score
        # compares greater than -inf (e.g. all scores are NaN); ties go to the
        # lowest action index.
        best_action = np.random.choice(self.actions)
        max_score = float("-inf")
        for action_id in range(len(self.actions)):
            if score[action_id] > max_score:
                max_score = score[action_id]
                best_action = self.actions[action_id]


        # Update prev pointers.
        self.prev_action = best_action
        self.prev_context = context
        self.step_number += 1

        return best_action

    def _pre_process_context(self, context):
        '''
        Summary:
            Normalizes @context so that context[action_id] yields the feature
            vector for that action.
        '''
        # Only State-like objects expose get_num_feats()/features(); plain
        # lists and arrays skip this step instead of raising AttributeError.
        if hasattr(context, 'get_num_feats') and context.get_num_feats() == 1:
            # If there's no context (that is, we're just in a regular bandit).
            context = context.features()

        if not hasattr(context[0], '__iter__'):
            # If we only have a single context, share it across all actions.
            context = {action_id: context for action_id in range(len(self.actions))}

        return context

In [60]:
# Arm labels for the agent (alternative labels tried: [0, 1, 2]).
actions = ["DC1", "DC2", "DC3"]

# LinUCB agent over those arms with 4-dimensional contexts.
test = LinUCBAgent(
    actions,
    name="LinUCB",
    rand_init=True,
    context_size=4,
    alpha=1.5,
)

In [61]:
# Show the hyper-parameters the agent reports (rand_init, context_size, alpha).
test.get_parameters()



defaultdict(int, {'rand_init': True, 'context_size': 4, 'alpha': 1.5})

In [62]:
# Python imports
from collections.abc import Sequence

import numpy as np

''' StateClass.py: Contains the State Class. '''

class State(Sequence):
    ''' Abstract State class '''

    def __init__(self, data=None, is_terminal=False):
        '''
        Args:
            data (sequence): the state's payload; defaults to a new empty list.
            is_terminal (bool)
        '''
        # A literal [] default would be one list shared by every State()
        # created without data, so create a fresh list per instance instead.
        self.data = [] if data is None else data
        self._is_terminal = is_terminal

    def features(self):
        '''
        Summary
            Used by function approximators to represent the state.
            Override this method in State subclasses to have function
            approximators use a different set of features.
        Returns:
            (iterable): self.data flattened into a 1-D numpy array.
        '''
        return np.array(self.data).flatten()

    def get_data(self):
        return self.data

    def get_num_feats(self):
        return len(self.features())

    def is_terminal(self):
        return self._is_terminal

    def set_terminal(self, is_term=True):
        self._is_terminal = is_term

    def __hash__(self):
        if type(self.data).__module__ == np.__name__:
            # Numpy arrays
            return hash(str(self.data))
        elif self.data.__hash__ is None:
            try:
                return hash(tuple(self.data))
            except TypeError:
                # Elements are themselves unhashable (e.g. a list of numpy
                # arrays); fall back to the string form, as for numpy data.
                return hash(str(self.data))
        else:
            return hash(self.data)

    def __str__(self):
        return "s." + str(self.data)

    def __eq__(self, other):
        if not isinstance(other, State):
            return False
        try:
            return bool(self.data == other.data)
        except ValueError:
            # Numpy compared element-wise (ambiguous truth value) or the
            # shapes were incompatible; compare whole arrays below instead.
            pass
        try:
            return bool(np.array_equal(self.data, other.data))
        except (TypeError, ValueError):
            return False

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

# A single shared context: one flat 4-feature vector.
feat = State([1, 0, 0, 1])
feat.features()

# One 4-feature context vector per arm (row i belongs to arm i).
arms = [
    np.array(row)
    for row in (
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 1.0],
    )
]

feat2 = State(arms)

# Ask the agent for an action given the per-arm contexts.
test.act(feat2, 1.0)

'DC3'

In [72]:
# context[1] goes through State.__getitem__, i.e. the feature vector stored for
# arm index 1; reshape it to a column and transpose to view it as a row vector.
context = feat2
np.reshape(context[1], (-1, 1)).T

array([[0., 1., 0., 0.]])

In [74]:
# Outer product x @ x.T of arm 1's context column -- the term that
# LinUCBAgent.update() adds to that arm's 'act' matrix.
action_context = np.reshape(context[1], (-1, 1))
action_context.dot(action_context.T)

array([[0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [58]:
# features() flattens all per-arm vectors of feat2 into one 1-D array.
feat2.features()

array([1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1.])

In [65]:
# Each act() call first feeds the given reward (1.0) into update() for the
# previously chosen arm, then re-scores the arms and returns the best one.
for i in range(10):
    print(test.act(feat2,1.0))

DC3
DC3
DC3
DC3
DC3
DC3
DC3
DC3
DC3
DC3


In [64]:
# Apply ten extra updates with reward 10.0; update() attributes them to the
# arm and context remembered from the most recent act() call.
# NOTE(review): this cell's execution count is lower than that of the cell
# above it, so it appears to have been run out of order -- confirm the
# intended order before relying on the displayed model values.
for i in range(10):
    test.update(10.0)

In [66]:
# Display the per-arm model dictionaries ('act', 'act_inv', 'theta', 'b').
test.model

{'act': {0: array([[6., 0., 0., 5.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [5., 0., 0., 6.]]),
  1: array([[1., 0., 0., 0.],
         [0., 2., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]),
  2: array([[ 1.,  0.,  0.,  0.],
         [ 0.,  1.,  0.,  0.],
         [ 0.,  0., 25., 24.],
         [ 0.,  0., 24., 25.]])},
 'act_inv': {0: array([[ 0.54545455,  0.        ,  0.        , -0.45454545],
         [ 0.        ,  1.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  1.        ,  0.        ],
         [-0.45454545,  0.        ,  0.        ,  0.54545455]]),
  1: array([[1. , 0. , 0. , 0. ],
         [0. , 0.5, 0. , 0. ],
         [0. , 0. , 1. , 0. ],
         [0. , 0. , 0. , 1. ]]),
  2: array([[ 1.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  1.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.51020408, -0.48979592],
         [ 0.        ,  0.        , -0.48979592,