In [3]:
import tensorflow as tf
import numpy as np
import sys
import pickle

from model import PMF, DRRAveStateRepresentation, Actor, Critic

from utils.prioritized_replay_buffer import NaivePrioritizedReplayMemory, Transition
from utils.history_buffer import HistoryBuffer
from utils.general import export_plot

In [79]:
class DRRTrainer(object):
    def __init__(self,
                 config,
                 actor_function,
                 critic_function,
                 state_rep_function,
                 reward_function,
                 users,
                 items,
                 train_data,
                 test_data,
                 user_embeddings,
                 item_embeddings):
        
        ## importing reward function
        self.reward_function = reward_function
        ## importing training and testing data
        self.train_data = train_data
        self.test_data = test_data
        ## importing users and items
        self.users = users
        self.items = items
        ## importing user and item embeddings
        self.user_embeddings = user_embeddings
        self.item_embeddings = item_embeddings
        ## declaring index identifier for dataset
        ## u for user, i for item, r for reward/rating
        self.u = 0
        self.i = 1
        self.r = 2
        
        ## dimensions
        ## self.item_embeddings already hold the weights array
        ## this should be 100
        self.item_features = self.item_embeddings.shape[1]
        self.user_features = self.user_embeddings.shape[1]
        
        ## number of user and items
        self.n_items = self.item_embeddings.shape[0]
        self.n_users = self.user_embeddings.shape[0]
        
        ## the shape of state space, action space
        ## this should be 300
        self.state_shape = 3 * self.item_features
        ## this should be 100
        self.action_shape = self.item_features
        
        self.critic_output_shape = 1
        self.config = config
        ## Data dimensions Extracted
        
        ## instantiate a drravestaterepresentation
        self.state_rep_net = state_rep_function(self.config.history_buffer_size,
                                                self.item_features,
                                                self.user_features)
        
        ## instantiate actor and target actor networks
        self.actor_net = actor_function(self.state_shape, self.action_shape)                                
        self.target_actor_net = actor_function(self.state_shape, self.action_shape)
        
        ## instantiate critic and target critics networks
        self.critic_net = critic_function(self.action_shape,
                                          self.state_shape,
                                          self.critic_output_shape)
        
        self.target_critic_net = critic_function(self.action_shape,
                                                 self.state_shape,
                                                 self.critic_output_shape)
        print("Actor-Critic model has successfully instantiated")
        
        self.state_rep_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.config.lr_state_rep)
        
        self.actor_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.config.lr_actor)
        
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.config.lr_critic)
        
        print("DRR Instantiazed")
        
    def learn(self):
        # Initialize buffers
        print("NPRM and History Buffer Initialized")
        replay_buffer = NaivePrioritizedReplayMemory(self.config.replay_buffer_size,
                                                     prob_alpha=self.config.prob_alpha)

        history_buffer = HistoryBuffer(self.config.history_buffer_size)
        
        # Initialize trackers
        # initialize timesteps and epoch
        timesteps = 0
        epoch = 0
        ## this variable is for episode
        eps_slope = abs(self.config.eps_start - self.config.eps)/self.config.eps_steps
        eps = self.config.eps_start
        ## this variable is to hold the losses along the time
        actor_losses = []
        critic_losses = []
        ## this variable is to hold the episodic rewards
        epi_rewards = []
        epi_avg_rewards = []
        
        e_arr = []
        
        ## this variable holds the user index
        ## got from the dictionary
        user_idxs = np.array(list(self.users.values()))
        np.random.shuffle(user_idxs)
        
        ## loop all the users based on indexes
        for idx, e in enumerate(user_idxs):
            ## starting the episodes
            
            ## the loops stop when timesteps-learning_start
            ## is bigger than the max timesteps
            if timesteps - self.config.learning_start > self.config.max_timesteps_train:
                break
            
            ## extracting positive user reviews
            ## e variable is an element right now
            user_reviews = self.train_data[self.train_data[:, self.u] == e]
            pos_user_reviews = user_reviews[user_reviews[:, self.r] > 0]
            
            ## check if the user ratings doesn't have enough positive review
            ## in this case history_buffer_size is 4
            ## get the shape object and 0 denote the row index
            if pos_user_reviews.shape[0] < self.config.history_buffer_size:
                continue
                
            candidate_items = tf.identity(tf.stop_gradient(self.item_embeddings))
            
            ## extracting user embedding tensors
            user_emb = self.user_embeddings[e]
            
            ## fill history buffer with positive item embeddings
            ## and remove item embeddings from candidate item sets
            ignored_items = []
            
            ## history_buffer_size has size of n items
            ## in this case 5
            for i in range(self.config.history_buffer_size):
                emb = candidate_items[tf.cast(pos_user_reviews[i, self.i], dtype='int32')]
                history_buffer.push(tf.identity(tf.stop_gradient(emb)))
                
            ## initialize rewards list
            rewards = []
            
            ## starting item index
            t = 0
            
            ## declaring the needed variable
            state = None
            action = None
            reward = None
            next_state = None
            
            while t < self.config.episode_length:
                ## observing the current state
                ## choose action according to actor network explorations
                
                ## inference calls start here
                
                if eps > self.config.eps:
                    eps -= eps_slope
                else:
                    eps = self.config.eps
                
                ## state is the result of DRRAve model inference
                ## history_buffer.to_list get the list of previous items
                ## state representaton has the size (300, )
                state = self.state_rep_net(user_emb, tf.stack(history_buffer.to_list()))
                if np.random.uniform(0, 1) < eps:
                    action = tf.convert_to_tensor(np.random.rand(self.action_shape), dtype='float32')
                    ranking_scores = candidate_items @ tf.reshape(action, (action.shape[0], 1))
                else:
                    action = self.actor_net(tf.stop_gradient(state), training=False)
                    ranking_scores = candidate_items @ tf.reshape(action, (action.shape[1], 1))
                    
                ## calculating ranking scores accross items, discard ignored items
                
                if len(ignored_items) > 0:
                    rec_items = tf.stack(ignored_items)
                else:
                    rec_items = []
                
                ## setting value of negative infinite
#                 ranking_scores[rec_items] = -sys.maxsize -1
                
                ## get the recommended items
                ## first get the maximum value index
                ## then get the items by index from candidate items
                ranking_scores = tf.reshape(ranking_scores, (ranking_scores.shape[0],))
                rec_item_idx = tf.math.argmax(ranking_scores)
                rec_item_emb = candidate_items[rec_item_idx]
                
                ## get the item reward
                if tf.cast(rec_item_idx, 'float32') in user_reviews[:, self.i]:
                    ## get the reward from rating in the dataset
                    ## if the user is rating the item
                    reward = user_reviews[user_reviews[:, self.i] == rec_item_idx, self.r]
                else:
                    if self.config.zero_reward:
                        reward = tf.convert_to_tensor(0)
                    else:
                        reward = self.reward_function(tf.convert_to_tensor(e), rec_item_idx)
                
                ## track the episode rewards
                rewards.append(reward.item())
                
                ## add item to history buffer if positive reviews
                if reward > 0:
                    history_buffer.push(tf.stop_gradient(rec_item_emb))
                    
                    next_state = self.state_rep_net(user_emb, tf.stack(history_buffer.to_list()), training=False)
                else:
                    ## keep the history buffer the same
                    ## the next state is the current state
                    next_state = tf.stop_gradient(state)
                
                ## remove new items from future recommendation
                ignored_items.append(tf.conver_to_tensor(rec_item_idx))
                
                ## add the (state, action, reward, next_state)
                ## to the experience replay
                replay_buffer.push(state, action, next_state, reward)
                
                ## Inference calling stops here
                ## Training start here
                if(timesteps > self.config.learning_start) and (len(replay_buffer) >= self.config.batch_size) and (timesteps % self.config.learning_freq == 0):
                    critic_loss, actor_loss, critic_params_norm = self.training_step(timesteps,
                                                                                     replay_buffer,
                                                                                     True
                                                                                     )
                    ## storing the losses along the time
                    actor_losses.append(actor_loss)
                    critic_losses.append(critic_loss)
                    
                    ## outputting the result
                    if timesteps % self.config.log_freq == 0:
                        if len(rewards) > 0:
                            print(
                                f'Timestep {timesteps - self.config.learning_start} | '
                                f'Episode {epoch} | '
                                f'Mean Ep R '
                                f'{np.mean(rewards):.4f} | '
                                f'Max R {np.max(rewards):.4f} | '
                                f'Critic Params Norm {critic_params_norm:.4f} | '
                                f'Actor Loss {actor_loss:.4f} | '
                                f'Critic Loss {critic_loss:.4f} | ')
                            sys.stdout.flush()
            
                ## housekeeping
                t += 1
                timesteps += 1
            
                ## end of timesteps
            ## end of episodes
            if timesteps - self.config.learning_start > t:
                epoch += 1
                e_arr.append(epoch)
                epi_rewards.append(np.sum(rewards))
                epi_avg_rewards.append(np.mean(rewards))
        
        print("Training Finished")
        return actor_losses, critic_losses, epi_avg_rewards
    
    def training_step(self, t, replay_buffer, training):
        ## create batches
        ## from utils programs
        # Create batches by calling sample methods
        transitions, indicies, weights = replay_buffer.sample(self.config.batch_size, beta=self.config.beta)
        ## create the tuple using Transition function
        batch = Transition(*zip(*transtitions))
        
        ## preparing the batch for each data
        ## the concat function will flatten the data
        ## the reshape will reshape the data so that it receive 64 rows
        next_state_batch = tf.reshape(tf.concat(batch.next_state), (self.config.batch_size, -1))
        state_batch = tf.reshape(tf.concat(batch.state), (self.config.batch_size, -1))
        action_batch = tf.reshape(tf.concat(batch.action), (self.config.batch_size, -1))
        reward_batch = tf.reshape(tf.concat(batch.reward), (self.config.batch_size, -1))
        
        ## updating the critic networks
        with tf.GradientTape() as tape:
            critic_loss, new_priorities = self.compute_prioritized_dqn_loss(state_batch.detach(),
                                                                            action_batch,
                                                                            reward_batch,
                                                                            next_state_batch,
                                                                            weights)
        ## apply the gradient
        grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
        ## step the optimizers
        self.critic_optimizer.apply_gradients(zip(grads, self.critic_net.trainable_variables))
        
        ## updating the actor networks
        with tf.GradientTape() as tape:
            actions_pred = self.actor_net(state_batch)
            actor_loss = -self.critic_net(tf.stop_gradient(state_batch), actions_pred).numpy().mean()
        
        ## compute the gradient
        grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
        ## apply the step to the optimizers
        self.actor_optimizer.apply_gradients(zip(grads, self.actor_net.trainable_variables))
        self.state_rep_optimizer.apply_gradients(zip(grads, self.state_rep_net.trainable_variables))
        ## minimizing the loss
        
        self.soft_update(self.critic_net, self.target_critic_net, self.config.tau)
        self.soft_update(self.actor_net, self.target_actor_net, self.config.tau)
        
        return critic_loss.item(), actor_loss.item(), critic_param_norm
        
        
    
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_weights, local_weights in zip(reward_function_copy.layers, reward_function.layers):
            temp_w = local_weights.get_weights()[0] * tau + (1.0 - tau) * target_weights.get_weights()[0]
            target_weights.set_weights([temp_w])
    
    @tf.function    
    def compute_prioritized_dqn_loss(self,
                                     state_batch,
                                     action_batch,
                                     reward_batch,
                                     next_state_batch,
                                     weights):
        '''
        :param state_batch: (tensor) shape = (batch_size x state_dims),
                The batched tensor of states collected during
                training (i.e. s)
        :param action_batch: (LongTensor) shape = (batch_size,)
                The actions that you actually took at each step (i.e. a)
        :param reward_batch: (tensor) shape = (batch_size,)
                The rewards that you actually got at each step (i.e. r)
        :param next_state_batch: (tensor) shape = (batch_size x state_dims),
                The batched tensor of next states collected during
                training (i.e. s')
        :param weights: (tensor) shape = (batch_size,)
                Weights for each batch item w.r.t. prioritized experience replay buffer
        :return: loss: (torch tensor) shape = (1),
                 new_priorities: (numpy array) shape = (batch_size,)
        '''
        ## create batches
        next_action = self.target_actor_net(next_state_batch, training=False)
        q_target = self.target_critic_net(next_state_batch, next_action, training=False)
        
        ## logits
        logits = reward_batch + self.config.gamma * q_target
        
        ## get q values from the current state
        q_vals = self.critic_net(state.batch, action_batch)
        
        ## calculate loss
        loss = y - q_vals
        ## because loss is tensor shape
        ## we can extract the numpy value
        loss = np.power(loss.numpy().flatten(), 2)
        weights_ten = tf.stop_grad(tf.convert_to_tensor(weights))
        loss = loss * weights_ten
        
        ## calculate new priorities
        new_priorities = tf.stop_grad(loss + 1e-5)
        loss = loss.mean()
        
        return loss, new_priorities

In [61]:
ranking_scores = item_embeddings @ tf.reshape(action, (action.shape[1], 1))

In [68]:
tf.reshape(ranking_scores, (ranking_scores.shape[0],))[0]

<tf.Tensor: shape=(), dtype=float32, numpy=0.009489409>

In [70]:
ranking_scores = tf.reshape(ranking_scores, (ranking_scores.shape[0],))

In [71]:
ranking_scores

<tf.Tensor: shape=(1682,), dtype=float32, numpy=
array([ 0.00948941,  0.04992265, -0.03665902, ...,  0.19314665,
       -0.01089569, -0.21394743], dtype=float32)>

In [73]:
rec_idx = tf.math.argmax(ranking_scores)

In [54]:
actor_net = Actor(300, 100)

In [57]:
action = actor_net(state)

In [59]:
action.shape

TensorShape([1, 100])

In [56]:
state = tf.convert_to_tensor(np.random.rand(300, ))

In [41]:
item_embeddings @ 

<tf.Tensor: shape=(1682, 100), dtype=float32, numpy=
array([[ 0.02927838, -0.01858752, -0.01971002, ...,  0.01666964,
         0.01175865,  0.01821153],
       [ 0.04730224, -0.01002619, -0.04633345, ..., -0.05041706,
        -0.0197012 , -0.05130218],
       [-0.01143864, -0.01942264, -0.02575671, ...,  0.0525948 ,
        -0.00352616,  0.01375635],
       ...,
       [ 0.0363218 , -0.00957389,  0.01001419, ...,  0.00554959,
         0.0330762 ,  0.05551627],
       [ 0.11699422,  0.04615022, -0.0305032 , ..., -0.00461498,
         0.0034758 , -0.06877374],
       [ 0.09200613,  0.00553454,  0.02183542, ...,  0.04724813,
        -0.09510388,  0.11581472]], dtype=float32)>

In [80]:
class config():
    ## hyperparameters
    ## setting the batch_size
    batch_size = 64
    gamma = 0.9
    replay_buffer_size = 100000
    history_buffer_size = 5
    learning_start = 32
    learning_freq = 1
    ## learning rate for each model networks
    lr_state_rep = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    
    eps_start = 1
    eps = 0.1
    eps_steps = 10000
    eps_eval = 0.1
    episode_length = 10
    
    tau = 0.01 # inital 0.001
    beta = 0.4
    prob_alpha = 0.3
    
    max_timesteps_train = 260000
    max_epochs_offline = 500
    max_timesteps_online = 20000
    embedding_feature_size = 100
    
    train_ratio = 0.8
    weight_decay = 0.01
    clip_val = 1.0
    log_freq = 100
    saving_freq = 1000
    zero_reward = False
    
## First importing the data
users = pickle.load(open('Dataset/user_id_to_num_mov.pkl', 'rb'))
items = pickle.load(open('Dataset/movie_id_to_num_mov.pkl', 'rb'))
data = np.load('Dataset/data.npy')

## hold the length of the data
n_users = len(users)
n_items = len(items)

## don't forget to normalize the data first
data[:, 2] = 0.5 * (data[:, 2] - 3)

## split and shuffle the data
np.random.shuffle(data)
## split the data
## ratio should be 0.8
train_data = tf.convert_to_tensor(data[:int(config.train_ratio * data.shape[0])])
test_data = tf.convert_to_tensor(data[int(config.train_ratio * data.shape[0]):])
print("Train Data:{}, Test Data:{}".format(np.shape(train_data), np.shape(test_data)))

## hold the PMF model
## get the user and item embeddings
reward_function = PMF(n_users, n_items, config.embedding_feature_size)
## need to flow some data to build the model
reward_function(1, 1)
## loading the whole layer weights
reward_function.load_weights('trained/adam/pmf_150_adam')
## freeze the model, because it will be used for inference
reward_function.trainable = False

## take the embedding layers weight
## and split the user and item weights
user_embeddings = tf.convert_to_tensor(reward_function.user_embedding.get_weights()[0])
item_embeddings = tf.convert_to_tensor(reward_function.item_embedding.get_weights()[0])
## output
print("user embedding has shape {} and item embedding has shape {}"
      .format(np.shape(user_embeddings[0]), np.shape(item_embeddings[0])))

## hold the model in the variable
## so it can be tracked
state_rep_function = DRRAveStateRepresentation
actor_function = Actor
critic_function = Critic

## initialize DRRTrain Class
trainer = DRRTrainer(config,
                     actor_function,
                     critic_function,
                     state_rep_function,
                     reward_function,
                     users,
                     items,
                     train_data,
                     test_data,
                     user_embeddings,
                     item_embeddings)

print("Start Training")
trainer.learn()

Train Data:(80000, 3), Test Data:(20000, 3)
user embedding has shape (100,) and item embedding has shape (100,)
Actor-Critic model has successfully instantiated
DRR Instantiazed
Start Training
NPRM and History Buffer Initialized


ValueError: Tensor conversion requested dtype float64 for Tensor with dtype float32: <tf.Tensor: shape=(), dtype=float32, numpy=1645.0>

In [5]:
reward_function_copy = PMF(n_users, n_items, config.embedding_feature_size)
reward_function_copy(1, 1)

<tf.Tensor: shape=(), dtype=float32, numpy=1.7724931>

In [23]:
tau = 0.01

In [None]:
a = tf.convert_to_tensor([[1, 2, 3], [4, 5, 6]])

In [25]:
for target_weights, local_weights in zip(reward_function_copy.layers, reward_function.layers):
    temp_w = local_weights.get_weights()[0] * tau + (1.0 - tau) * target_weights.get_weights()[0]
    target_weights.set_weights([temp_w])

In [21]:
reward_function_copy.layers[0].set_weights(reward_function.layers[0].get_weights())
print([reward_function_copy.layers[0].get_weights()[0]])

[array([[-0.02264013, -0.00803596, -0.00766289, ...,  0.00541727,
         0.01059815, -0.01396531],
       [ 0.00427928,  0.00162332,  0.05373245, ...,  0.03625406,
        -0.07006924,  0.01003331],
       [ 0.00060505, -0.0082522 ,  0.0349106 , ..., -0.01670074,
        -0.00933639,  0.00602781],
       ...,
       [-0.00582577, -0.04501645, -0.0346372 , ..., -0.0845314 ,
        -0.04627491,  0.05608604],
       [-0.04641426,  0.01256191,  0.01490307, ..., -0.00078481,
         0.03249994, -0.03189057],
       [ 0.01732369, -0.01391567, -0.00765261, ...,  0.0080509 ,
        -0.02056003,  0.0187523 ]], dtype=float32)]


In [14]:
temp_w = reward_function.layers[0].get_weights()[0] * 0.1
temp_w

array([[-2.2640128e-03, -8.0359605e-04, -7.6628890e-04, ...,
         5.4172706e-04,  1.0598153e-03, -1.3965314e-03],
       [ 4.2792820e-04,  1.6233191e-04,  5.3732456e-03, ...,
         3.6254060e-03, -7.0069241e-03,  1.0033314e-03],
       [ 6.0504572e-05, -8.2521979e-04,  3.4910606e-03, ...,
        -1.6700743e-03, -9.3363895e-04,  6.0278119e-04],
       ...,
       [-5.8257702e-04, -4.5016450e-03, -3.4637197e-03, ...,
        -8.4531410e-03, -4.6274913e-03,  5.6086038e-03],
       [-4.6414263e-03,  1.2561911e-03,  1.4903073e-03, ...,
        -7.8481113e-05,  3.2499942e-03, -3.1890569e-03],
       [ 1.7323690e-03, -1.3915672e-03, -7.6526118e-04, ...,
         8.0509018e-04, -2.0560033e-03,  1.8752298e-03]], dtype=float32)

In [12]:
weight = []

In [15]:
weight.append(temp_w)

In [16]:
weight

[array([[-2.2640128e-03, -8.0359605e-04, -7.6628890e-04, ...,
          5.4172706e-04,  1.0598153e-03, -1.3965314e-03],
        [ 4.2792820e-04,  1.6233191e-04,  5.3732456e-03, ...,
          3.6254060e-03, -7.0069241e-03,  1.0033314e-03],
        [ 6.0504572e-05, -8.2521979e-04,  3.4910606e-03, ...,
         -1.6700743e-03, -9.3363895e-04,  6.0278119e-04],
        ...,
        [-5.8257702e-04, -4.5016450e-03, -3.4637197e-03, ...,
         -8.4531410e-03, -4.6274913e-03,  5.6086038e-03],
        [-4.6414263e-03,  1.2561911e-03,  1.4903073e-03, ...,
         -7.8481113e-05,  3.2499942e-03, -3.1890569e-03],
        [ 1.7323690e-03, -1.3915672e-03, -7.6526118e-04, ...,
          8.0509018e-04, -2.0560033e-03,  1.8752298e-03]], dtype=float32)]

In [18]:
reward_function_copy.layers[0].set_weights(weight)

In [19]:
reward_function_copy.layers[0].get_weights()

[array([[-2.2640128e-03, -8.0359605e-04, -7.6628890e-04, ...,
          5.4172706e-04,  1.0598153e-03, -1.3965314e-03],
        [ 4.2792820e-04,  1.6233191e-04,  5.3732456e-03, ...,
          3.6254060e-03, -7.0069241e-03,  1.0033314e-03],
        [ 6.0504572e-05, -8.2521979e-04,  3.4910606e-03, ...,
         -1.6700743e-03, -9.3363895e-04,  6.0278119e-04],
        ...,
        [-5.8257702e-04, -4.5016450e-03, -3.4637197e-03, ...,
         -8.4531410e-03, -4.6274913e-03,  5.6086038e-03],
        [-4.6414263e-03,  1.2561911e-03,  1.4903073e-03, ...,
         -7.8481113e-05,  3.2499942e-03, -3.1890569e-03],
        [ 1.7323690e-03, -1.3915672e-03, -7.6526118e-04, ...,
          8.0509018e-04, -2.0560033e-03,  1.8752298e-03]], dtype=float32)]

In [None]:
for target_layers, layers in zip(reward_function_copy.layers, reward_function.layers):
    target_layers.set_weights(layers.get_weights())

In [None]:
for target_layers in reward_function_copy.layers:
    print(target_layers.get_weights())

In [None]:
np.power(a.numpy().flatten(), 2)

In [None]:
tf.reshape(tf.concat([item_1, item_2, item_3], 0), (3, -1))

In [None]:
user_e = user_embeddings[0]