### Environment Check

In [1]:
import tensorflow as tf
import numpy as np
import sys
import pickle

from model import PMF, DRRAveStateRepresentation, Actor, Critic

from utils.prioritized_replay_buffer import NaivePrioritizedReplayMemory, Transition
from utils.history_buffer import HistoryBuffer

In [7]:
class config:
    """Hyperparameters and run settings read by the other notebook cells."""

    # --- batching and buffers ---
    batch_size = 64
    replay_buffer_size = 100000
    history_buffer_size = 5  # size handed to HistoryBuffer and the state-representation net
    prob_alpha = 0.3         # forwarded to the prioritized replay memory
    beta = 0.4

    # --- learning schedule ---
    gamma = 0.9
    learning_start = 250
    learning_freq = 1
    tau = 0.01  # initially 0.001

    # --- learning rate of each network ---
    lr_state_rep = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    weight_decay = 0.01
    clip_val = 1.0

    # --- exploration: eps moves from eps_start towards eps over eps_steps steps ---
    eps_start = 1
    eps = 0.1
    eps_steps = 10000
    eps_eval = 0.1
    episode_length = 10

    # --- run length and model size ---
    max_timesteps_train = 260000
    max_epochs_offline = 500
    max_timesteps_online = 20000
    embedding_feature_size = 100

    # --- data split, logging and checkpoints ---
    train_ratio = 0.8
    log_freq = 100
    saving_freq = 1000
    zero_reward = False

In [37]:
## Initialise the data set, the pretrained reward model and the shape constants

## column positions inside each row of `data`:
## user index, item index, and the rating that is turned into the reward
u_id = 0
i_id = 1
r_id = 2

## NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
## The files are opened with `with` so the handles are closed again.
with open('Dataset/user_id_to_num_mov.pkl', 'rb') as users_file:
    users = pickle.load(users_file)
with open('Dataset/movie_id_to_num_mov.pkl', 'rb') as items_file:
    items = pickle.load(items_file)
data = np.load('Dataset/data.npy')

## rescale the rating column in place: subtract 3, then halve
## (a raw value of 5 becomes 1.0, 3 becomes 0.0 and 1 becomes -1.0)
data[:, r_id] = 0.5 * (data[:, r_id] - 3)

## the leading `train_ratio` fraction of the rows is the training split
train_data = tf.convert_to_tensor(data[:int(config.train_ratio * data.shape[0])], dtype='float32')

n_items = len(items)
n_users = len(users)

## the PMF model is called once before load_weights
## (presumably so its variables exist when the checkpoint is restored)
reward_function = PMF(n_users, n_items, config.embedding_feature_size)
reward_function(1, 1)
reward_function.load_weights('trained/adam/pmf_150_adam')

## embedding tables taken from the PMF model as constant tensors
user_embeddings = tf.convert_to_tensor(reward_function.user_embedding.get_weights()[0])
item_embeddings = tf.convert_to_tensor(reward_function.item_embedding.get_weights()[0])

item_features = item_embeddings.shape[1] ## 100
user_features = user_embeddings.shape[1] ## 100
state_shape = 3 * item_features  # dimensionality 300
action_shape = item_features #100
critic_output_shape = 1 #1

In [41]:
## reward column of the training split (already rescaled by the loading cell)
train_data[:, r_id]

<tf.Tensor: shape=(80000,), dtype=float32, numpy=array([ 0. ,  0. , -1. , ..., -0.5, -1. ,  0. ], dtype=float32)>

In [9]:
## Build the state-representation, actor and critic networks, their targets and optimizers.
## `reward_function`, `user_embeddings` and `item_embeddings` are created by the
## data-loading cell (which also defines the shape constants used below), so they
## are not loaded a second time here.
state_rep_net = DRRAveStateRepresentation(config.history_buffer_size, item_features, user_features)

## instantiate actor and target actor networks
actor_net = Actor(state_shape, action_shape)
target_actor_net = Actor(state_shape, action_shape)

## instantiate critic and target critic networks
critic_net = Critic(action_shape, state_shape, critic_output_shape)
target_critic_net = Critic(action_shape, state_shape, critic_output_shape)

## dummy inputs used only to build the models; their sizes follow the
## configured shapes instead of hard-coded 5 / 100 / 300
flow_item = tf.convert_to_tensor(np.random.rand(config.history_buffer_size, item_features), dtype='float32')
flow_state = tf.convert_to_tensor(np.random.rand(1, state_shape), dtype='float32')
flow_action = tf.convert_to_tensor(np.random.rand(1, action_shape), dtype='float32')

## flowing the data into the models to build them
state_rep_net(user_embeddings[0], flow_item)
actor_net(flow_state)
target_actor_net(flow_state)
critic_net(flow_state, flow_action)
target_critic_net(flow_state, flow_action)

## start every target network as an exact copy of its online network;
## otherwise the two begin from unrelated random initialisations
## (assumes Actor/Critic expose the Keras get_weights/set_weights API - TODO confirm)
target_actor_net.set_weights(actor_net.get_weights())
target_critic_net.set_weights(critic_net.get_weights())
print("Actor-Critic model has successfully instantiated")

## one Adam optimizer per trainable network, each with its own learning rate
state_rep_optimizer = tf.keras.optimizers.Adam(learning_rate=config.lr_state_rep)
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=config.lr_actor)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=config.lr_critic)

Actor-Critic model has successfully instantiated


In [10]:
## prioritized replay memory plus the fixed-size buffer of recent items
replay_buffer = NaivePrioritizedReplayMemory(
    config.replay_buffer_size,
    prob_alpha=config.prob_alpha,
)
history_buffer = HistoryBuffer(config.history_buffer_size)

## global step and epoch counters
timesteps, epoch = 0, 0

## exploration rate: `eps_slope` is the amount subtracted from `eps` per step
## while it is still above config.eps
eps_slope = abs(config.eps_start - config.eps) / config.eps_steps
eps = config.eps_start

## loss histories
actor_losses, critic_losses = [], []

## per-episode reward bookkeeping
epi_rewards, epi_avg_rewards, e_arr = [], [], []

## user indices (the values of the `users` dictionary), visited in random order
user_idxs = np.array(list(users.values()))
np.random.shuffle(user_idxs)

In [15]:
## Dry run of ONE step of the episode loop for a single fixed user (e = 0).
## The cell mutates history_buffer, replay_buffer, eps, t and timesteps,
## so re-running it changes the result.
e = 0

## rows of train_data whose user column equals e, then the subset with a positive reward
user_reviews = train_data[train_data[:, u_id] == e] # 53
pos_user_reviews = user_reviews[user_reviews[:, r_id] > 0] #25

## take the item embeddings and this user's embedding
candidate_items = tf.identity(tf.stop_gradient(item_embeddings))
user_emb = user_embeddings[e]

## original author's note: still not sure why this variable exists
## (below it collects recommended item indices and is used to mask ranking_scores)
ignored_items = []

## seed the history buffer with the first history_buffer_size (5) positively rated items
for i in range(config.history_buffer_size):
    emb = candidate_items[int(pos_user_reviews[i, i_id].numpy())]
    history_buffer.push(tf.identity(tf.stop_gradient(emb)))

## initialize rewards list
rewards = []

## starting item index
t = 0

## declaring the needed variable
state = None
action = None
reward = None
next_state = None

## the `while` loop would start here; this cell runs its body exactly once
    

## decrease eps by eps_slope until it reaches config.eps
if eps > config.eps:
    eps -= eps_slope
else:
    eps = config.eps

## state from the state-representation net, reshaped to a single-row batch for the actor
state = state_rep_net(user_emb, tf.stack(history_buffer.to_list()))
state = tf.reshape(tf.stop_gradient(state), [1, state.shape[0]])
action = actor_net(tf.stop_gradient(state), training=False)

## score every candidate item against the action vector
ranking_scores = candidate_items @ tf.transpose(action)

## flatten the result into a 1-D NumPy array so it can be modified in place
ranking_scores = tf.reshape(ranking_scores, (ranking_scores.shape[0],)).numpy()

## ignored_items was reset to [] above and is only appended to further down,
## so in this cell the else branch is always taken
if len(ignored_items) > 0:
    rec_items = tf.stack(ignored_items)
else:
    rec_items = []

## already recommended items get -inf so argmax cannot select them
ranking_scores[rec_items] = -float("inf")

## index and embedding of the best-scoring item
rec_item_idx = tf.math.argmax(ranking_scores).numpy()
rec_item_emb = candidate_items[rec_item_idx]

## fetch the reward: use the stored value if this user has a row for the item,
## otherwise use zero or the PMF prediction depending on config.zero_reward
if rec_item_idx in user_reviews[:, i_id]:
    user_rec_item_idx = np.where(user_reviews[:, i_id] == float(rec_item_idx))[0][0]
    reward = user_reviews[user_rec_item_idx, r_id]
else:
    if config.zero_reward:
        reward = tf.convert_to_tensor(0)
    else:
        reward = reward_function(float(e), float(rec_item_idx))

rewards.append(reward.numpy())

## a positive reward pushes the item into the history and recomputes the state;
## otherwise the next state is the current state
if reward > 0:
    history_buffer.push(tf.identity(tf.stop_gradient(rec_item_emb)))
    next_state = state_rep_net(user_emb, tf.stack(history_buffer.to_list()), training=False)
else:
    next_state = tf.stop_gradient(state)

## remember the recommended item and store the transition
ignored_items.append(tf.convert_to_tensor(rec_item_idx))
replay_buffer.push(state, action, next_state, reward)

## placeholder for the learning step: only prints when the update conditions hold
if (timesteps > config.learning_start) and (len(replay_buffer) >= config.batch_size) and (timesteps % config.learning_freq == 0):
    print("OK")

t += 1
timesteps += 1

In [35]:
## reward-column value of the fourth row of user_reviews
user_reviews[3, r_id]

<tf.Tensor: shape=(), dtype=float64, numpy=0.5>

In [34]:
## position of the first row of user_reviews whose item column equals 159
user_rec_item_idx = np.where(user_reviews[:, i_id] == 159)[0][0]
user_rec_item_idx

3

In [16]:
## Recompute the ranking scores for user 0 from the current history buffer.
user_emb = user_embeddings[0]

## state representation, detached and reshaped into one batch row
state = state_rep_net(user_emb, tf.stack(history_buffer.to_list()))
state = tf.reshape(tf.stop_gradient(state), [1, -1])
action = actor_net(tf.stop_gradient(state), training=False)

## one score per candidate item: item embeddings times the transposed action
ranking_scores = tf.linalg.matmul(candidate_items, tf.transpose(action))

## collapse the single column into a 1-D NumPy array
ranking_scores = tf.reshape(ranking_scores, [-1]).numpy()

In [21]:
## sample item indices to mask, converted to a NumPy array for indexing
ignored_items = [1, 2, 3, 4]
rec_items = tf.stack(ignored_items).numpy()

In [22]:
## overwrite the scores at the rec_items positions with -inf, in place
ranking_scores[rec_items] = -float("inf")

In [24]:
## display the scores after masking
ranking_scores

array([1.0846074e-03,          -inf,          -inf, ..., 2.7740351e-03,
       3.3080969e-03, 5.8118429e-05], dtype=float32)

In [None]:
## Roll out one episode per user and store every transition in the replay buffer.
##
## Fixes relative to the previous version of this cell:
##   * the data columns are addressed through u_id / i_id / r_id (the old names
##     `u` and `r` were undefined, and `i` was the history-loop counter)
##   * `t` and `timesteps` are advanced at the end of every step, so the
##     episode terminates after config.episode_length steps
##   * items that were already recommended are masked out before the argmax
##   * the "has this user rated the item" test is done on a NumPy array
## NOTE(review): this cell only collects experience; no network update is
## performed here yet.
for idx, e in enumerate(user_idxs):
    ## stop once the training budget (counted after learning_start) is used up
    if timesteps - config.learning_start > config.max_timesteps_train:
        break

    ## all rows of this user, and the subset with a positive reward
    user_reviews = train_data[train_data[:, u_id] == e]
    pos_user_reviews = user_reviews[user_reviews[:, r_id] > 0]

    ## skip users with fewer positive rows than the history buffer holds
    ## (config.history_buffer_size, 5 with the current config)
    if pos_user_reviews.shape[0] < config.history_buffer_size:
        continue

    candidate_items = tf.identity(tf.stop_gradient(item_embeddings))

    ## extracting user embedding tensors
    user_emb = user_embeddings[e]

    ## item column of this user's rows as a NumPy array, for the reward lookup
    reviewed_item_ids = user_reviews[:, i_id].numpy()

    ## indices (plain ints) of the items recommended so far in this episode
    ignored_items = []

    ## fill the history buffer with the first positively rated item embeddings
    for i in range(config.history_buffer_size):
        emb = candidate_items[tf.cast(pos_user_reviews[i, i_id], dtype='int32')]
        history_buffer.push(tf.identity(tf.stop_gradient(emb)))

    ## initialize rewards list
    rewards = []

    ## step counter inside the episode
    t = 0

    ## declaring the needed variable
    state = None
    action = None
    reward = None
    next_state = None

    while t < config.episode_length:
        ## decrease eps by eps_slope until it reaches config.eps
        if eps > config.eps:
            eps -= eps_slope
        else:
            eps = config.eps

        ## current state from the state-representation network
        state = state_rep_net(user_emb, tf.stack(history_buffer.to_list()))

        ## epsilon-greedy: random action with probability eps, otherwise the actor's output
        if np.random.uniform(0, 1) < eps:
            action = tf.convert_to_tensor(np.random.rand(1, action_shape), dtype='float32')
        else:
            action = actor_net(tf.reshape(tf.stop_gradient(state), [1, state.shape[0]]), training=False)

        ## one score per candidate item, flattened into a NumPy array so that
        ## individual entries can be overwritten
        ranking_scores = candidate_items @ tf.reshape(action, (action.shape[1], 1))
        ranking_scores = tf.reshape(ranking_scores, (ranking_scores.shape[0],)).numpy()

        ## already recommended items can never win the argmax
        ## (an empty index array makes this assignment a no-op)
        rec_items = np.asarray(ignored_items, dtype='int64')
        ranking_scores[rec_items] = -float("inf")

        ## best remaining item: its index (as tensor and as int) and its embedding
        rec_item_idx = tf.math.argmax(ranking_scores)
        rec_item_num = int(rec_item_idx.numpy())
        rec_item_emb = candidate_items[rec_item_idx]

        ## reward: the stored value when the user has a row for this item,
        ## otherwise zero or the PMF prediction depending on config.zero_reward
        matching_rows = np.where(reviewed_item_ids == rec_item_num)[0]
        if matching_rows.size > 0:
            user_rec_item_idx = matching_rows[0]
            reward = user_reviews[user_rec_item_idx, r_id]
        else:
            if config.zero_reward:
                reward = tf.convert_to_tensor(0)
            else:
                reward = reward_function(tf.convert_to_tensor(e), rec_item_idx)

        ## track the episode rewards
        rewards.append(reward.numpy())

        ## a positive reward pushes the item into the history and recomputes the state
        if reward > 0:
            history_buffer.push(tf.stop_gradient(rec_item_emb))

            next_state = state_rep_net(user_emb, tf.stack(history_buffer.to_list()), training=False)
        else:
            ## keep the history buffer the same
            ## the next state is the current state
            next_state = tf.stop_gradient(state)

        ## remove the item from future recommendations in this episode
        ignored_items.append(rec_item_num)

        ## add the (state, action, next_state, reward) transition
        ## to the experience replay
        replay_buffer.push(state, action, next_state, reward)

        ## advance the episode step and the global step counter
        t += 1
        timesteps += 1

### Critic check (OK)

In [None]:
## Toy dimensions for the critic check.
## The action width gets its own name so that the global `action_shape`
## used by the training cells is not overwritten.
critic_check_action_dim = 3
in_features = 9
out_features = 1
combo_features = in_features + critic_check_action_dim

## four dense layers: state -> 9, then two layers of width 12 (state + action), then 1 output
linear_fn_1 = tf.keras.layers.Dense(in_features, activation='relu')
linear_fn_2 = tf.keras.layers.Dense(combo_features, activation='relu')
linear_fn_3 = tf.keras.layers.Dense(combo_features, activation='relu')
linear_fn_4 = tf.keras.layers.Dense(out_features, activation=None)

In [None]:
## two-row sample batch: actions of width 3 and states of width 9 (both state rows are identical)
## NOTE(review): this rebinds the global name `action` used by the rollout cells
action = tf.convert_to_tensor(
    [
        [0.9004, 0.8004, 0.7004],
        [0.9004, 0.6004, 0.7004],
    ],
    dtype=tf.float32,
)
input_st = tf.convert_to_tensor(
    [
        [1.0000, 3.0000, 5.0000, 0.5000, 2.1000, 4.5000, 0.5000, 0.7000, 0.9000],
        [1.0000, 3.0000, 5.0000, 0.5000, 2.1000, 4.5000, 0.5000, 0.7000, 0.9000],
    ],
    dtype=tf.float32,
)

In [None]:
## display the shape of the sample state batch
input_st.shape

In [None]:
## action and state placed side by side along the feature axis (width 3 + 9 = 12)
combbo = tf.concat([action, input_st], axis=1)

In [None]:
## build the model
## each layer is called once on an input of the width it receives in the forward
## pass (9 for the first layer, 12 for the others), presumably so that its
## variables exist before set_weights is called
linear_fn_1(input_st)
linear_fn_2(combbo)
linear_fn_3(combbo)
linear_fn_4(combbo)

In [None]:
## constant parameters for the four check layers:
## every kernel entry is 0.1 and every bias entry is 0
weight1, bias1 = np.ones((9, 9)) * 0.1, np.zeros(9)
weight2, bias2 = np.ones((12, 12)) * 0.1, np.zeros(12)
weight3, bias3 = np.ones((12, 12)) * 0.1, np.zeros(12)
weight4, bias4 = np.ones((12, 1)) * 0.1, np.zeros(1)

In [None]:
## load the constant kernel/bias pairs into the four layers
linear_fn_1.set_weights([weight1, bias1])
linear_fn_2.set_weights([weight2, bias2])
linear_fn_3.set_weights([weight3, bias3])
linear_fn_4.set_weights([weight4, bias4])

In [None]:
## forward pass of the check critic: the state goes through the first layer,
## the action is concatenated in front of the result, and the remaining three
## layers are applied in order
output = linear_fn_4(
    linear_fn_3(
        linear_fn_2(
            tf.concat([action, linear_fn_1(input_st)], 1)
        )
    )
)
output

### Actor check (OK)

In [None]:
## toy dimensions for the actor check
## NOTE(review): this rebinds in_features, out_features and linear_fn_1..3
## from the critic check above
in_features, out_features = 9, 3

## three dense layers (relu, relu, tanh), each with its own L2(0.1) kernel regularizer
linear_fn_1 = tf.keras.layers.Dense(
    in_features,
    activation='relu',
    kernel_regularizer=tf.keras.regularizers.L2(0.1),
)
linear_fn_2 = tf.keras.layers.Dense(
    in_features,
    activation='relu',
    kernel_regularizer=tf.keras.regularizers.L2(0.1),
)
linear_fn_3 = tf.keras.layers.Dense(
    out_features,
    activation='tanh',
    kernel_regularizer=tf.keras.regularizers.L2(0.1),
)

In [None]:
## constant parameters for the actor check: kernels filled with 0.1, biases with 0
weights, bias = np.ones((9, 9)) * 0.1, np.zeros(9)
weights_3, bias_3 = np.ones((9, 3)) * 0.1, np.zeros(3)

In [None]:
## single sample state row of width 9
input_st = tf.convert_to_tensor(
    [[1.0000, 3.0000, 5.0000, 0.5000, 2.1000, 4.5000, 0.5000, 0.7000, 0.9000]],
    dtype=tf.float32,
)

In [None]:
## display the shape of the sample state row
input_st.shape

In [None]:
## build the model
## every layer is called once on the 9-wide sample input, presumably so that
## its variables exist before set_weights is called
linear_fn_1(input_st)
linear_fn_2(input_st)
linear_fn_3(input_st)

In [None]:
## load the constant kernel/bias pairs; the first two layers share the same values
linear_fn_1.set_weights([weights, bias])
linear_fn_2.set_weights([weights, bias])
linear_fn_3.set_weights([weights_3, bias_3])

In [None]:
## forward pass of the check actor: the three layers applied one after another
output = linear_fn_3(linear_fn_2(linear_fn_1(input_st)))
print(output)

### DRR-ave state representation check (OK)

In [None]:
## toy inputs: one user vector of width 3, two item rows of width 3,
## and one attention weight per item row
## NOTE(review): `users` and `items` rebind the ID dictionaries loaded at the
## top of the notebook; re-run the data-loading cell before training again
users = tf.convert_to_tensor([1, 3, 5], dtype=tf.float32)
items = tf.convert_to_tensor(
    [
        [1, 2, 3],
        [4, 5, 6],
    ],
    dtype=tf.float32,
)
attention_weights = tf.Variable(
    tf.convert_to_tensor([[0.1], [0.1]], dtype=tf.float32)
)
attention_weights

In [None]:
## attention-weighted sum of the item rows, flattened to a vector
right = tf.linalg.matmul(tf.transpose(items), attention_weights)
right = tf.reshape(right, [-1])

## element-wise product of the user vector and the weighted item vector
middle = users * right

## final vector: user, product and weighted items joined end to end
output = tf.concat([users, middle, right], axis=0)
output

### DQN loss check (OK)

In [None]:
## toy targets, toy Q-values and uniform importance weights for four samples
## NOTE(review): `weights` rebinds the kernel array of the actor check
y = tf.convert_to_tensor([1, 2, 3, 4], dtype=tf.float32)
q_vals = tf.convert_to_tensor([2, 2, 2, 2], dtype=tf.float32)
weights = [1.0] * 4

In [None]:
## importance weights as a constant tensor (no gradient flows through them)
weights_ten = tf.stop_gradient(weights)

## per-sample loss: squared difference between target and Q-value, times its weight
loss = tf.reshape(tf.pow(tf.convert_to_tensor(y - q_vals), 2), (4,)) * weights_ten

## new priorities: the detached per-sample losses plus a small constant
new_priorities = tf.stop_gradient(loss).numpy() + 1e-5

## scalar loss: mean over the four samples
loss = tf.convert_to_tensor(tf.math.reduce_mean(loss))
print(new_priorities)
print(loss)

### Soft update check

In [None]:
def soft_update(local_model, target_model, tau):
    """Soft update model parameters, layer by layer.

    θ_target = τ*θ_local + (1 - τ)*θ_target

    Both models are walked through their ``layers`` in order; for every pair
    of layers the weight arrays returned by ``get_weights()`` are blended and
    written back to the target layer with ``set_weights()``. The local model
    is only read.

    Params
    ======
        local_model: model which the weights will be copied from
        target_model: model which weights will be copied to (modified in place)
        tau (float): interpolation parameter; 0 leaves the target unchanged,
            1 copies the local weights

    Raises
    ======
        ValueError: if the models have a different number of layers, or a
            pair of layers has a different number of weight arrays. Without
            this check the extra layers/arrays would be skipped silently and
            the target would be left only partially updated.
    """
    local_layers = list(local_model.layers)
    target_layers = list(target_model.layers)
    if len(local_layers) != len(target_layers):
        raise ValueError(
            "soft_update: layer count mismatch (local has %d, target has %d)"
            % (len(local_layers), len(target_layers))
        )

    for position, (t_layer, layer) in enumerate(zip(target_layers, local_layers)):
        target_arrays = t_layer.get_weights()
        local_arrays = layer.get_weights()
        if len(target_arrays) != len(local_arrays):
            raise ValueError(
                "soft_update: layer %d has %d weight arrays in local but %d in target"
                % (position, len(local_arrays), len(target_arrays))
            )

        ## blend each weight array, then copy the blended list into the target layer
        blended = [
            weights * tau + (1.0 - tau) * t_weights
            for t_weights, weights in zip(target_arrays, local_arrays)
        ]
        t_layer.set_weights(blended)