In [1]:
import tensorflow as tf
import numpy as np
import sys
import pickle

from model import PMF, DRRAveStateRepresentation, Actor, Critic

from utils.prioritized_replay_buffer import NaivePrioritizedReplayMemory, Transition
from utils.history_buffer import HistoryBuffer
from utils.general import export_plot

In [2]:
class DRRTrainer(object):
    """Bundle the data, embeddings and networks used to train a DRR agent.

    The constructor stores its inputs, derives the state/action dimensions
    from the item embedding matrix and instantiates the state-representation,
    actor and critic networks (plus target copies of actor and critic).
    """

    def __init__(self,
                 config,
                 actor_function,
                 critic_function,
                 state_rep_function,
                 reward_function,
                 users,
                 items,
                 train_data,
                 test_data,
                 user_embeddings,
                 item_embeddings):
        """Store inputs, derive dimensions and build the networks.

        Args:
            config: object exposing the hyperparameters as attributes
                (``history_buffer_size`` is read here; more are read in
                ``learn``).
            actor_function: callable invoked as
                ``actor_function(state_shape, action_shape)``.
            critic_function: callable invoked as
                ``critic_function(action_shape, state_shape, output_shape)``.
            state_rep_function: callable invoked as
                ``state_rep_function(history_buffer_size, item_features,
                user_features)``.
            reward_function: stored as-is; not called in this class yet.
            users: mapping whose ``values()`` are the user indices that
                appear in column ``self.u`` of the data.
            items: stored as-is.
            train_data: 2-D array-like indexed with the columns
                ``self.u`` / ``self.i`` / ``self.r`` (user, item, rating).
            test_data: stored as-is.
            user_embeddings: sequence whose first element is the user
                weight matrix (rows = users, columns = features).
            item_embeddings: sequence whose first element is the item
                weight matrix (rows = items, columns = features).
        """
        ## importing reward function
        self.reward_function = reward_function
        ## importing training and testing data
        self.train_data = train_data
        self.test_data = test_data
        ## importing users and items
        self.users = users
        self.items = items
        ## importing user and item embeddings
        self.user_embeddings = user_embeddings
        self.item_embeddings = item_embeddings
        ## column indices into the data arrays:
        ## u for user, i for item, r for reward/rating
        self.u = 0
        self.i = 1
        self.r = 2

        ## dimensions
        ## element [0] of each embeddings sequence is the weight matrix,
        ## so the length of its first row is the feature size
        self.item_features = len(self.item_embeddings[0][0])
        self.user_features = len(self.user_embeddings[0][0])

        ## number of users and items = number of rows in the weight matrices
        self.n_items = len(self.item_embeddings[0])
        self.n_users = len(self.user_embeddings[0])

        ## the state is three times as wide as one item embedding,
        ## the action is exactly one item embedding wide
        self.state_shape = 3 * self.item_features
        self.action_shape = self.item_features

        ## the critic produces a single value
        self.critic_output_shape = 1
        self.config = config

        ## instantiate the state representation network
        self.state_rep_net = state_rep_function(self.config.history_buffer_size,
                                                self.item_features,
                                                self.user_features)

        ## instantiate actor and target actor networks
        self.actor_net = actor_function(self.state_shape, self.action_shape)
        self.target_actor_net = actor_function(self.state_shape, self.action_shape)

        ## instantiate critic and target critic networks
        self.critic_net = critic_function(self.action_shape,
                                          self.state_shape,
                                          self.critic_output_shape)

        self.target_critic_net = critic_function(self.action_shape,
                                                 self.state_shape,
                                                 self.critic_output_shape)
        print("Actor-Critic model has successfully instantiated")
        print("DRR Instantiazed")

    def learn(self):
        """Prepare buffers and trackers, then walk over the users.

        One user corresponds to one loop iteration. Users with fewer
        positive ratings than ``config.history_buffer_size`` are skipped.

        NOTE(review): only the per-user setup is implemented so far; the
        buffers and trackers created below are not used yet and the method
        returns ``None``.
        """
        # Initialize buffers
        print("NPRM and History Buffer Initialized")
        replay_buffer = NaivePrioritizedReplayMemory(self.config.replay_buffer_size,
                                                     prob_alpha=self.config.prob_alpha)

        history_buffer = HistoryBuffer(self.config.history_buffer_size)

        # Initialize trackers
        # initialize timesteps and epoch
        timesteps = 0
        epoch = 0
        ## per-step change of eps when moving from eps_start to eps
        ## over eps_steps steps
        eps_slope = abs(self.config.eps_start - self.config.eps)/self.config.eps_steps
        eps = self.config.eps_start
        ## these variables hold the losses over time
        actor_losses = []
        critic_losses = []
        ## these variables hold the episodic rewards
        epi_rewards = []
        epi_avg_rewards = []

        e_arr = []

        ## user indices taken from the users dictionary, in random order
        user_idxs = np.array(list(self.users.values()))
        np.random.shuffle(user_idxs)

        ## Boolean-mask row selection below needs NumPy semantics; taking an
        ## ndarray view once lets train_data be either a NumPy array or an
        ## eager tf.Tensor without touching self.train_data itself.
        train_data = np.asarray(self.train_data)

        ## loop over all users; e is the user index of this episode
        for idx, e in enumerate(user_idxs):
            ## stop once the timesteps taken after learning_start
            ## exceed the training budget
            if timesteps - self.config.learning_start > self.config.max_timesteps_train:
                break

            ## keep this user's rows, then only those with a positive rating
            user_reviews = train_data[train_data[:, self.u] == e]
            pos_user_reviews = user_reviews[user_reviews[:, self.r] > 0]

            ## skip users that cannot fill the history buffer:
            ## shape[0] is the number of positive-review rows
            if pos_user_reviews.shape[0] < self.config.history_buffer_size:
                continue

            ## the attribute set in __init__ is item_embeddings
            ## (the former items_embeddings raised AttributeError)
            candidate_items = self.item_embeddings

In [10]:
class config():
    """Hyperparameters and run settings, read directly as class attributes.

    The class is used as a plain namespace (never instantiated in this
    notebook). Only some of the values are consumed by the code visible in
    this notebook; the rest are presumably read by later training code.
    """

    ## --- buffers and batching ---
    batch_size = 64
    replay_buffer_size = 100000
    ## also the minimum number of positive reviews a user needs
    history_buffer_size = 5
    prob_alpha = 0.3
    beta = 0.4

    ## --- learning schedule ---
    gamma = 0.9
    learning_start = 32
    learning_freq = 1
    tau = 0.01  # initially 0.001

    ## --- learning rates ---
    lr_state_rep = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001

    ## --- regularisation ---
    weight_decay = 0.01
    clip_val = 1.0

    ## --- exploration: eps goes from eps_start to eps over eps_steps ---
    eps_start = 1
    eps = 0.1
    eps_steps = 10000
    eps_eval = 0.1
    episode_length = 10

    ## --- run limits ---
    max_timesteps_train = 260000
    max_epochs_offline = 500
    max_timesteps_online = 20000

    ## --- data and embeddings ---
    embedding_feature_size = 100
    train_ratio = 0.8
    zero_reward = False

    ## --- logging and checkpointing ---
    log_freq = 100
    saving_freq = 1000
    
## First importing the data
## NOTE: pickle.load can execute arbitrary code, so only load trusted files.
## Context managers make sure the file handles are closed again.
with open('Dataset/user_id_to_num_mov.pkl', 'rb') as users_file:
    users = pickle.load(users_file)
with open('Dataset/movie_id_to_num_mov.pkl', 'rb') as items_file:
    items = pickle.load(items_file)
data = np.load('Dataset/data.npy')

## hold the number of users and items
n_users = len(users)
n_items = len(items)

## normalize the rating column in place: r -> 0.5 * (r - 3)
## NOTE(review): this assumes data has a float dtype; with an integer
## dtype the halves would be truncated on assignment -- confirm.
data[:, 2] = 0.5 * (data[:, 2] - 3)

## shuffle the rows in place before splitting
## NOTE(review): no random seed is set, so the split differs between runs.
np.random.shuffle(data)
## split the data at train_ratio (0.8 in config); compute the index once
split_idx = int(config.train_ratio * data.shape[0])
train_data = tf.convert_to_tensor(data[:split_idx])
test_data = tf.convert_to_tensor(data[split_idx:])
print("Train Data:{}, Test Data:{}".format(np.shape(train_data), np.shape(test_data)))

## hold the PMF model
## its embedding layers provide the user and item embeddings
reward_function = PMF(n_users, n_items, config.embedding_feature_size)
## need to flow some data through the model to build it
reward_function(1, 1)
## loading the whole layer weights
reward_function.load_weights('trained/adam/pmf_150_adam')
## freeze the model, because it will be used for inference
reward_function.trainable = False

## take the embedding layers' weights
## get_weights() returns a list, so element [0] is the weight matrix
user_embeddings = reward_function.user_embedding.get_weights()
item_embeddings = reward_function.item_embedding.get_weights()
## output
print("user embedding has shape {} and item embedding has shape {}"
      .format(np.shape(user_embeddings[0]), np.shape(item_embeddings[0])))

## hold the model classes in variables
## so the trainer can instantiate them itself
state_rep_function = DRRAveStateRepresentation
actor_function = Actor
critic_function = Critic

## initialize DRRTrainer class
trainer = DRRTrainer(config,
                     actor_function,
                     critic_function,
                     state_rep_function,
                     reward_function,
                     users,
                     items,
                     train_data,
                     test_data,
                     user_embeddings,
                     item_embeddings)

Train Data:(80000, 3), Test Data:(20000, 3)
user embedding has shape (943, 100) and item embedding has shape (1682, 100)
Actor-Critic model has successfully instantiated
DRR Instantiazed
