In [1]:
import tensorflow as tf
import numpy as np
import sys
import pickle

from model import PMF, DRRAveStateRepresentation, Actor, Critic

from utils.prioritized_replay_buffer import NaivePrioritizedReplayMemory, Transition
from utils.history_buffer import HistoryBuffer
from utils.general import export_plot

In [81]:
class DRRTrainer(object):
    def __init__(self,
                 config,
                 actor_function,
                 critic_function,
                 state_rep_function,
                 reward_function,
                 users,
                 items,
                 train_data,
                 test_data,
                 user_embeddings,
                 item_embeddings):
        
        ## importing reward function
        self.reward_function = reward_function
        ## importing training and testing data
        self.train_data = train_data
        self.test_data = test_data
        ## importing users and items
        self.users = users
        self.items = items
        ## importing user and item embeddings
        self.user_embeddings = user_embeddings
        self.item_embeddings = item_embeddings
        ## declaring index identifier for dataset
        ## u for user, i for item, r for reward/rating
        self.u = 0
        self.i = 1
        self.r = 2
        
        ## dimensions
        ## self.item_embeddings already hold the weights array
        ## this should be 100
        self.item_features = self.item_embeddings.shape[1]
        self.user_features = self.user_embeddings.shape[1]
        
        ## number of user and items
        self.n_items = self.item_embeddings.shape[0]
        self.n_users = self.user_embeddings.shape[0]
        
        ## the shape of state space, action space
        ## this should be 300
        self.state_shape = 3 * self.item_features
        ## this should be 100
        self.action_shape = self.item_features
        
        self.critic_output_shape = 1
        self.config = config
        ## Data dimensions Extracted
        
        ## instantiate a drravestaterepresentation
        self.state_rep_net = state_rep_function(self.config.history_buffer_size,
                                                self.item_features,
                                                self.user_features)
        
        ## instantiate actor and target actor networks
        self.actor_net = actor_function(self.state_shape, self.action_shape)                                
        self.target_actor_net = actor_function(self.state_shape, self.action_shape)
        
        ## instantiate critic and target critics networks
        self.critic_net = critic_function(self.action_shape,
                                          self.state_shape,
                                          self.critic_output_shape)
        
        self.target_critic_net = critic_function(self.action_shape,
                                                 self.state_shape,
                                                 self.critic_output_shape)
        
        ## data flow for building the model
        flow_item = tf.convert_to_tensor(np.random.rand(5, 100), dtype='float32')
        flow_state = tf.convert_to_tensor(np.random.rand(1, 300), dtype='float32')
        flow_action = tf.convert_to_tensor(np.random.rand(1, 100), dtype='float32')
        
        ## flowing the data into the model to build the model
        self.state_rep_net(user_embeddings[0], flow_item)
        self.actor_net(flow_state)
        self.target_actor_net(flow_state)
        self.critic_net(flow_state, flow_action)
        self.target_critic_net(flow_state, flow_action)
        print("Actor-Critic model has successfully instantiated")
        
        self.state_rep_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.config.lr_state_rep)
        
        self.actor_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.config.lr_actor)
        
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.config.lr_critic)
        
        print("DRR Instantiazed")
        
    def learn(self):
        # Initialize buffers
        print("NPRM and History Buffer Initialized")
        replay_buffer = NaivePrioritizedReplayMemory(self.config.replay_buffer_size,
                                                     prob_alpha=self.config.prob_alpha)

        history_buffer = HistoryBuffer(self.config.history_buffer_size)
        
        # Initialize trackers
        # initialize timesteps and epoch
        timesteps = 0
        epoch = 0
        ## this variable is for episode
        eps_slope = abs(self.config.eps_start - self.config.eps)/self.config.eps_steps
        eps = self.config.eps_start
        ## this variable is to hold the losses along the time
        actor_losses = []
        critic_losses = []
        ## this variable is to hold the episodic rewards
        epi_rewards = []
        epi_avg_rewards = []
        
        e_arr = []
        
        ## this variable holds the user index
        ## got from the dictionary
        user_idxs = np.array(list(self.users.values()))
        np.random.shuffle(user_idxs)
        
        ## loop all the users based on indexes
        for idx, e in enumerate(user_idxs):
            ## starting the episodes
            
            ## the loops stop when timesteps-learning_start
            ## is bigger than the max timesteps
            if timesteps - self.config.learning_start > self.config.max_timesteps_train:
                break
            
            ## extracting positive user reviews
            ## e variable is an element right now
            user_reviews = self.train_data[self.train_data[:, self.u] == e]
            pos_user_reviews = user_reviews[user_reviews[:, self.r] > 0]
            
            ## check if the user ratings doesn't have enough positive review
            ## in this case history_buffer_size is 4
            ## get the shape object and 0 denote the row index
            if pos_user_reviews.shape[0] < self.config.history_buffer_size:
                continue
                
            candidate_items = tf.identity(tf.stop_gradient(self.item_embeddings))
            
            ## extracting user embedding tensors
            user_emb = self.user_embeddings[e]
            
            ## fill history buffer with positive item embeddings
            ## and remove item embeddings from candidate item sets
            ignored_items = []
            
            ## history_buffer_size has size of n items
            ## in this case 5
            for i in range(self.config.history_buffer_size):
                emb = candidate_items[tf.cast(pos_user_reviews[i, self.i], dtype='int32')]
                history_buffer.push(tf.identity(tf.stop_gradient(emb)))
                
            ## initialize rewards list
            rewards = []
            
            ## starting item index
            t = 0
            
            ## declaring the needed variable
            state = None
            action = None
            reward = None
            next_state = None
            
            while t < self.config.episode_length:
                ## observing the current state
                ## choose action according to actor network explorations
                
                ## inference calls start here
                
                if eps > self.config.eps:
                    eps -= eps_slope
                else:
                    eps = self.config.eps
                
                ## state is the result of DRRAve model inference
                ## history_buffer.to_list get the list of previous items
                ## state representaton has the size (300, )
                state = self.state_rep_net(user_emb, tf.stack(history_buffer.to_list()))
                if np.random.uniform(0, 1) < eps:
                    action = tf.convert_to_tensor(np.random.rand(1, self.action_shape), dtype='float32') 
                else:
                    action = self.actor_net(tf.reshape(tf.stop_gradient(state), [1, state.shape[0]]), training=False)
                    
                ranking_scores = candidate_items @ tf.reshape(action, (action.shape[1], 1))
                ## calculating ranking scores accross items, discard ignored items
                
                if len(ignored_items) > 0:
                    rec_items = tf.stack(ignored_items)
                else:
                    rec_items = []
            
                ## setting value of negative infinite
                ranking_scores[rec_items] = -99999999
                
                ## get the recommended items
                ## first get the maximum value index
                ## then get the items by index from candidate items
                ranking_scores = tf.reshape(ranking_scores, (ranking_scores.shape[0],))
                rec_item_idx = tf.math.argmax(ranking_scores)
                rec_item_emb = candidate_items[rec_item_idx]
                
                ## get the item reward
                if tf.cast(rec_item_idx, 'float64') in user_reviews[:, self.i]:
                    ## get the reward from rating in the dataset
                    ## if the user is rating the item
                    user_rec_item_idx = np.where(user_reviews[:, self.i] == float(rec_item_idx))[0][0]
                    reward = user_reviews[user_rec_item_idx, self.r]
                else:
                    if self.config.zero_reward:
                        reward = tf.convert_to_tensor(0)
                    else:
                        reward = self.reward_function(tf.convert_to_tensor(e), rec_item_idx)
                
                ## track the episode rewards
                rewards.append(reward.numpy())
                
                ## add item to history buffer if positive reviews
                if reward > 0:
                    history_buffer.push(tf.stop_gradient(rec_item_emb))
                    
                    next_state = self.state_rep_net(user_emb, tf.stack(history_buffer.to_list()), training=False)
                else:
                    ## keep the history buffer the same
                    ## the next state is the current state
                    next_state = tf.stop_gradient(state)
                
                ## remove new items from future recommendation
                ignored_items.append(tf.convert_to_tensor(rec_item_idx))
                
                ## add the (state, action, reward, next_state)
                ## to the experience replay
                replay_buffer.push(state, action, next_state, reward)
                
                ## Inference calling stops here
                ## Training start here
                if(timesteps > self.config.learning_start) and (len(replay_buffer) >= self.config.batch_size) and (timesteps % self.config.learning_freq == 0):
                    critic_loss, actor_loss, critic_params_norm = self.training_step(timesteps,
                                                                                     replay_buffer,
                                                                                     True
                                                                                     )
                    ## storing the losses along the time
                    actor_losses.append(actor_loss)
                    critic_losses.append(critic_loss)
                    
                    ## outputting the result
                    if timesteps % self.config.log_freq == 0:
                        if len(rewards) > 0:
                            print(
                                f'Timestep {timesteps - self.config.learning_start} | '
                                f'Episode {epoch} | '
                                f'Mean Ep R '
                                f'{np.mean(rewards):.4f} | '
                                f'Max R {np.max(rewards):.4f} | '
                                f'Critic Params Norm {critic_params_norm:.4f} | '
                                f'Actor Loss {actor_loss:.4f} | '
                                f'Critic Loss {critic_loss:.4f} | ')
                            sys.stdout.flush()
            
                ## housekeeping
                t += 1
                timesteps += 1
            
                ## end of timesteps
            ## end of episodes
            if timesteps - self.config.learning_start > t:
                epoch += 1
                e_arr.append(epoch)
                epi_rewards.append(np.sum(rewards))
                epi_avg_rewards.append(np.mean(rewards))
        
        print("Training Finished")
        return actor_losses, critic_losses, epi_avg_rewards
    
    def training_step(self, t, replay_buffer, training):
        ## create batches
        ## from utils programs
        # Create batches by calling sample methods
        transitions, indicies, weights = replay_buffer.sample(self.config.batch_size, beta=self.config.beta)
        
        weights = tf.convert_to_tensor(weights, dtype='float32')
        ## create the tuple using Transition function     
        batch = Transition(*zip(*transitions))
        
        ## preparing the batch for each data
        ## the concat function will flatten the data
        ## the reshape will reshape the data so that it receive 64 rows
        next_state_batch = tf.reshape(tf.concat(batch.next_state, 0), [self.config.batch_size, -1])
        state_batch = tf.reshape(tf.concat(batch.state, 0), [self.config.batch_size, -1])
        action_batch = tf.reshape(tf.concat(batch.action, 0), [self.config.batch_size, -1])
        reward_batch = tf.reshape(tf.concat(batch.reward, 0), [self.config.batch_size, -1])
        
        ## updating the critic networks
        with tf.GradientTape() as tape:
            critic_loss, new_priorities = self.compute_prioritized_dqn_loss(tf.stop_gradient(state_batch),
                                                                            action_batch,
                                                                            reward_batch,
                                                                            next_state_batch,
                                                                            weights)
        ## apply the gradient
        grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
        
        replay_buffer.update_priorities(indicies, new_priorities.numpy())
        ## critic norm clipping
        critic_param_norm = [tf.clip_by_norm(layer.get_weights()[0] ,self.config.clip_val) for layer in self.critic_net.layers]
        critic_param_norm = tf.norm(critic_param_norm[0])
        ## step the optimizers
        self.critic_optimizer.apply_gradients(zip(grads, self.critic_net.trainable_variables))
        
        ## updating the actor networks
        with tf.GradientTape(persistent=True) as tape:
            actions_pred = self.actor_net(state_batch, training=True)
            actor_loss = -tf.reduce_mean(self.critic_net(tf.stop_gradient(state_batch), actions_pred, training=True))
            
        ## compute the gradient
        grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
        ## apply the step to the optimizers
        self.actor_optimizer.apply_gradients(zip(grads, self.actor_net.trainable_variables))
        ## traceback the variables
#         grads = tape.gradient(actor_loss, self.state_rep_net.trainable_variables)
        ## apply the step
#         self.state_rep_optimizer.apply_gradients(zip(grads, self.state_rep_net.trainable_variables))
        ## minimizing the loss
        del tape
        ## updating the target networks
        self.soft_update(self.critic_net, self.target_critic_net, self.config.tau)
        self.soft_update(self.actor_net, self.target_actor_net, self.config.tau)
        
        return critic_loss.numpy(), actor_loss.numpy(), critic_param_norm
         
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: model which the weights will be copied from
            target_model: model which weights will be copied to
            tau (float): interpolation parameter
        """
        for t_layer, layer in zip(target_model.layers, local_model.layers):
            ## initiate list
            temp_w_arr = []
            for t_weights, weights in zip(t_layer.get_weights(), layer.get_weights()):
                ## fill the array list
                temp_w_arr.append(weights * tau + (1.0 + tau) * t_weights)
            ## copy the weights
            t_layer.set_weights(temp_w_arr)
    
#     @tf.function    
    def compute_prioritized_dqn_loss(self,
                                     state_batch,
                                     action_batch,
                                     reward_batch,
                                     next_state_batch,
                                     weights):
        '''
        :param state_batch: (tensor) shape = (batch_size x state_dims),
                The batched tensor of states collected during
                training (i.e. s)
        :param action_batch: (LongTensor) shape = (batch_size,)
                The actions that you actually took at each step (i.e. a)
        :param reward_batch: (tensor) shape = (batch_size,)
                The rewards that you actually got at each step (i.e. r)
        :param next_state_batch: (tensor) shape = (batch_size x state_dims),
                The batched tensor of next states collected during
                training (i.e. s')
        :param weights: (tensor) shape = (batch_size,)
                Weights for each batch item w.r.t. prioritized experience replay buffer
        :return: loss: (torch tensor) shape = (1),
                 new_priorities: (numpy array) shape = (batch_size,)
        '''
        ## create batches
        ## forward pass through target actor network
        next_action = self.target_actor_net(next_state_batch, training=False)
        q_target = self.target_critic_net(next_state_batch, next_action, training=False)
        ## y or target value that needs to be retreived
        y = reward_batch + self.config.gamma * q_target
        ## get q values from the current state
        q_vals = self.critic_net(state_batch, action_batch, training=True)
    
        ## calculate loss
        loss = tf.convert_to_tensor(y - q_vals)
        ## because loss is tensor shape
        ## we can extract the numpy value
        loss = tf.pow(loss, 2)
        weights_ten = tf.convert_to_tensor(weights)
        loss = tf.reshape(loss, (self.config.batch_size,)) * weights_ten
        ## stop the weights to be gradiented
        weights_ten = tf.stop_gradient(weights_ten)
        
        ## calculate new priorities
        new_priorities = tf.stop_gradient(loss + 1e-5)
        loss = tf.convert_to_tensor(tf.math.reduce_mean(loss))
        
        return loss, new_priorities

In [82]:
class config():
    ## hyperparameters
    ## setting the batch_size
    batch_size = 64
    gamma = 0.9
    replay_buffer_size = 100000
    history_buffer_size = 5
    learning_start = 32
    learning_freq = 1
    ## learning rate for each model networks
    lr_state_rep = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    
    eps_start = 1
    eps = 0.1
    eps_steps = 10000
    eps_eval = 0.1
    episode_length = 10
    
    tau = 0.01 # inital 0.001
    beta = 0.4
    prob_alpha = 0.3
    
    max_timesteps_train = 260000
    max_epochs_offline = 500
    max_timesteps_online = 20000
    embedding_feature_size = 100
    
    train_ratio = 0.8
    weight_decay = 0.01
    clip_val = 1.0
    log_freq = 100
    saving_freq = 1000
    zero_reward = False
    
## First importing the data
users = pickle.load(open('Dataset/user_id_to_num_mov.pkl', 'rb'))
items = pickle.load(open('Dataset/movie_id_to_num_mov.pkl', 'rb'))
data = np.load('Dataset/data.npy')

## hold the length of the data
n_users = len(users)
n_items = len(items)

## don't forget to normalize the data first
data[:, 2] = 0.5 * (data[:, 2] - 3)

## split and shuffle the data
np.random.shuffle(data)
## split the data
## ratio should be 0.8
train_data = tf.convert_to_tensor(data[:int(config.train_ratio * data.shape[0])])
test_data = tf.convert_to_tensor(data[int(config.train_ratio * data.shape[0]):])
print("Train Data:{}, Test Data:{}".format(np.shape(train_data), np.shape(test_data)))

## hold the PMF model
## get the user and item embeddings
reward_function = PMF(n_users, n_items, config.embedding_feature_size)
## need to flow some data to build the model
reward_function(1, 1)
## loading the whole layer weights
reward_function.load_weights('trained/adam/pmf_150_adam')
## freeze the model, because it will be used for inference
reward_function.trainable = False

## take the embedding layers weight
## and split the user and item weights
user_embeddings = tf.convert_to_tensor(reward_function.user_embedding.get_weights()[0])
item_embeddings = tf.convert_to_tensor(reward_function.item_embedding.get_weights()[0])
## output
print("user embedding has shape {} and item embedding has shape {}"
      .format(np.shape(user_embeddings[0]), np.shape(item_embeddings[0])))

## hold the model in the variable
## so it can be tracked
state_rep_function = DRRAveStateRepresentation
actor_function = Actor
critic_function = Critic

## initialize DRRTrain Class
trainer = DRRTrainer(config,
                     actor_function,
                     critic_function,
                     state_rep_function,
                     reward_function,
                     users,
                     items,
                     train_data,
                     test_data,
                     user_embeddings,
                     item_embeddings)

print("Start Training")
trainer.learn()

Train Data:(80000, 3), Test Data:(20000, 3)
user embedding has shape (100,) and item embedding has shape (100,)
Actor-Critic model has successfully instantiated
DRR Instantiazed
Start Training
NPRM and History Buffer Initialized


TypeError: 'tensorflow.python.framework.ops.EagerTensor' object does not support item assignment

In [32]:
state_batch = tf.convert_to_tensor(np.random.rand(64, 300), dtype='float32')
state_batch

<tf.Tensor: shape=(64, 300), dtype=float32, numpy=
array([[0.8003254 , 0.1580655 , 0.8195448 , ..., 0.45767805, 0.61804754,
        0.1651512 ],
       [0.9722305 , 0.5930467 , 0.72755915, ..., 0.04491422, 0.9201831 ,
        0.29480112],
       [0.52159023, 0.3600026 , 0.74447376, ..., 0.18060647, 0.76685077,
        0.8315051 ],
       ...,
       [0.1523655 , 0.41763544, 0.87739044, ..., 0.11766145, 0.9641829 ,
        0.36040053],
       [0.00584211, 0.08027299, 0.67973286, ..., 0.87598336, 0.7782308 ,
        0.51407045],
       [0.97023594, 0.35064337, 0.39672542, ..., 0.47496226, 0.8215673 ,
        0.44155088]], dtype=float32)>

In [None]:
next_state_batch = tf.convert_to_tensor(np.random.rand(64, 300), dtype='float32')
next_state_batch

In [None]:
reward_batch = tf.convert_to_tensor(np.random.rand(64, 1), dtype='float32')
reward_batch

In [None]:
action_batch = tf.convert_to_tensor(np.random.rand(64, 100), dtype='float32')
action_batch

In [36]:
weights = tf.convert_to_tensor(np.ones(64,), dtype='float32')
weights

<tf.Tensor: shape=(64,), dtype=float32, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [37]:
actor_net = Actor(300, 100)
target_actor_net = Actor(300, 100)

In [38]:
critic_net = Critic(100, 300, 1)
target_critic_net = Critic(100, 300, 1)

In [40]:
action = actor_net(state_batch)
t_action = target_actor_net(state_batch)

In [None]:
critic_net(state_batch, action_batch)
target_critic_net(state_batch, action_batch)

In [44]:
def soft_update(local_model, target_model, tau):
    for target_weights, local_weights in zip(target_model.layers, local_model.layers):
        temp_w = local_weights.get_weights()[0] * tau + (1.0 - tau) * target_weights.get_weights()[0]
        target_weights.set_weights([temp_w])

In [None]:
for t_layer, layer in zip(target_model.layers, local_model.layers):
    temp_w_arr = []
    for t_weights, weights in zip(t_layer.get_weights(), layer.get_weights()):
        temp_w_arr.append(weights * tau + (1.0 + tau) * t_weights)
    t_layer.set_weights(temp_w_arr)


In [None]:
soft_update(actor_net, target_actor_net, 0.9)

In [None]:
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [None]:
with tf.GradientTape() as tape:
    ## create batches
    ## forward pass through target actor network
    
    ## gradient tape will watch the input
#     tape.watch(state_batch)
    
    next_action = target_actor_net(next_state_batch, training=False)
    q_target = target_critic_net(next_state_batch, next_action, training=False)
    
    ## y or target value that needs to be retreived
    y = reward_batch + 0.9 * q_target
    
    ## get q values from the current state
    q_vals = critic_net(state_batch, action_batch, training=True)
    
    ## calculate loss
    loss = tf.convert_to_tensor(y - q_vals)
    
    ## because loss is tensor shape
    ## we can extract the numpy value
    loss = tf.pow(loss, 2)
    weights_ten = tf.convert_to_tensor(weights)
    loss = loss * weights_ten
    
    ## stop the weights to be gradiented
    weights_ten = tf.stop_gradient(weights_ten)
    
    ## calculate new priorities
    new_priorities = tf.stop_gradient(loss + 1e-5)
    
    loss = tf.reduce_mean(loss)

## apply the gradient
## the gradients dloss,dx
grads = tape.gradient(loss, critic_net.trainable_variables)

## step the optimizers
critic_optimizer.apply_gradients(zip(grads, critic_net.trainable_variables))

In [166]:
actor_net

<model.Actor at 0x1c1f043d8b0>

In [168]:
actor_net.summary(expand_nested=True)

Model: "actor_54"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_370 (Dense)           multiple                  90300     
                                                                 
 dense_371 (Dense)           multiple                  90300     
                                                                 
 dense_372 (Dense)           multiple                  30100     
                                                                 
Total params: 210,700
Trainable params: 210,700
Non-trainable params: 0
_________________________________________________________________


In [150]:
actor_net.layers[0].set_weights([state_weights[0], state_weights[3]])

In [165]:
for layer in actor_net.layers:
    print(layer.get_weights())

[array([[ 0.12920594,  0.04746147,  0.01159474, ..., -0.04332206,
        -0.0399523 , -0.01784163],
       [ 0.04280023,  0.07425916, -0.02163045, ..., -0.01079362,
         0.02177624, -0.05013077],
       [ 0.00946331, -0.0721656 ,  0.0362711 , ...,  0.0158407 ,
        -0.103851  ,  0.02021474],
       ...,
       [-0.01317058,  0.07530317,  0.01302899, ...,  0.05384679,
         0.03160053, -0.04293234],
       [-0.10235567,  0.07339463,  0.04874316, ...,  0.09172105,
         0.06981438,  0.01396277],
       [ 0.00879064, -0.02434867,  0.03486311, ...,  0.03629652,
        -0.0247658 ,  0.07643944]], dtype=float32), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [156]:
len(state_weights[0])

300

In [131]:
actor_net.layers[0].get_weights()

[array([[ 0.0022795 ,  0.00252163,  0.00229887, ...,  0.00260937,
          0.00362607,  0.00277955],
        [ 0.00257418,  0.00293488,  0.00259659, ...,  0.00376521,
          0.00504805,  0.00370631],
        [ 0.00374366,  0.0041279 ,  0.00382073, ...,  0.00620195,
          0.00979411,  0.00718564],
        ...,
        [ 0.0024325 ,  0.00281001,  0.00251335, ...,  0.00360964,
          0.00461915,  0.0034384 ],
        [ 0.00378533,  0.00419091,  0.00395471, ...,  0.00732636,
          0.01150232,  0.00886642],
        [ 0.00121229,  0.00153037,  0.00136205, ...,  0.00023458,
         -0.00045106, -0.00034669]], dtype=float32),
 array([ 1.20913200e-01,  1.16120107e-01,  1.01726502e-01,  1.39423817e-01,
         1.37664318e-01,  9.64591429e-02,  1.01730131e-01,  1.16164431e-01,
         1.24847375e-01,  8.63403752e-02,  1.35276631e-01,  1.38140172e-01,
         1.34562790e-01,  8.05982649e-02,  1.41219109e-01,  1.48850337e-01,
         1.43470481e-01,  1.05250999e-01,  1.30354658e

In [118]:
state_weights = np.load('trained/actor_net_weight.npy', allow_pickle=True)
# state_weights = np.reshape(state_weights, (-1, 1))

array([[ 0.0022795 ,  0.00252163,  0.00229887, ...,  0.00260937,
         0.00362607,  0.00277955],
       [ 0.00257418,  0.00293488,  0.00259659, ...,  0.00376521,
         0.00504805,  0.00370631],
       [ 0.00374366,  0.0041279 ,  0.00382073, ...,  0.00620195,
         0.00979411,  0.00718564],
       ...,
       [ 0.0024325 ,  0.00281001,  0.00251335, ...,  0.00360964,
         0.00461915,  0.0034384 ],
       [ 0.00378533,  0.00419091,  0.00395471, ...,  0.00732636,
         0.01150232,  0.00886642],
       [ 0.00121229,  0.00153037,  0.00136205, ...,  0.00023458,
        -0.00045106, -0.00034669]], dtype=float32)

In [123]:
state_weights[0]

array([[ 0.0022795 ,  0.00252163,  0.00229887, ...,  0.00260937,
         0.00362607,  0.00277955],
       [ 0.00257418,  0.00293488,  0.00259659, ...,  0.00376521,
         0.00504805,  0.00370631],
       [ 0.00374366,  0.0041279 ,  0.00382073, ...,  0.00620195,
         0.00979411,  0.00718564],
       ...,
       [ 0.0024325 ,  0.00281001,  0.00251335, ...,  0.00360964,
         0.00461915,  0.0034384 ],
       [ 0.00378533,  0.00419091,  0.00395471, ...,  0.00732636,
         0.01150232,  0.00886642],
       [ 0.00121229,  0.00153037,  0.00136205, ...,  0.00023458,
        -0.00045106, -0.00034669]], dtype=float32)

In [103]:
state_rep_net = DRRAveStateRepresentation(100, 100)

In [111]:
state_rep_net.set_weights([state_weights])

In [114]:
state_rep_net.save_weights('state_rep_weights')