In [1]:
import tensorflow as tf
import os 
import numpy as np
import pandas as pd
from model import CNN_Model,Params

  from ._conv import register_converters as _register_converters


In [2]:
class experience_replay_buffer:
    """Fixed-capacity store of experience tuples with uniform random sampling.

    Each experience is expected to be a 4-tuple
    ``(state, action, next_state, reward)``; ``get_batch`` unpacks the
    tuples into those four columns.
    """
    def __init__(self,size,dtype):
        """Allocate an empty buffer.

        Args:
            size: maximum number of experiences kept.
            dtype: numpy dtype of the storage array (use ``object`` to store
                plain Python tuples).
        """
        self.buffer=np.empty(size,dtype=dtype)
        self.num_items=0
        self.capacity=size

    def add_experience(self,experience):
        """Store one experience.

        While there is free space the experience is appended; once the buffer
        is full a uniformly chosen existing slot is overwritten.
        """
        if self.num_items<self.capacity:
            ind=self.num_items
            self.num_items+=1
        else:
            # A plain integer index is required here: indexing with a length-1
            # array would make numpy try to broadcast the tuple's fields
            # into the slot instead of storing the tuple itself.
            ind=int(np.random.randint(low=0,high=self.capacity))
        self.buffer[ind]=experience

    def get_batch(self,batch_size):
        """Sample ``batch_size`` stored experiences uniformly with replacement.

        Returns:
            Four numpy arrays of length ``batch_size``:
            ``(states, actions, next_states, rewards)``.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        if self.num_items==0:
            raise ValueError("cannot sample from an empty experience replay buffer")
        # Only sample from slots that have actually been written.
        inds=np.random.randint(low=0,high=self.num_items,size=batch_size,dtype=np.int32)
        experiences=self.buffer[inds]
        # Hand pandas a list of tuples; a 1-D object ndarray would be treated
        # as a single column and clash with the four column names.
        df=pd.DataFrame(experiences.tolist(), columns=['states', 'actions', 'next_states','rewards'])
        return np.array(df.states),np.array(df.actions),np.array(df.next_states),np.array(df.rewards)

In [8]:
class Q_Network(CNN_Model):
    """Deep Q-learning network built on top of ``CNN_Model``.

    The logits produced by ``CNN_Model.build_model_till_logits`` are used as
    the per-action Q values. Experiences are kept in an
    ``experience_replay_buffer`` that is created lazily on first insert.
    """
    def __init__(self,max_experience_buffer_len=120,param_dict={},restore_params=False,pickle_file_path=""):
        """Forward the model arguments to ``CNN_Model`` and remember the replay-buffer capacity."""
        CNN_Model.__init__(self,param_dict,restore_params,pickle_file_path)
        self.max_experience_buffer_len=max_experience_buffer_len

    def form_loss(self,logits,targets):
        """Return ``self.params.loss_fn(labels=targets,logits=logits)``.

        Not used by the Q-learning loss built in ``Build_model``.
        """
        return self.params.loss_fn(labels=targets,logits=logits)

    def Build_model(self):
        """Build the Q-learning graph on top of the logits.

        Creates the greedy-action / max-Q tensors, the placeholders for the
        taken actions, rewards, next-state max Q values and discount rate,
        the mean squared TD-error loss, and the train op.
        """
        self.build_model_till_logits()
        with tf.variable_scope(self.params.name_scope):
            #logits are the q values: one per action
            self.max_q_value_actions=tf.squeeze(tf.argmax(self.logits,axis=1)) #action with the highest q value
            self.max_q_values=tf.reduce_max(self.logits,axis=1)

            #placeholder for the (integer, not one-hot) action taken at the current timestep
            #NOTE: (None,) is a 1-D shape; the previous "(None)" was just None (unknown shape)
            self.actions=self.form_placeholder((None,),tf.int32)
            one_hot_actions=tf.one_hot(indices=self.actions,depth=self.params.num_outputs)
            #q value of the action that was actually taken
            q_vals=tf.reduce_sum(self.logits*one_hot_actions,axis=1)

            #placeholders for max next state q values, rewards and discount rate
            self.max_q_values_next_state=self.form_placeholder((None,),tf.float32)
            self.rewards=self.form_placeholder((None,),tf.float32)
            self.discount_rate=self.form_placeholder([],tf.float32)

            #mean squared TD error: (Q(s,a) - (r + gamma*max_a' Q(s',a')))^2
            self.loss=tf.reduce_mean(tf.square(q_vals-(self.rewards+(self.discount_rate*self.max_q_values_next_state))))
            #computing gradients
            optimizer=self.params.optimizer_fn(learning_rate=self.lr_placeholder)
            self.grads_and_vars=optimizer.compute_gradients(loss=self.loss,var_list=self.model_trainable_variables)

            self.train_op=optimizer.apply_gradients(grads_and_vars=self.grads_and_vars,global_step=self.step_no)

            self.initializer=tf.global_variables_initializer()

    def add_to_experience_replay(self,state,action,next_state,reward):
        """Append one transition to the replay buffer, creating the buffer on first use."""
        experience=(state,action,next_state,reward)
        if not hasattr(self,"experience_replay_buffer"):
            #object dtype so every slot holds the whole experience tuple
            self.experience_replay_buffer=experience_replay_buffer(size=self.max_experience_buffer_len,dtype=object)
        self.experience_replay_buffer.add_experience(experience)

    def train(self,sess,episodes,steps,epsilon,batch_size,env,save_dir,save_every_n_iter,log_every_n_iter,initialize=False,set_logging=True,discount_rate=0.99):
        """Run epsilon-greedy Q-learning against ``env``.

        Args:
            sess: TensorFlow session holding the graph built by ``Build_model``.
            episodes: number of episodes to play.
            steps: maximum number of environment steps per episode.
            epsilon: probability of taking a uniformly random action.
            batch_size: number of experiences sampled per training step.
            env: environment exposing ``reset()`` and ``step(action)`` returning
                ``(next_state,reward,done,info)``.
            save_dir: directory passed to ``create_log_directory_if_doesnt_exist``.
            save_every_n_iter: accepted for interface compatibility; currently unused.
            log_every_n_iter: accepted for interface compatibility; currently unused.
            initialize: run the variable initializer before training.
            set_logging: try to set up the log directory (no summaries are written yet).
            discount_rate: value fed to the ``discount_rate`` placeholder of the loss.
        """
        if initialize:
            sess.run([self.initializer])
        if set_logging:
            log_dir,set_logging=self.create_log_directory_if_doesnt_exist(save_dir)
        if set_logging: #only still true if the directory was created or found above
            print("logging called but no code implemented")

        #stand-in network input for terminal transitions (their q value is zeroed below)
        zero_state=np.zeros(shape=self.params.input_shape[1:],dtype=np.float32)

        for episode in np.arange(episodes):
            state=env.reset()
            for step in np.arange(steps):
                #choosing action (epsilon-greedy)
                if np.random.random()<epsilon:
                    action=int(np.random.randint(low=0,high=self.params.num_outputs))
                else:
                    #NOTE(review): training_mode is fed True during action selection, as before.
                    #If it toggles dropout/batch-norm behaviour, False may be intended here - confirm.
                    feed_dict={self.X:np.expand_dims(state,axis=0),self.lr_placeholder:self.params.learning_rate,self.training_mode:True}
                    [greedy_action]=sess.run([self.max_q_value_actions],feed_dict=feed_dict)
                    action=int(np.asarray(greedy_action).reshape(-1)[0])

                next_state,reward,done,info=env.step(action)

                if done:
                    #a scalar nan marks "no next state" for terminal transitions
                    self.add_to_experience_replay(state,action,np.nan,reward)
                    break
                self.add_to_experience_replay(state,action,next_state,reward)
                state=next_state

                #performing training step
                states,actions,next_states,rewards=self.experience_replay_buffer.get_batch(batch_size=batch_size)

                #terminal transitions carry the scalar nan sentinel instead of a state array
                terminal=np.array([np.ndim(ns)==0 for ns in next_states],dtype=bool)
                next_state_batch=np.stack([zero_state if is_terminal else ns for ns,is_terminal in zip(next_states,terminal)])

                #finding max q values of the next states
                feed_dict={self.X:next_state_batch,self.lr_placeholder:self.params.learning_rate,self.training_mode:True}
                [max_q_vals_next_state]=sess.run([self.max_q_values],feed_dict=feed_dict)
                #no future reward after the end of an episode
                max_q_vals_next_state=np.where(terminal,0.0,max_q_vals_next_state)

                #calculating loss and running train op
                feed_dict={
                    self.X:np.stack(list(states)),
                    self.actions:np.asarray(actions,dtype=np.int32),
                    self.max_q_values_next_state:max_q_vals_next_state,
                    self.rewards:np.asarray(rewards,dtype=np.float32),
                    self.discount_rate:discount_rate,
                    self.lr_placeholder:self.params.learning_rate,
                    self.training_mode:True,
                }
                loss,_=sess.run([self.loss,self.train_op],feed_dict=feed_dict)

In [9]:
# Model configuration; passed to Q_Network as its param_dict when the model is created.
params=dict(
    input_shape=[None, 35, 190, 1],
    num_outputs=3,

    # Layers are listed in the order given here.
    layer_hierarchy=[
        dict(layer_type='conv_layer',kernel_size=8,kernel_strides=1,num_filters=16,padding='valid'),
        dict(layer_type='batch_normalization_layer'),
        dict(layer_type='activation_layer'),
        dict(layer_type='conv_layer',kernel_size=4,kernel_strides=1,num_filters=32,padding='valid'),
        dict(layer_type='batch_normalization_layer'),
        dict(layer_type='activation_layer'),
        dict(layer_type='flattening_layer'),
        dict(layer_type='fc_layer',num_hidden_units=256),
        dict(layer_type='batch_normalization_layer'),
        dict(layer_type='activation_layer'),
        dict(layer_type='dropout_layer',dropout_probability=0.5),
        dict(layer_type='fc_layer',num_hidden_units=100),
        dict(layer_type='batch_normalization_layer'),
        dict(layer_type='activation_layer'),
        dict(layer_type='dropout_layer',dropout_probability=0.5),
    ],
    initializer_fn=tf.contrib.layers.variance_scaling_initializer,
    activation_fn=tf.nn.relu,
    # Careful: only Q_Network.form_loss calls loss_fn; the loss built in Build_model does not use it.
    loss_fn=tf.nn.sparse_softmax_cross_entropy_with_logits,
    learning_rate=0.001,
    optimizer_fn=tf.train.AdamOptimizer,
    logdir='/tf_logs_rnn/run/',
    name_scope='neural_network_bn',
)

In [10]:
# Q_Network.__init__ signature, for reference:
#   (self,max_experience_buffer_len=120,param_dict={},restore_params=False,pickle_file_path="")
tf.reset_default_graph()  # reset the default graph before the model is built
model=Q_Network(max_experience_buffer_len=120,param_dict=params)
model.Build_model()