# Reinforcement Learning
## Importing Libraries

In [1]:
import gym
from keras.models import Sequential
from keras.layers import Dense, Dropout 
from keras.optimizers import Adam
#from keras.constraints import MaxNorm
import numpy as np
import tensorflow as tf
import gc
import pickle
from keras.models import load_model
from time import time

Using TensorFlow backend.


## TensorBoard auxiliary functions

In [2]:
def variable_summaries(var):
    """Attach TensorBoard summaries to every tensor in ``var``.

    Each element gets its own ``Layer_<index>`` name scope containing scalar
    summaries for mean, standard deviation, max and min, plus a histogram.
    """
    for layer_index, tensor in enumerate(var):
        with tf.name_scope("Layer_" + str(layer_index)), tf.name_scope('summaries'):
            tensor_mean = tf.reduce_mean(tensor)
            tf.summary.scalar('mean', tensor_mean)
            # The standard deviation is built inside its own nested scope.
            with tf.name_scope('stddev'):
                centered = tensor - tensor_mean
                tensor_std = tf.sqrt(tf.reduce_mean(tf.square(centered)))
            tf.summary.scalar('stddev', tensor_std)
            tf.summary.scalar('max', tf.reduce_max(tensor))
            tf.summary.scalar('min', tf.reduce_min(tensor))
            tf.summary.histogram('histogram', tensor)


In [3]:
def variable_summaries2(var):
    """Attach a scalar summary and a histogram for the single tensor ``var``.

    Both summaries are created inside a ``summaries`` name scope.
    """
    with tf.name_scope('summaries'):
        for emit, tag in ((tf.summary.scalar, 'value'), (tf.summary.histogram, 'histogram')):
            emit(tag, var)


### Checking the use of GPUs

In [4]:
import keras.backend as K

# Show which GPU devices the TensorFlow backend reports.
print(K.tensorflow_backend._get_available_gpus())

# Keep a handle on the backend session; the rest of the notebook runs its
# own TensorFlow ops through it.
sess = K.get_session()

['/gpu:0']


## Initializing Environment

In [5]:
# Width of every hidden layer and the dropout rates used by the three networks.
number_of_neurons, layer_dropout, input_dropout = 64, 0.5, 0.1

# Create the environment, then grab one observation and one sampled action;
# their shapes size the network inputs and outputs further down.
env = gym.make('Humanoid-v2')
So = env.reset()
A = env.action_space.sample()

## Setting learning hyperparameters 

In [6]:
#Policy
policy_alpha = 0.01
policy_lambda =  0
#Value
value_alpha = 0.01
value_lambda = 0

## Creating the model for policy mean

In [7]:
# Policy-mean network: input dropout, three ReLU hidden layers (each followed
# by dropout) and a linear output with one unit per action component.
policy_model_mean = Sequential()
policy_model_mean.add(Dropout(input_dropout, input_shape=So.shape))
policy_model_mean.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dropout(layer_dropout))
policy_model_mean.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dropout(layer_dropout))
policy_model_mean.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dropout(layer_dropout))
policy_model_mean.add(Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'))

# Compiling attaches the optimizer object; later cells call its
# get_gradients() to build the custom update ops.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_mean.compile(loss="mse", optimizer=adam)
print("Policy Mean network")
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
policy_model_mean.summary()

Policy Mean network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_1 (Dropout)          (None, 376)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                24128     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0

## Creating the model for policy std

In [8]:
# Policy-spread network: same architecture as the mean network. Its raw
# output is squared by the cells that consume it (see select_action).
policy_model_std = Sequential()
policy_model_std.add(Dropout(input_dropout, input_shape=So.shape))
policy_model_std.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dropout(layer_dropout))
policy_model_std.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dropout(layer_dropout))
policy_model_std.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dropout(layer_dropout))
policy_model_std.add(Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'))

# Compiling attaches the optimizer object; later cells call its
# get_gradients() to build the custom update ops.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_std.compile(loss="mse", optimizer=adam)
print("Policy std network")
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
policy_model_std.summary()

Policy std network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_5 (Dropout)          (None, 376)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                24128     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0 

## Creating model for values 

In [9]:
# State-value network: input dropout, three ReLU hidden layers (each followed
# by dropout) and a single linear output unit.
value_model = Sequential()
value_model.add(Dropout(input_dropout, input_shape=So.shape))
value_model.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dropout(layer_dropout))
value_model.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dropout(layer_dropout))
value_model.add(Dense(number_of_neurons, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dropout(layer_dropout))
value_model.add(Dense(1, kernel_initializer='random_uniform', activation='linear'))

print("Value Model")
# Compiling attaches the optimizer object; later cells call its
# get_gradients() to build the custom update ops.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
value_model.compile(loss="mse", optimizer=adam)
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
value_model.summary()

Value Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_9 (Dropout)          (None, 376)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                24128     
_________________________________________________________________
dropout_10 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0        

## Eligibility Traces for the models

In [10]:
sess.run(tf.global_variables_initializer())


def _zero_traces_for(weights):
    """Return one float32 zero Variable per tensor in ``weights``.

    Each Variable is sized from the tensor's shape as evaluated in ``sess``.
    """
    return [
        tf.Variable(np.zeros(shape=sess.run(tf.shape(weight))), dtype=tf.float32)
        for weight in weights
    ]


# One eligibility trace per trainable weight tensor of each network, with
# TensorBoard summaries attached under eligibility_traces/<MEAN|STD|VALUE>.
with tf.name_scope("eligibility_traces"):
    with tf.name_scope("MEAN"):
        policy_eligibility_traces_mean = _zero_traces_for(policy_model_mean.trainable_weights)
        variable_summaries(policy_eligibility_traces_mean)
    with tf.name_scope("STD"):
        policy_eligibility_traces_std = _zero_traces_for(policy_model_std.trainable_weights)
        variable_summaries(policy_eligibility_traces_std)
    with tf.name_scope("VALUE"):
        value_eligibility_traces = _zero_traces_for(value_model.trainable_weights)
        variable_summaries(value_eligibility_traces)
    

## Defining other hyperparameters concerning the reinforcement learning

In [11]:
# Running average-reward estimate, started at 5. A previously pickled value
# could be restored instead: pickle.load(open('average_reward.ferch','rb')).
average_reward = 5
# Smoothed recent reward, maintained by update_weights.
cumulative_reward = 0
# Step size of the average-reward update, and the norm bound used when
# clipping gradients.
etaa, clip_norm = 0.0001, 1

In [12]:
# Checkpoint restore is switched off; change the condition to resume from
# the .h5 files written by the main loop.
if False:
    policy_model_mean = load_model('policy_mean.h5')
    policy_model_std = load_model('policy_std.h5')
    value_model = load_model('value.h5')

## Update weights

In [13]:
# Grab the trainable weight tensors that the custom update ops will modify.
value_weights = value_model.trainable_weights
policy_weights_mean = policy_model_mean.trainable_weights
policy_weights_std = policy_model_std.trainable_weights

# Attach TensorBoard summaries under weights/<VALUE|MEAN|STD>.
with tf.name_scope("weights"):
    for scope_label, tracked_weights in (
        ("VALUE", value_weights),
        ("MEAN", policy_weights_mean),
        ("STD", policy_weights_std),
    ):
        with tf.name_scope(scope_label):
            variable_summaries(tracked_weights)


In [14]:
# Scalar variables that mirror the latest reward, TD error and average-reward
# estimate so they show up in TensorBoard.
with tf.name_scope("Rewards"):
    with tf.name_scope("R"):
        R = tf.Variable(0, dtype= tf.float32)
        variable_summaries2(R)
    with tf.name_scope("Delta_R"):
        Delta_R = tf.Variable(0, dtype= tf.float32)
        variable_summaries2(Delta_R)
    with tf.name_scope("Average_R"):
        Average_R = tf.Variable(0, dtype= tf.float32)
        variable_summaries2(Average_R)

# Initialise only the variables this notebook created itself (the reward
# trackers above and the eligibility traces). A global initializer would also
# re-initialise the Keras model weights, discarding anything restored by the
# load_model cell when it is enabled.
sess.run(tf.variables_initializer(
    [R, Delta_R, Average_R]
    + list(policy_eligibility_traces_mean)
    + list(policy_eligibility_traces_std)
    + list(value_eligibility_traces)
))

In [15]:
#VALUE OPS
global value_eligibility_traces , value_weights , policy_weights_std
value_gradients = value_model.optimizer.get_gradients(value_model.output, value_weights)
Delta_rewards = tf.placeholder(tf.float32)
value_gradients = [tf.clip_by_norm(gradient, clip_norm) for gradient in value_gradients]
value_eligibility_traces_op = [value_eligibility_traces[i].assign(tf.add(tf.multiply(value_eligibility_traces[i], tf.constant(value_lambda, dtype = tf.float32)),value_gradients[i] )) for i in range(len(value_eligibility_traces))] 
value_weights_op =  [value_weights[i].assign(tf.add(value_weights[i], tf.multiply(value_alpha*Delta_rewards, value_eligibility_traces[i]))) for i in range(len(value_weights))]
    

In [16]:
#MEAN OPS
global policy_eligibility_traces_mean, policy_eligibility_traces_std , policy_weights_mean
Action = tf.placeholder(tf.float32, shape = A.shape)
Policy_std = tf.placeholder(tf.float32)
loss_mean = tf.divide(tf.subtract(Action, policy_model_mean.output), Policy_std*Policy_std)
policy_gradients_mean = policy_model_mean.optimizer.get_gradients(loss_mean, policy_weights_mean)
policy_gradients_mean = [tf.clip_by_norm(gradient, clip_norm) for gradient in policy_gradients_mean]
policy_eligibility_traces_op_mean = [policy_eligibility_traces_mean[i].assign(tf.add(tf.multiply(tf.constant(policy_lambda, dtype = tf.float32), policy_eligibility_traces_mean[i]), policy_gradients_mean[i])) for i in range(len(policy_gradients_mean))]

policy_weights_op_mean =  [policy_weights_mean[i].assign(tf.add(policy_weights_mean[i], tf.multiply(policy_eligibility_traces_mean[i],policy_alpha*Delta_rewards))) for i in range(len(policy_weights_mean))]

In [17]:
#STD OPS
global policy_eligibility_traces_mean, policy_eligibility_traces_std , policy_weights_mean
Policy_mean =tf.placeholder(tf.float32)
loss_std = 2* tf.subtract(tf.divide(tf.square(tf.subtract(Action,Policy_mean)), tf.pow(policy_model_std.output, 3)), tf.divide(1, policy_model_std.output))
policy_gradients_std = policy_model_std.optimizer.get_gradients(loss_std,policy_weights_std)
policy_gradients_std = [tf.clip_by_norm(gradient, clip_norm) for gradient in policy_gradients_std]
policy_eligibility_traces_op_std = [policy_eligibility_traces_std[i].assign(tf.add(tf.multiply(tf.constant(policy_lambda, dtype = tf.float32), policy_eligibility_traces_std[i]), policy_gradients_std[i])) for i in range(len(policy_gradients_std))]

policy_weights_op_std =  [policy_weights_std[i].assign(tf.add(policy_weights_std[i], tf.multiply( policy_eligibility_traces_std[i],policy_alpha*Delta_rewards))) for i in range(len(policy_weights_std))]

In [18]:
def update_weights(previous_state, action , reward, state,  terminal, skip_count):
    """Run one actor-critic update for the transition previous_state -> state.

    Args:
        previous_state: observation in which ``action`` was taken.
        action: the action that was executed.
        reward: reward accumulated since the last update.
        state: observation reached after the action.
        terminal: True when ``state`` ended the episode; the next-state value
            is then taken as 0.
        skip_count: number of environment steps covered by ``reward``; the
            average reward is subtracted once per step.

    Side effects: updates the globals ``average_reward`` and
    ``cumulative_reward``, the TensorBoard tracker variables, and the
    eligibility traces and weights of the value, mean and spread networks.
    """
    global average_reward, cumulative_reward

    previous_batch = np.array([previous_state])

    # Each value estimate is needed several times; predict once and reuse.
    previous_value = value_model.predict(previous_batch)[0][0]
    next_value = value_model.predict(np.array([state]))[0][0]

    #Compute rewards
    delta_rewards = reward - average_reward*skip_count + next_value - previous_value
    cumulative_reward = (cumulative_reward + reward)/2
    print(previous_value ,"=====",next_value,"====",delta_rewards)
    if(terminal):
        # No bootstrapping from a terminal state.
        delta_rewards = reward - average_reward*skip_count + 0  - previous_value

    # Clip the TD error to [-bound, bound]. Taking the absolute value keeps
    # the interval valid when average_reward + cumulative_reward is negative
    # (a negative bound would otherwise force every TD error to one constant).
    thresh = abs(average_reward + cumulative_reward)
    delta_rewards = max(-thresh, min(thresh, delta_rewards))

    average_reward = average_reward + etaa*delta_rewards
    # load() overwrites the variable's value without adding a new assign op
    # to the graph on every call.
    Delta_R.load(delta_rewards, sess)
    Average_R.load(average_reward, sess)

    #Compute value updates (eligibility traces and weights)
    # The trace accumulates the value gradient at the state being evaluated,
    # i.e. the state the transition started from.
    sess.run(value_eligibility_traces_op,feed_dict={value_model.input:previous_batch})
    sess.run(value_weights_op, feed_dict={Delta_rewards:delta_rewards})

    #Compute policy updates (eligibility traces and weights)
    policy_mean = policy_model_mean.predict(previous_batch)[0]
    # select_action samples with variance = output**2, so the standard
    # deviation is |output|; the mean op squares Policy_std itself.
    policy_std = np.abs(policy_model_std.predict(previous_batch)[0])

    ##################################################  MEAN ##################################################################
    # The score of the taken action is evaluated at the state it was taken in.
    sess.run(policy_eligibility_traces_op_mean, feed_dict={policy_model_mean.input:previous_batch, Action:action, Policy_std:policy_std})
    sess.run(policy_weights_op_mean, feed_dict={Delta_rewards:delta_rewards})

    ##################################################   STD ###################################################################
    sess.run(policy_eligibility_traces_op_std, feed_dict={policy_model_std.input:previous_batch, Action:action, Policy_mean:policy_mean})
    sess.run(policy_weights_op_std, feed_dict={Delta_rewards:delta_rewards} )
    
    

## Sample action 

In [19]:
def select_action(state):
    """Sample an action from the Gaussian policy evaluated at ``state``.

    The mean comes from ``policy_model_mean``. The squared output of
    ``policy_model_std`` is placed on the diagonal of the covariance matrix
    passed to ``np.random.multivariate_normal``.
    """
    mean = policy_model_mean.predict(np.array([state]))[0]
    spread_output = policy_model_std.predict(np.array([state]))[0]
    variance = spread_output * spread_output

    # Diagonal covariance: zeros everywhere except the per-component variance.
    size = mean.shape[0]
    covariance = np.zeros([size, size])
    diagonal = np.arange(variance.shape[0])
    covariance[diagonal, diagonal] = variance

    return np.random.multivariate_normal(mean, covariance)


In [20]:
# Disabled draft code, kept as a bare string literal so it never executes:
# it clamps the sampled action against env.action_space.low/high and falls
# back to random sampling when average_reward_variance is small.
# NOTE(review): it refers to `action` and `average_reward_variance`, neither
# of which is defined in this cell - confirm before re-enabling.
'''      
    #FIXING ACTION INCONSISTENCIES
    low = env.action_space.low
    high = env.action_space.high
    
    for i in range(action.shape[0]):
        if(action[i]>high[i]):
            action[i]= high[i]+0.1
        if(action[i]<low[i]):
            action[i]=low[i]+0.1
            \    high = env.action_space.high
    global average_reward_variance
    print(average_reward_variance)
    if(np.abs(average_reward_variance)<0.1):
        print("Sampling randomly")
        return env.action_space.sample()
''' 

'      \n    #FIXING ACTION INCONSISTENCIES\n    low = env.action_space.low\n    high = env.action_space.high\n    \n    for i in range(action.shape[0]):\n        if(action[i]>high[i]):\n            action[i]= high[i]+0.1\n        if(action[i]<low[i]):\n            action[i]=low[i]+0.1\n            \\    high = env.action_space.high\n    global average_reward_variance\n    print(average_reward_variance)\n    if(np.abs(average_reward_variance)<0.1):\n        print("Sampling randomly")\n        return env.action_space.sample()\n'

# Run Episodes
## Setting up variables :D

In [21]:
# Start the first episode.
S = env.reset()

# Per-episode bookkeeping.
episode_count = 0
total_reward = 0
total_reward_list = []

# Step counters: total actions taken, plus the steps and reward accumulated
# since the last weight update.
action_count = 0
skip_count = 0
skip_reward = 0

# When True, models and the reward history are written out periodically.
save = True

In [22]:
# Single op that evaluates every summary registered so far.
merged = tf.summary.merge_all()
# Event-file writer for TensorBoard; the session graph is written as well.
writer = tf.summary.FileWriter(logdir="./logz/", graph=sess.graph)

In [None]:
# Ops that overwrite every eligibility trace with zeros of its own shape.
# They are built once here and run at the end of each episode.
policy_eligibility_traces_mean_reset = [
    trace.assign(np.zeros(shape=sess.run(tf.shape(trace))))
    for trace in policy_eligibility_traces_mean
]
policy_eligibility_traces_std_reset = [
    trace.assign(np.zeros(shape=sess.run(tf.shape(trace))))
    for trace in policy_eligibility_traces_std
]
value_eligibility_traces_reset = [
    trace.assign(np.zeros(shape=sess.run(tf.shape(trace))))
    for trace in value_eligibility_traces
]

## Main loop 

In [None]:
# The action bounds do not change between steps, so read them once.
high = env.action_space.high

# Training runs until the kernel is interrupted.
while True:

    #renders environment
    #env.render()

    #Selects action according to stochastic policy
    # NOTE(review): the policy is sampled at S % high[0], but update_weights
    # below receives the raw S - confirm which input the networks should see.
    action = select_action(S%high[0])

    # Every 50 actions, evaluate the merged summaries and log them.
    if action_count%50 == 0:
        summary = sess.run(merged)
        writer.add_summary(summary, action_count)
    action_count += 1

    #Takes action
    S1, reward, done, info = env.step(action)
    skip_reward += reward
    skip_count += 1
    # load() overwrites the tracker's value without adding a new assign op to
    # the graph; building R.assign(...) every step grows the graph without bound.
    R.load(reward, sess)

    #Updates weights
    # Runs on every step; a random-skip variant,
    # `if(np.random.randint(10)%5==0):`, was tried here earlier.
    update_weights(S , action, skip_reward, S1 , done, skip_count)
    skip_reward = 0
    skip_count = 0

    S = S1
    total_reward += reward

    if done:

        gc.collect()
        print(str(episode_count)+' --total reward ='+ str(total_reward))
        print(str(episode_count)+' --average reward ='+ str(average_reward))
        total_reward_list.append(total_reward)
        total_reward = 0

        #Resets episode
        S = env.reset()
        episode_count += 1

        # Checkpoint every 10 episodes when saving is enabled.
        if episode_count%10 == 0 and save:
            print('saving models')
            print('average reward '+ str(average_reward))

            policy_model_mean.save('policy_mean.h5')
            policy_model_std.save('policy_std.h5')
            value_model.save('value.h5')
            # The context manager closes the history file after each dump.
            with open('totalz_hist','wb') as history_file:
                pickle.dump(total_reward_list, history_file)

            # Disabled experiment - restarting the Keras session after a save:
            #   K.clear_session()
            #   sess = K.get_session()
            #   policy_model_mean = load_model('policy_mean.h5')
            #   policy_model_std = load_model('policy_std.h5')
            #   value_model = load_model('value.h5')

        #Resets eligibility traces
        sess.run([policy_eligibility_traces_mean_reset,policy_eligibility_traces_std_reset,value_eligibility_traces_reset])
       

0.000511149 ===== 0.000504193 ==== -0.000334515088331
0.000500695 ===== 0.000513432 ==== -0.000314132384291
0.000510146 ===== 0.00062529 ==== -0.00021117556882
0.00062308 ===== 0.000648572 ==== -0.000301812302636
0.000645432 ===== 0.000714968 ==== -0.000257717175486
0.000712271 ===== 0.000742662 ==== -0.000297177064658
0.000739553 ===== 0.00082582 ==== -0.000240192371992
0.00082331 ===== 0.000808136 ==== -0.000341736411198
0.00080457 ===== 0.000916214 ==== -0.000215034655642
0.000913971 ===== 0.000939238 ==== -0.000301263152289
0.000936076 ===== 0.242206 ==== -1.1345637387
-0.0800078 ===== -0.0391091 ==== -0.157661149425
-0.0610922 ===== -0.0343816 ==== 0.0226755556953
-0.032843 ===== -0.0252555 ==== 0.0316362230676
-0.0239016 ===== -0.0242112 ==== 0.0246760135365
-0.023213 ===== -0.0251616 ==== 0.0205092774106
-0.0242209 ===== -0.0263062 ==== 0.0196812094075
-0.025298 ===== -0.0257095 ==== 0.0243215027871
-0.024381 ===== -0.025938 ==== 0.0296161653008
-0.0243169 ===== -0.0240848 ==== 

0.76251 ===== 0.369479 ==== -0.121317499824
0.360556 ===== 0.20946 ==== 0.161165800153
0.212947 ===== 0.494326 ==== 0.597174187145
0.546079 ===== 0.371697 ==== 0.165098477242
0.381175 ===== 0.476945 ==== 0.438380873721
0.511533 ===== 0.447588 ==== 0.296504236657
0.468208 ===== 0.457618 ==== 0.360006353838
0.482701 ===== 0.583585 ==== 0.484543267324
0.630465 ===== 0.532041 ==== 0.314504758501
5 --total reward =117.217232152
5 --average reward =5.00156636049
0.185476 ===== 0.194675 ==== -0.00766936411243
0.194565 ===== 0.197825 ==== -0.0136325185674
0.19763 ===== 0.195536 ==== -0.0188508187724
0.195267 ===== 0.196175 ==== -0.0162095411549
0.195906 ===== 0.193069 ==== -0.0199471134964
0.192781 ===== 0.197591 ==== -0.0118268259885
0.197422 ===== 0.199826 ==== -0.0142521925807
0.199622 ===== 0.194405 ==== -0.0222033109088
0.194069 ===== 0.196349 ==== -0.0148275137278
0.196109 ===== 4.46573 ==== 3.05365337366
7.52008 ===== 3.69897 ==== -3.9800785925
1.8794 ===== 0.731328 ==== -1.00816926911
