# Reinforcement Learning
## Importing Libraries

In [2]:
import gym
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
#from keras.constraints import MaxNorm
import numpy as np
import tensorflow as tf
import gc
import pickle
from keras.models import load_model

### Checking GPU availability

In [1]:
from keras import backend as K

# Ask the TensorFlow backend which GPU devices it can see and show the list.
visible_gpus = K.tensorflow_backend._get_available_gpus()
print(visible_gpus)

# Keep a handle on the backend session; later cells run TensorFlow ops through it.
sess = K.get_session()

Using TensorFlow backend.


['/gpu:0']


## Initializing Environment

In [3]:
# Identifier of the gym environment the agent is trained on.
ENV_ID = 'Humanoid-v2'
env = gym.make(ENV_ID)

# One initial observation and one random action. The model cells below read
# `So.shape` and `A.shape[0]` to size the network inputs and outputs.
So = env.reset()
A = env.action_space.sample()

## Setting learning hyperparameters 

In [4]:
# Policy networks: `policy_alpha` scales the weight step and `policy_lambda`
# decays the eligibility traces in update_weights.
policy_alpha, policy_lambda = 0.1, 0.9

# Value network: same roles (step scale, trace decay) for the state-value model.
value_alpha, value_lambda = 0.1, 0.9

## Creating the model for policy mean

In [7]:
# Policy-mean network: maps an observation (shape of `So`) to one linear output
# per action component (`A.shape[0]`).
policy_model_mean = Sequential()
policy_model_mean.add(Dense(100, input_shape=So.shape, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dense(30, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'))

# Compiling attaches the optimizer; update_weights calls its `get_gradients`.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_mean.compile(loss="mse", optimizer=adam)
print("Policy Mean network")
# `summary()` prints the table itself and returns None, so it must not be wrapped
# in print() (that is what emitted the stray "None" line after the table).
policy_model_mean.summary()

Policy Mean network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 100)               37700     
_________________________________________________________________
dense_8 (Dense)              (None, 30)                3030      
_________________________________________________________________
dense_9 (Dense)              (None, 17)                527       
Total params: 41,257
Trainable params: 41,257
Non-trainable params: 0
_________________________________________________________________
None


## Creating the model for policy std

In [8]:
# Policy-std network: maps an observation (shape of `So`) to one output per
# action component; the exponential activation keeps every output positive.
policy_model_std = Sequential()
policy_model_std.add(Dense(100, input_shape=So.shape, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dense(30, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dense(A.shape[0], kernel_initializer='random_uniform', activation='exponential'))

# Compiling attaches the optimizer; update_weights calls its `get_gradients`.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_std.compile(loss="mse", optimizer=adam)
print("Policy std network")
# `summary()` prints the table itself and returns None, so it must not be wrapped
# in print() (that is what emitted the stray "None" line after the table).
policy_model_std.summary()

Policy std network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 100)               37700     
_________________________________________________________________
dense_11 (Dense)             (None, 30)                3030      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 31        
Total params: 40,761
Trainable params: 40,761
Non-trainable params: 0
_________________________________________________________________
None


## Creating the model for state values

In [10]:
# State-value network: maps an observation (shape of `So`) to a single linear output.
value_model = Sequential()
value_model.add(Dense(100, input_shape=So.shape, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dense(30, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dense(1, kernel_initializer='random_uniform', activation='linear'))

print("Value Model")
# Compiling attaches the optimizer; update_weights calls its `get_gradients`.
value_model.compile(loss="mse", optimizer="sgd")
# `summary()` prints the table itself and returns None, so it must not be wrapped
# in print() (that is what emitted the stray "None" line after the table).
value_model.summary()

Value Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 100)               37700     
_________________________________________________________________
dense_17 (Dense)             (None, 30)                3030      
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 31        
Total params: 40,761
Trainable params: 40,761
Non-trainable params: 0
_________________________________________________________________
None


## Eligibility Traces for the models

In [11]:
# Initialise every TensorFlow variable in the session before the weights are read.
sess.run(tf.global_variables_initializer())

# One all-zero trace per trainable weight, shaped like that weight's current value.
policy_eligibility_traces_mean = [
    tf.zeros(shape=sess.run(weight).shape)
    for weight in policy_model_mean.trainable_weights
]
policy_eligibility_traces_std = [
    tf.zeros(shape=sess.run(weight).shape)
    for weight in policy_model_std.trainable_weights
]
value_eligibility_traces = [
    tf.zeros(shape=sess.run(weight).shape)
    for weight in value_model.trainable_weights
]

## Defining the remaining reinforcement-learning hyperparameters

In [12]:
# Step size used by update_weights when it moves `average_reward` by etaa * delta.
etaa = 0.01

# Starting value of the running average reward; update_weights mutates it.
# A previously saved value can be restored instead with:
#   pickle.load(open('average_reward.ferch','rb'))
average_reward = 5

## Auxiliary functions for weights update


In [13]:
def set_weights(model, weights):
    """Write a flat list of weight arrays back into one of the three networks.

    Args:
        model: Which network to update: 'value', 'policy_mean' or 'policy_std'.
            Any other value prints 'error' and changes nothing.
        weights: Flat sequence laid out as two consecutive entries per layer,
            in layer order; layers whose name contains 'dropout' consume none.
    """
    if model == 'value':
        target = value_model
    elif model == 'policy_mean':
        target = policy_model_mean
    elif model == 'policy_std':
        target = policy_model_std
    else:
        print('error')
        return

    cursor = 0
    for layer in target.layers:
        if 'dropout' in layer.name:
            # Skipped layers take no entries from `weights`.
            continue
        layer.set_weights([weights[cursor], weights[cursor + 1]])
        cursor += 2

def get_value_weights():
    """Return the weight tensors of every `value_model` layer as one flat list, in layer order."""
    return [tensor for layer in value_model.layers for tensor in layer.weights]

def get_policy_weights_mean():
    """Return the weight tensors of every `policy_model_mean` layer as one flat list, in layer order."""
    return [tensor for layer in policy_model_mean.layers for tensor in layer.weights]

def get_policy_weights_std():
    """Return the weight tensors of every `policy_model_std` layer as one flat list, in layer order."""
    return [tensor for layer in policy_model_std.layers for tensor in layer.weights]

## Update weights

In [None]:
def _eligibility_trace_step(model, model_key, objective, traces, trace_decay,
                            weights, step_size, state_batch):
    """Advance one network's eligibility traces and apply the resulting weight step.

    Computes ``new_trace = trace_decay * trace + clip(grad(objective))`` for every
    trainable weight of ``model`` (gradients evaluated with ``state_batch`` fed as
    the model input), then writes ``weight + step_size * new_trace`` back through
    ``set_weights(model_key, ...)``.

    Args:
        model: Compiled Keras model whose optimizer supplies ``get_gradients``.
        model_key: Name understood by ``set_weights`` ('value', 'policy_mean',
            'policy_std').
        objective: Tensor built on ``model.output`` that is differentiated.
        traces: Current traces, one per trainable weight.
        trace_decay: Scalar multiplier applied to the old traces.
        weights: Weight tensors of ``model`` in the order ``set_weights`` expects.
        step_size: Scalar multiplier applied to the new traces.
        state_batch: Array fed as ``model.input``.

    Returns:
        The new traces as a list of tensors.
    """
    gradients = model.optimizer.get_gradients(objective, model.trainable_weights)
    gradients = [tf.clip_by_norm(gradient, 10) for gradient in gradients]

    decay = tf.constant(trace_decay, dtype=tf.float32)
    trace_ops = [tf.add(tf.multiply(decay, trace), gradient)
                 for trace, gradient in zip(traces, gradients)]
    # Evaluate to concrete values, then re-wrap as tensors so the next call can
    # combine them with fresh gradients.
    new_traces = [tf.convert_to_tensor(value)
                  for value in sess.run(trace_ops, feed_dict={model.input: state_batch})]

    scale = tf.constant(step_size, dtype=tf.float32)
    weight_ops = [tf.add(weight, tf.multiply(scale, trace))
                  for weight, trace in zip(weights, new_traces)]
    set_weights(model_key, sess.run(weight_ops))
    return new_traces


def update_weights(previous_state, action, reward, state, terminal, skip_count):
    """Run one actor-critic update (with eligibility traces) for a single transition.

    Steps:
      1. TD error: ``reward - average_reward*skip_count + V(state) - V(previous_state)``,
         with ``V(state)`` taken as 0 when ``terminal``. The raw reward and this
         (unclipped) error are appended to ``rewardz_received`` and
         ``delta_rewardz_received``, and the error is printed.
      2. The error is clipped to ``+/- 5*|average_reward|`` and ``average_reward``
         moves by ``etaa * error``.
      3. Value, policy-mean and policy-std networks each get a trace update and a
         weight step proportional to ``alpha * error``.

    Args:
        previous_state: Observation the action was taken in.
        action: Action that was executed (one entry per action component).
        reward: Reward received for the transition.
        state: Observation reached after the action.
        terminal: Truthy when ``state`` ended the episode.
        skip_count: Multiplier on ``average_reward`` in the TD error; presumably
            the number of environment steps the reward spans (confirm with caller).

    Returns:
        None. Mutates the global traces, ``average_reward``, the two log lists and
        the weights of all three networks.
    """
    global average_reward, rewardz_received, delta_rewardz_received
    global value_eligibility_traces
    # Both policy trace lists are reassigned below, so both must be declared global
    # (otherwise the std traces would be an unbound local and never persist).
    global policy_eligibility_traces_mean, policy_eligibility_traces_std

    # NOTE(review): every call adds new ops to the TensorFlow graph (gradients,
    # constants, adds), so the graph grows with the number of updates. Building
    # these ops once with placeholders would avoid that; left as-is here.

    previous_state_batch = np.array([previous_state])

    # ---- TD error ---------------------------------------------------------
    previous_value = value_model.predict(previous_state_batch)[0][0]
    # A terminal state has no successor value, so it is not evaluated at all.
    next_value = 0 if terminal else value_model.predict(np.array([state]))[0][0]
    delta_rewards = reward - average_reward * skip_count + next_value - previous_value
    rewardz_received.append(reward)
    delta_rewardz_received.append(delta_rewards)
    print(delta_rewards)

    # abs() keeps the clip window valid even if the running average is negative.
    thresh = abs(average_reward) * 5
    delta_rewards = min(max(delta_rewards, -thresh), thresh)
    average_reward = average_reward + etaa * delta_rewards

    # ---- Value network ----------------------------------------------------
    value_eligibility_traces = _eligibility_trace_step(
        value_model, 'value', value_model.output,
        value_eligibility_traces, value_lambda,
        get_value_weights(), value_alpha * delta_rewards, previous_state_batch)

    # ---- Policy networks --------------------------------------------------
    # Both predictions are taken before either policy network is modified.
    policy_mean = policy_model_mean.predict(previous_state_batch)[0]
    policy_std = policy_model_std.predict(previous_state_batch)[0]
    action_error = np.asarray(action, dtype=np.float32) - policy_mean

    # For a Gaussian policy, d ln(pi)/d mean = (a - mean) / std^2. Multiplying
    # that constant score by the network output and differentiating gives
    # score * d(mean)/d(weights), i.e. the gradient of ln(pi) w.r.t. the weights.
    score_mean = action_error / (policy_std * policy_std)
    objective_mean = tf.multiply(tf.constant(score_mean, dtype=tf.float32),
                                 policy_model_mean.output)
    policy_eligibility_traces_mean = _eligibility_trace_step(
        policy_model_mean, 'policy_mean', objective_mean,
        policy_eligibility_traces_mean, policy_lambda,
        get_policy_weights_mean(), policy_alpha * delta_rewards, previous_state_batch)

    # Same construction for the std head: d ln(pi)/d std = (a - mean)^2/std^3 - 1/std.
    score_std = action_error ** 2 / policy_std ** 3 - 1 / policy_std
    objective_std = tf.multiply(tf.constant(score_std, dtype=tf.float32),
                                policy_model_std.output)
    policy_eligibility_traces_std = _eligibility_trace_step(
        policy_model_std, 'policy_std', objective_std,
        policy_eligibility_traces_std, policy_lambda,
        get_policy_weights_std(), policy_alpha * delta_rewards, previous_state_batch)
    
    