# Reinforcement Learning
## Importing Libraries

In [19]:
import gym
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
#from keras.constraints import MaxNorm
import numpy as np
import tensorflow as tf
import gc
import pickle
from keras.models import load_model

### Checking GPU availability

In [20]:
# Keras backend handle: used here for the session and, in the training loop,
# for K.clear_session().
from keras import backend as K
# Print the GPU devices the TensorFlow backend reports.
# NOTE: _get_available_gpus is a private (underscore-prefixed) Keras helper.
print(K.tensorflow_backend._get_available_gpus())
# Session shared by the later cells for sess.run(...) and tensor.eval(session=sess).
sess= K.get_session()

['/gpu:0']


## Initializing Environment

In [21]:
# Create the environment. The initial observation (So) and one sampled action (A)
# are used by the model cells below to size the network input (So.shape) and the
# policy output (A.shape[0]).
env = gym.make('Humanoid-v2')
So = env.reset()
A = env.action_space.sample()

## Setting learning hyperparameters 

In [22]:
#Policy
# policy_alpha scales the policy weight increment (alpha * delta * trace) in update_weights.
policy_alpha = 0.01
# policy_lambda multiplies the previous policy eligibility traces before the new gradient is added.
policy_lambda =  0.9
#Value
# value_alpha scales the value weight increment (alpha * delta * trace) in update_weights.
value_alpha = 0.01
# value_lambda multiplies the previous value eligibility traces before the new gradient is added.
value_lambda = 0.9

## Creating the model for policy mean

In [23]:
# Policy-mean network: takes an observation (shape So.shape) and emits one linear
# output per action dimension (A.shape[0]). Four 100-unit ReLU layers, with 50%
# dropout after each of the first three.
policy_model_mean = Sequential()
policy_model_mean.add(Dense(100, input_shape=So.shape, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dropout(0.5))
policy_model_mean.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dropout(0.5))
policy_model_mean.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dropout(0.5))
policy_model_mean.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
policy_model_mean.add(Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'))

# The model is compiled so that model.optimizer exists; update_weights uses
# optimizer.get_gradients(...) and applies the weight changes itself.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_mean.compile(loss="mse", optimizer=adam)

print("Policy Mean network")
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
policy_model_mean.summary()

Policy Mean network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 100)               37700     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 100)               1

## Creating the model for policy std

In [24]:
# Policy-spread network: same architecture as the policy-mean network. Its raw
# linear outputs (one per action dimension) are squared by select_action before
# being placed on the covariance diagonal.
policy_model_std = Sequential()
policy_model_std.add(Dense(100, input_shape=So.shape, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dropout(0.5))
policy_model_std.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dropout(0.5))
policy_model_std.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dropout(0.5))
policy_model_std.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
policy_model_std.add(Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'))

# The model is compiled so that model.optimizer exists; update_weights uses
# optimizer.get_gradients(...) and applies the weight changes itself.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_std.compile(loss="mse", optimizer=adam)

print("Policy std network")
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
policy_model_std.summary()

Policy std network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 100)               37700     
_________________________________________________________________
dropout_13 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_14 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_15 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 100)               10

## Creating model for values 

In [25]:
# State-value network: takes an observation (shape So.shape) and emits a single
# linear output. Four 100-unit ReLU layers, with 50% dropout after each of the
# first three.
value_model = Sequential()
value_model.add(Dense(100, input_shape=So.shape, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dropout(0.5))
value_model.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dropout(0.5))
value_model.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dropout(0.5))
value_model.add(Dense(100, kernel_initializer='random_uniform', activation='relu'))
value_model.add(Dense(1, kernel_initializer='random_uniform', activation='linear'))

print("Value Model")
# The model is compiled so that model.optimizer exists; update_weights uses
# optimizer.get_gradients(...) and applies the weight changes itself.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
value_model.compile(loss="mse", optimizer=adam)
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
value_model.summary()

Value Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 100)               37700     
_________________________________________________________________
dropout_16 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_17 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_18 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 100)               10100    

## Eligibility Traces for the models

In [26]:
# Run TensorFlow's global variable initializer in the shared session before the
# weight shapes are read below.
sess.run(tf.global_variables_initializer())


def _zero_traces(model):
    """Return one all-zero tensor per trainable weight of `model`, matching each weight's shape."""
    traces = []
    for weight in model.trainable_weights:
        # The shape is taken from the evaluated weight array.
        weight_shape = weight.eval(session=sess).shape
        traces.append(tf.zeros(shape=weight_shape))
    return traces


policy_eligibility_traces_mean = _zero_traces(policy_model_mean)
policy_eligibility_traces_std = _zero_traces(policy_model_std)
value_eligibility_traces = _zero_traces(value_model)

## Defining the remaining reinforcement-learning hyperparameters

In [27]:
# Running average-reward estimate; update_weights moves it by etaa * delta_rewards on every call.
# The commented-out pickle.load is a disabled alternative that would read it from a file.
average_reward = 5 #pickle.load(open('average_reward.ferch','rb'))
# Running blend of rewards; update_weights replaces it with (cumulative_reward + reward) / 2.
cumulative_reward=0
# Running blend of |delta_rewards| maintained by update_weights; no active code in the
# cells shown here reads it.
average_reward_variance = 0 
# Step size used for the average_reward update in update_weights.
etaa = 0.01

In [28]:
# Set to True to resume from the checkpoints written by the training loop
# ('policy_mean.h5', 'policy_std.h5', 'value.h5') instead of using the freshly
# built networks. Defaults to False, i.e. the checkpoints are not loaded.
LOAD_SAVED_MODELS = False

if LOAD_SAVED_MODELS:
    policy_model_mean = load_model('policy_mean.h5')
    policy_model_std = load_model('policy_std.h5')
    value_model = load_model('value.h5')

## Auxiliary functions for weight updates


In [29]:
def set_weights(model, weights):
    """Write a flat list of weight arrays back into one of the three global networks.

    Parameters
    ----------
    model : str
        'value', 'policy_mean' or 'policy_std'; selects value_model,
        policy_model_mean or policy_model_std respectively.
    weights : sequence
        Weight arrays in layer order, i.e. each layer's weights one after the
        other (the same layout the get_*_weights helpers produce).

    Returns
    -------
    None. Any other `model` string prints 'error' and leaves every network untouched.
    """
    # Resolve the target lazily so only the requested global is looked up.
    if model == 'value':
        target = value_model
    elif model == 'policy_mean':
        target = policy_model_mean
    elif model == 'policy_std':
        target = policy_model_std
    else:
        print('error')
        return

    cursor = 0
    for layer in target.layers:
        # Consume exactly as many arrays as the layer owns. Layers without
        # weights (e.g. Dropout) are skipped whatever their name is.
        count = len(layer.weights)
        if count == 0:
            continue
        layer.set_weights(list(weights[cursor:cursor + count]))
        cursor += count

def get_value_weights():
    """Return every entry of each value_model layer's `.weights`, flattened into one list in layer order."""
    return [entry for layer in value_model.layers for entry in layer.weights]

def get_policy_weights_mean():
    """Return every entry of each policy_model_mean layer's `.weights`, flattened into one list in layer order."""
    return [entry for layer in policy_model_mean.layers for entry in layer.weights]

def get_policy_weights_std():
    """Return every entry of each policy_model_std layer's `.weights`, flattened into one list in layer order."""
    return [entry for layer in policy_model_std.layers for entry in layer.weights]

In [30]:
def check_action(action):
    """Report whether any component of `action` lies outside the environment's action bounds.

    Components are compared index by index against env.action_space.high and
    env.action_space.low. Returns True as soon as one component is strictly
    above its upper bound or strictly below its lower bound, otherwise False.
    """
    lower_bound = env.action_space.low
    upper_bound = env.action_space.high

    return any(
        action[k] > upper_bound[k] or action[k] < lower_bound[k]
        for k in range(action.shape[0])
    )

## Update weights

In [31]:
# Cached flat lists of each network's layer weights. update_weights declares these
# names global and uses them as the base of its weight-update ops, re-reading them
# only on some calls.
value_weights = get_value_weights()
policy_weights_mean = get_policy_weights_mean()
policy_weights_std = get_policy_weights_std()

In [32]:
def update_weights(previous_state, action , reward, state,  terminal, skip_count):
    """Apply one learning step to the value network and both policy networks.

    Computes a differential TD error from the transition, folds the current
    gradients into the per-network eligibility traces, and (on a random subset
    of calls) writes `weights + alpha * delta * trace` back into each network.

    Parameters
    ----------
    previous_state : observation the action was taken from; fed to all three networks.
    action : action that was taken; used as a constant in the policy expressions.
    reward : reward for the transition (the caller passes its accumulated skip_reward).
    state : observation reached after the action; only used for the value estimate.
    terminal : when truthy, the value of `state` is replaced by 0 in the TD error.
    skip_count : multiplies average_reward inside the TD error.

    Returns
    -------
    None. Works through module-level state: average_reward, cumulative_reward,
    average_reward_variance, the three eligibility-trace lists, the three cached
    weight lists, and the networks' weights.

    Notes
    -----
    Every call constructs new TensorFlow ops (tf.add, tf.multiply, tf.constant,
    get_gradients, ...) rather than reusing previously built ones.
    NOTE(review): presumably this is why the training loop periodically clears
    the Keras session - confirm.
    """
    
    #Compute rewards
    # rewardz_received and delta_rewardz_received are declared global but never used in this function.
    global average_reward,rewardz_received,delta_rewardz_received, cumulative_reward , average_reward_variance
    # TD error: reward - average_reward * skip_count + V(state) - V(previous_state).
    delta_rewards = reward - average_reward*skip_count + value_model.predict(np.array([state]))[0][0] - value_model.predict(np.array([previous_state]))[0][0]    
    cumulative_reward = (cumulative_reward + reward)/2
    # Debug trace: V(previous_state), V(state) and the TD error. It is printed before the
    # terminal correction below, so on terminal steps the printed error is the uncorrected one.
    print(value_model.predict(np.array([previous_state]))[0][0] ,"=====",value_model.predict(np.array([state]))[0][0],"====",delta_rewards)
    if(terminal):
        # Terminal transition: the value of the next state is taken as 0.
        delta_rewards = reward - average_reward*skip_count + 0 - value_model.predict(np.array([previous_state]))[0][0]
    # Clamp the TD error to +/- thresh.
    # NOTE(review): if thresh is negative both tests below can fire in sequence - confirm intended.
    thresh = average_reward + cumulative_reward
    if(delta_rewards>=thresh):
        delta_rewards =thresh
    if(delta_rewards<=-thresh):
        delta_rewards=-thresh
    average_reward = average_reward + etaa*delta_rewards
    average_reward_variance = (np.abs(delta_rewards) + average_reward_variance)/2
    
    #Compute value updates (eligibility traces and weights)
    global value_eligibility_traces, value_weights
    # Gradient of the value network's output (not of a loss) w.r.t. its trainable weights,
    # each gradient clipped to norm 1.
    value_gradients = value_model.optimizer.get_gradients(value_model.output, value_model.trainable_weights)
    value_gradients = [tf.clip_by_norm(gradient, 1) for gradient in value_gradients]
    # trace <- value_lambda * trace + gradient, evaluated at previous_state and stored back as tensors.
    value_eligibility_traces_op = [tf.add(tf.multiply(value_eligibility_traces[i], tf.constant(value_lambda,dtype= tf.float32)),value_gradients[i] ) for i in range(len(value_eligibility_traces))] 
    value_eligibility_traces = [tf.convert_to_tensor(x) for x in sess.run(value_eligibility_traces_op,feed_dict={value_model.input:np.array([previous_state])})]       
    # randint(10) % 5 == 0 holds for the draws 0 and 5, i.e. 2 of the 10 possible values.
    # The cached weight list is therefore re-read only on some calls.
    if(np.random.randint(10)%5==0):
        value_weights = get_value_weights()
    # weights + value_alpha * delta * trace, built from the (possibly not re-read) cached list.
    value_weights_op =  [tf.add(value_weights[i], tf.multiply(tf.constant(value_alpha*delta_rewards,dtype=tf.float32), value_eligibility_traces[i])) for i in range(len(value_weights))]
    # An independent second draw decides whether the new weights are actually written to the network.
    if(np.random.randint(10)%5==0):
        set_weights('value', sess.run(value_weights_op))
    
    #Compute policy updates (eligibility traces and weights)
    
    global policy_eligibility_traces_mean, policy_eligibility_traces_std, policy_weights_mean, policy_weights_std
    policy_mean = policy_model_mean.predict(np.array([previous_state]))[0]
    policy_std = policy_model_std.predict(np.array([previous_state]))[0]
    # From here on policy_std holds the square of the network's raw output.
    policy_std= policy_std*policy_std
    
    ##################################################  MEAN ##################################################################
    # (action - mean_output) / policy_std**2. policy_std was already squared above, so the
    # divisor is the 4th power of the raw network output.
    # NOTE(review): select_action uses the squared output as the variance, so dividing by its
    # square again looks unintended - confirm.
    # NOTE(review): get_gradients differentiates this expression w.r.t. the weights; the
    # expression itself already has the form of d(log-prob)/d(mean), so the result is not the
    # usual (action - mean) / var * d(mean)/d(weights) score gradient - confirm intent.
    loss_mean = tf.divide(tf.subtract(tf.constant(action, dtype=tf.float32), policy_model_mean.output), policy_std*policy_std)
    policy_gradients_mean = policy_model_mean.optimizer.get_gradients(loss_mean, policy_model_mean.trainable_weights)
    policy_gradients_mean = [tf.clip_by_norm(gradient, 1) for gradient in policy_gradients_mean]

    # trace <- policy_lambda * trace + gradient, evaluated at previous_state.
    policy_eligibility_traces_op_mean = [tf.add(tf.multiply(tf.constant(policy_lambda,dtype=tf.float32), policy_eligibility_traces_mean[i]), policy_gradients_mean[i]) for i in range(len(policy_gradients_mean))]
    policy_eligibility_traces_mean = [tf.convert_to_tensor(x) for x in sess.run(policy_eligibility_traces_op_mean, feed_dict={policy_model_mean.input:np.array([previous_state])})]
    # Same 2-in-10 random gating as for the value network: re-read, then (independently) write.
    if(np.random.randint(10)%5==0):
        policy_weights_mean = get_policy_weights_mean()
    policy_weights_op_mean =  [tf.add(policy_weights_mean[i], tf.multiply( policy_eligibility_traces_mean[i],tf.constant(policy_alpha*delta_rewards,dtype=tf.float32))) for i in range(len(policy_weights_mean))]
    if(np.random.randint(10)%5==0):
        set_weights('policy_mean', sess.run(policy_weights_op_mean))    
    
    ##################################################   STD ###################################################################
    # 2 * ((action - mean)**2 / out**3 - 1 / out), where `out` is the raw output of
    # policy_model_std and `mean` is the numeric prediction computed above.
    # NOTE(review): here the raw output plays the role of the scale, whereas select_action
    # uses its square as the variance; as with the mean branch, this expression is then
    # differentiated again by get_gradients - confirm intent.
    loss_std = 2* tf.subtract(tf.divide(tf.square(tf.subtract(tf.constant(action, dtype=tf.float32),tf.constant(policy_mean, dtype= tf.float32))), tf.pow(policy_model_std.output, 3)), tf.divide(1, policy_model_std.output))
    policy_gradients_std = policy_model_std.optimizer.get_gradients(loss_std, policy_model_std.trainable_weights)
    policy_gradients_std = [tf.clip_by_norm(gradient, 1) for gradient in policy_gradients_std]

    # trace <- policy_lambda * trace + gradient, evaluated at previous_state.
    policy_eligibility_traces_op_std = [tf.add(tf.multiply(tf.constant(policy_lambda,dtype=tf.float32), policy_eligibility_traces_std[i]), policy_gradients_std[i]) for i in range(len(policy_gradients_std))]
    policy_eligibility_traces_std = [tf.convert_to_tensor(x) for x in sess.run(policy_eligibility_traces_op_std, feed_dict={policy_model_std.input:np.array([previous_state])})]
    if(np.random.randint(10)%5==0):
        policy_weights_std = get_policy_weights_std()
    policy_weights_op_std =  [tf.add(policy_weights_std[i], tf.multiply( policy_eligibility_traces_std[i],tf.constant(policy_alpha*delta_rewards,dtype=tf.float32))) for i in range(len(policy_weights_std))]
    if(np.random.randint(10)%5==0):
        set_weights('policy_std', sess.run(policy_weights_op_std))    
    
    

## Sample action 

In [33]:
def select_action(state):
    """Sample an action from a Gaussian parameterised by the two policy networks.

    The mean comes from policy_model_mean. The element-wise square of
    policy_model_std's output is placed on the diagonal of an otherwise-zero
    covariance matrix. One draw from np.random.multivariate_normal is returned.
    """
    batch = np.array([state])
    mean = policy_model_mean.predict(batch)[0]
    raw_scale = policy_model_std.predict(batch)[0]
    diagonal = raw_scale * raw_scale

    # Float64 square matrix sized by the mean; only the diagonal is populated.
    size = mean.shape[0]
    covariance = np.zeros([size, size])
    positions = np.arange(diagonal.shape[0])
    covariance[positions, positions] = diagonal

    return np.random.multivariate_normal(mean, covariance)


In [34]:
# Disabled scratch code kept as a bare string literal: nothing inside it executes, the
# cell merely evaluates to this string. It holds an unused bound-fixing snippet and a
# random-sampling fallback keyed on average_reward_variance.
'''      
    #FIXING ACTION INCONSISTENCIES
    low = env.action_space.low
    high = env.action_space.high
    
    for i in range(action.shape[0]):
        if(action[i]>high[i]):
            action[i]= high[i]+0.1
        if(action[i]<low[i]):
            action[i]=low[i]+0.1
            \    high = env.action_space.high
    global average_reward_variance
    print(average_reward_variance)
    if(np.abs(average_reward_variance)<0.1):
        print("Sampling randomly")
        return env.action_space.sample()
''' 

'      \n    #FIXING ACTION INCONSISTENCIES\n    low = env.action_space.low\n    high = env.action_space.high\n    \n    for i in range(action.shape[0]):\n        if(action[i]>high[i]):\n            action[i]= high[i]+0.1\n        if(action[i]<low[i]):\n            action[i]=low[i]+0.1\n            \\    high = env.action_space.high\n    global average_reward_variance\n    print(average_reward_variance)\n    if(np.abs(average_reward_variance)<0.1):\n        print("Sampling randomly")\n        return env.action_space.sample()\n'

# Run Episodes
## Setting up variables :D

In [None]:
# Current observation; the training loop replaces it after every env.step and env.reset.
S = env.reset()
#action_count=0
# Number of finished episodes; also drives the every-third-episode checkpoint.
episode_count=0
# When True, the training loop saves the models and the reward history on checkpoint episodes.
save= True
# Sum of rewards in the current episode; reset to 0 when the episode ends.
total_reward =0 
# One entry per finished episode: that episode's total_reward.
total_reward_list=[]
# Number of steps and summed reward since the last update_weights call; both are passed
# to update_weights and then reset by the training loop.
skip_count = 0
skip_reward=0

## Main loop 

In [None]:
# Training loop: act, learn from every transition, and checkpoint every third episode.
# NOTE: there is no stopping condition; interrupt the kernel to end training.

# The action bounds do not change between steps, so they are read once.
high = env.action_space.high

while True:

    # Uncomment to render the environment.
    #env.render()

    # Selects action according to the stochastic policy.
    # NOTE(review): the policy is queried with S % high[0], whereas update_weights
    # below receives the raw S - confirm this wrapping of the observation is intended.
    action = select_action(S % high[0])

    # Takes action
    S1, reward, done, info = env.step(action)

    skip_reward += reward
    skip_count += 1

    # Updates weights on every step, then restarts the skip accumulators.
    update_weights(S, action, skip_reward, S1, done, skip_count)
    skip_reward = 0
    skip_count = 0

    S = S1
    total_reward += reward

    if done:
        gc.collect()
        print(str(episode_count)+' --total reward ='+ str(total_reward))
        print(str(episode_count)+' --average reward ='+ str(average_reward))
        total_reward_list.append(total_reward)
        total_reward = 0

        # Resets episode
        S = env.reset()
        episode_count += 1

        if episode_count % 3 == 0 and save:
            print('saving models')
            print('average reward '+ str(average_reward))

            policy_model_mean.save('policy_mean.h5')
            policy_model_std.save('policy_std.h5')
            value_model.save('value.h5')
            # Context manager so the history file is flushed and closed right away.
            with open('totalz_hist', 'wb') as history_file:
                pickle.dump(total_reward_list, history_file)

            # Restarting keras session: drop the current graph and rebuild the
            # three networks from the checkpoints just written.
            K.clear_session()
            sess = K.get_session()
            policy_model_mean = load_model('policy_mean.h5')
            policy_model_std = load_model('policy_std.h5')
            value_model = load_model('value.h5')

            # The cached weight lists still reference variables of the cleared
            # graph, and update_weights re-reads them only on a random subset of
            # calls. Refresh them here so the next update uses the reloaded networks.
            value_weights = get_value_weights()
            policy_weights_mean = get_policy_weights_mean()
            policy_weights_std = get_policy_weights_std()

        # Resets eligibility traces at every episode boundary. This runs after any
        # reload above, so the traces are created alongside the current networks.
        # (No `global` statement is needed: this code already runs at module level.)
        policy_eligibility_traces_mean = [tf.zeros(shape=tensor.eval(session=sess).shape) for tensor in policy_model_mean.trainable_weights]
        policy_eligibility_traces_std = [tf.zeros(shape=tensor.eval(session=sess).shape) for tensor in policy_model_std.trainable_weights]
        value_eligibility_traces = [tf.zeros(shape=tensor.eval(session=sess).shape) for tensor in value_model.trainable_weights]
    

-0.000290107 ===== -0.000277261 ==== 0.000192143399133
-0.000277261 ===== -0.000265659 ==== 0.000188979078285
-0.000265659 ===== -0.000285316 ==== 0.000155818087766
-0.000285316 ===== -0.000266248 ==== 0.000192959990418
-0.000266248 ===== -0.000245468 ==== 0.000192738902758
-0.000245468 ===== -0.000239632 ==== 0.000175903462929
-0.000231008 ===== -0.000208118 ==== 0.000191152755753
-0.00019768 ===== -0.000184458 ==== 0.000179538612227
-0.000184458 ===== -0.000169192 ==== 0.000179915029879
-0.000169192 ===== -0.00967943 ==== -4.15767305484
-0.00967943 ===== -0.00343698 ==== -0.667514656582
-0.00343698 ===== -0.000716749 ==== -0.0385658435435
-0.000716749 ===== -4.6259e-05 ==== 0.0616351683332
-4.6259e-05 ===== 0.000200709 ==== 0.07536691406
0.000200709 ===== 0.000519277 ==== 0.0746515184426
0.000519277 ===== 0.000857505 ==== 0.0723688385408
0.0104467 ===== 0.0110872 ==== 0.0725947969221
0.0110872 ===== 0.0115869 ==== 0.0756849875813
