# Reinforcement Learning
## Importing Libraries

In [1]:
import gc
import os
import pickle
from time import time

import gym
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
#from keras.constraints import MaxNorm

Using TensorFlow backend.


### Checking GPU availability

In [2]:
# Keras backend handle; used here to list GPUs and to obtain the session object.
from keras import backend as K
# _get_available_gpus is an underscore-prefixed (private) helper of the TensorFlow backend module.
print(K.tensorflow_backend._get_available_gpus())
# Session handle reused by later cells for their explicit sess.run(...) calls.
sess= K.get_session()

['/gpu:0']


## Initializing Environment

In [3]:
# Create the Humanoid-v2 environment.
env = gym.make('Humanoid-v2')
# Initial observation; its shape is used as the input shape of the networks built below.
So = env.reset()
# One sampled action; its length sets the output width of the two policy networks.
A = env.action_space.sample()

## Setting learning hyperparameters 

In [4]:
# Policy networks: step size (alpha) and eligibility-trace decay (lambda).
policy_alpha, policy_lambda = 0.01, 0.9
# Value network: step size (alpha) and eligibility-trace decay (lambda).
value_alpha, value_lambda = 0.01, 0.9

## Creating the model for policy mean

In [5]:
# Policy mean network: takes an observation (shape of `So`) and emits one value
# per action dimension (length of `A`). Three hidden Dense(30, relu) layers,
# each followed by dropout; the output layer is linear.
policy_model_mean = Sequential([
    Dropout(0.1, input_shape=So.shape),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'),
])

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_mean.compile(loss="mse", optimizer=adam)
print("Policy Mean network")
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line to the output).
policy_model_mean.summary()

Policy Mean network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_1 (Dropout)          (None, 376)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                11310     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout_3 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout_4 (Dropout)          (None, 30)                0

## Creating the model for policy std

In [6]:
# Policy spread network: same layout as the mean network. Its raw output is
# squared by the code that consumes it (see select_action / update_weights)
# before being used, so the linear output layer may be of either sign.
policy_model_std = Sequential([
    Dropout(0.1, input_shape=So.shape),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(A.shape[0], kernel_initializer='random_uniform', activation='linear'),
])

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
policy_model_std.compile(loss="mse", optimizer=adam)
print("Policy std network")
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line to the output).
policy_model_std.summary()

Policy std network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_5 (Dropout)          (None, 376)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 30)                11310     
_________________________________________________________________
dropout_6 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout_7 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout_8 (Dropout)          (None, 30)                0 

## Creating model for values 

In [7]:
# State-value network: takes an observation (shape of `So`) and emits a single
# scalar. Three hidden Dense(30, relu) layers, each followed by dropout; the
# output layer is linear.
value_model = Sequential([
    Dropout(0.1, input_shape=So.shape),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(30, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.5),
    Dense(1, kernel_initializer='random_uniform', activation='linear'),
])

print("Value Model")
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
value_model.compile(loss="mse", optimizer=adam)
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line to the output).
value_model.summary()

Value Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_9 (Dropout)          (None, 376)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 30)                11310     
_________________________________________________________________
dropout_10 (Dropout)         (None, 30)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 30)                930       
_________________________________________________________________
dropout_11 (Dropout)         (None, 30)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 30)                930       
_________________________________________________________________
dropout_12 (Dropout)         (None, 30)                0        

## Eligibility Traces for the models

In [8]:
sess.run(tf.global_variables_initializer())


def _zero_traces_for(model):
    """Return one zero-filled float32 variable per trainable weight tensor of `model`."""
    return [
        tf.Variable(np.zeros(shape=sess.run(tf.shape(weight))), dtype=tf.float32)
        for weight in model.trainable_weights
    ]


# One trace list per network, aligned index-by-index with its trainable weights.
policy_eligibility_traces_mean = _zero_traces_for(policy_model_mean)
policy_eligibility_traces_std = _zero_traces_for(policy_model_std)
value_eligibility_traces = _zero_traces_for(value_model)

# Second pass so that the trace variables created above are initialised too.
sess.run(tf.global_variables_initializer())

## Defining the remaining reinforcement learning hyperparameters

In [9]:
# Running average-reward estimate and smoothed recent reward; both are rebound
# by update_weights. A stored estimate could be restored instead with
# pickle.load(open('average_reward.ferch','rb')).
average_reward, cumulative_reward = 5, 0
# Step size applied to the TD error when updating average_reward.
etaa = 0.01

In [10]:
# Resume from the saved networks only when all three checkpoint files are
# present; otherwise keep the freshly built models so that a first run (no
# checkpoints on disk yet) does not crash inside load_model.
checkpoint_files = ('policy_mean.h5', 'policy_std.h5', 'value.h5')
if all(os.path.isfile(path) for path in checkpoint_files):
    policy_model_mean = load_model('policy_mean.h5')
    policy_model_std = load_model('policy_std.h5')
    value_model = load_model('value.h5')

## Auxiliary functions for weights update


In [11]:
def set_weights(model, weights):
    """Write a flat list of arrays into the non-dropout layers of one network.

    `model` selects the target by name: 'value', 'policy_mean' or
    'policy_std'. `weights` is consumed two entries at a time, one pair per
    layer whose name does not contain 'dropout'. Any other `model` value
    prints 'error' and leaves every network untouched.
    """
    if model == 'value':
        network = value_model
    elif model == 'policy_mean':
        network = policy_model_mean
    elif model == 'policy_std':
        network = policy_model_std
    else:
        print('error')
        return

    cursor = 0
    for layer in network.layers:
        if 'dropout' in layer.name:
            continue
        layer.set_weights([weights[cursor], weights[cursor + 1]])
        cursor += 2

def get_value_weights():
    """Return the weight variables of every value_model layer as one flat list, in layer order."""
    return [variable for layer in value_model.layers for variable in layer.weights]

def get_policy_weights_mean():
    """Return the weight variables of every policy_model_mean layer as one flat list, in layer order."""
    return [variable for layer in policy_model_mean.layers for variable in layer.weights]

def get_policy_weights_std():
    """Return the weight variables of every policy_model_std layer as one flat list, in layer order."""
    return [variable for layer in policy_model_std.layers for variable in layer.weights]

In [12]:
def check_action(action):
    """Return True when any component of `action` is outside the env's action bounds.

    Only reports the violation; `action` itself is not modified.
    """
    lower = env.action_space.low
    upper = env.action_space.high
    return any(
        action[k] > upper[k] or action[k] < lower[k]
        for k in range(action.shape[0])
    )

## Update weights

In [13]:
# Flat lists of the networks' weight variables. update_weights reads these
# module-level names and runs assign ops on their entries, so they must refer
# to the models that are currently live.
value_weights = get_value_weights()
policy_weights_mean = get_policy_weights_mean()
policy_weights_std = get_policy_weights_std()

In [14]:
def _run_trace_update(model, objective, traces, weights, trace_decay, step_size, inputs):
    """Fold the clipped gradient of `objective` into `traces`, then step `weights` along them.

    Runs two session calls, in this order:
      1. trace  <- trace_decay * trace + clip_by_norm(d objective / d weight, 1)
         evaluated with `inputs` fed to `model.input`;
      2. weight <- weight + step_size * trace.

    NOTE(review): every call adds new gradient/assign ops to the TF graph.
    """
    gradients = model.optimizer.get_gradients(objective, model.trainable_weights)
    gradients = [tf.clip_by_norm(gradient, 1) for gradient in gradients]

    decay = tf.constant(trace_decay, dtype=tf.float32)
    trace_ops = [
        trace.assign(tf.add(tf.multiply(decay, trace), gradient))
        for trace, gradient in zip(traces, gradients)
    ]
    sess.run(trace_ops, feed_dict={model.input: inputs})

    step = tf.constant(step_size, dtype=tf.float32)
    weight_ops = [
        weight.assign(tf.add(weight, tf.multiply(step, trace)))
        for weight, trace in zip(weights, traces)
    ]
    sess.run(weight_ops)


def update_weights(previous_state, action, reward, state, terminal, skip_count):
    """Apply one eligibility-trace update to the value and both policy networks.

    Computes a TD error relative to the running average reward for the
    transition ``previous_state -> state``, clamps it, updates
    ``average_reward`` with it, and then moves the weights of ``value_model``,
    ``policy_model_mean`` and ``policy_model_std`` along their decayed
    gradient traces, scaled by that error.

    Args:
        previous_state: observation passed as the state before the action.
        action: action vector that was executed.
        reward: reward accumulated since the previous update.
        state: observation after the action.
        terminal: truthy when the episode ended; the value of ``state`` is
            then replaced by 0 in the TD error.
        skip_count: multiplier applied to ``average_reward`` in the TD error
            (the caller passes the number of steps covered by ``reward``).

    Side effects:
        Rebinds the globals ``average_reward`` and ``cumulative_reward``,
        prints one diagnostic line, and runs assign ops on the global trace
        and weight variables through ``sess``.
    """
    global average_reward, cumulative_reward

    # ---- TD error ---------------------------------------------------------
    # Each state is pushed through the value network exactly once; no weights
    # change before these numbers are used, so the results can be reused.
    previous_batch = np.array([previous_state])
    previous_value = value_model.predict(previous_batch)[0][0]
    next_value = value_model.predict(np.array([state]))[0][0]

    baseline = average_reward * skip_count
    delta_rewards = reward - baseline + next_value - previous_value
    cumulative_reward = (cumulative_reward + reward) / 2
    # The diagnostic line always shows the non-terminal form of the TD error.
    print(previous_value, "=====", next_value, "====", delta_rewards)
    if terminal:
        delta_rewards = reward - baseline + 0 - previous_value

    # Clamp the TD error to [-thresh, thresh]. The two checks are kept as
    # sequential ifs (not min/max) so behaviour is unchanged if thresh is negative.
    thresh = average_reward + cumulative_reward
    if delta_rewards >= thresh:
        delta_rewards = thresh
    if delta_rewards <= -thresh:
        delta_rewards = -thresh
    average_reward = average_reward + etaa * delta_rewards

    # ---- Value network ----------------------------------------------------
    _run_trace_update(
        value_model,
        value_model.output,
        value_eligibility_traces,
        value_weights,
        value_lambda,
        value_alpha * delta_rewards,
        previous_batch,
    )

    # ---- Policy networks --------------------------------------------------
    # Predictions are taken after the value update and before either policy
    # network is modified.
    policy_mean = policy_model_mean.predict(previous_batch)[0]
    policy_std = policy_model_std.predict(previous_batch)[0]
    policy_std = policy_std * policy_std
    action_tensor = tf.constant(action, dtype=tf.float32)

    # NOTE(review): policy_std already holds the squared network output here,
    # so the denominator below is its fourth power, whereas select_action uses
    # the squared output as the sampling variance -- confirm this is intended.
    loss_mean = tf.divide(
        tf.subtract(action_tensor, policy_model_mean.output),
        tf.constant(policy_std * policy_std, dtype=tf.float32),
    )
    _run_trace_update(
        policy_model_mean,
        loss_mean,
        policy_eligibility_traces_mean,
        policy_weights_mean,
        policy_lambda,
        policy_alpha * delta_rewards,
        previous_batch,
    )

    # NOTE(review): this objective uses the raw (unsquared) output of the
    # spread network -- confirm it matches the parameterisation used when sampling.
    loss_std = 2 * tf.subtract(
        tf.divide(
            tf.square(tf.subtract(action_tensor, policy_mean)),
            tf.pow(policy_model_std.output, 3),
        ),
        tf.divide(1, policy_model_std.output),
    )
    _run_trace_update(
        policy_model_std,
        loss_std,
        policy_eligibility_traces_std,
        policy_weights_std,
        policy_lambda,
        policy_alpha * delta_rewards,
        previous_batch,
    )
    
    

## Sample action 

In [15]:
def select_action(state):
    """Sample an action from a Gaussian with diagonal covariance for `state`.

    The mean comes from policy_model_mean; the diagonal of the covariance is
    the element-wise square of policy_model_std's output.
    """
    observation = np.array([state])
    mean = policy_model_mean.predict(observation)[0]
    spread = policy_model_std.predict(observation)[0]
    variance = spread * spread

    # Square zero matrix sized by the mean, with the variances on its diagonal.
    size = mean.shape[0]
    covariance = np.zeros([size, size])
    diagonal = np.arange(variance.shape[0])
    covariance[diagonal, diagonal] = variance

    return np.random.multivariate_normal(mean, covariance)


In [16]:
# Inert cell: everything below is one bare triple-quoted string expression, so
# none of the code inside it executes; the cell only echoes the string back.
# The text refers to `average_reward_variance`, which no cell shown here defines.
'''      
    #FIXING ACTION INCONSISTENCIES
    low = env.action_space.low
    high = env.action_space.high
    
    for i in range(action.shape[0]):
        if(action[i]>high[i]):
            action[i]= high[i]+0.1
        if(action[i]<low[i]):
            action[i]=low[i]+0.1
            \    high = env.action_space.high
    global average_reward_variance
    print(average_reward_variance)
    if(np.abs(average_reward_variance)<0.1):
        print("Sampling randomly")
        return env.action_space.sample()
''' 

'      \n    #FIXING ACTION INCONSISTENCIES\n    low = env.action_space.low\n    high = env.action_space.high\n    \n    for i in range(action.shape[0]):\n        if(action[i]>high[i]):\n            action[i]= high[i]+0.1\n        if(action[i]<low[i]):\n            action[i]=low[i]+0.1\n            \\    high = env.action_space.high\n    global average_reward_variance\n    print(average_reward_variance)\n    if(np.abs(average_reward_variance)<0.1):\n        print("Sampling randomly")\n        return env.action_space.sample()\n'

# Run Episodes
## Setting up variables

In [None]:
# Start the first episode and zero the counters used by the main loop.
S = env.reset()
#action_count=0
save = True                        # write checkpoints at the end of episodes
episode_count = total_reward = 0   # finished episodes / reward of the running episode
skip_count = skip_reward = 0       # steps and reward accumulated since the last update
total_reward_list = []             # total reward of each finished episode

## Main loop 

In [None]:
# The action bounds do not change while the loop runs, so read them once.
high = env.action_space.high

while(True):

    #renders environment
    #env.render()

    #Selects action according to stochastic policy
    # NOTE(review): the policy is queried on S % high[0], but update_weights
    # below receives the raw S as previous_state -- confirm this is intended.
    action = select_action(S%high[0])

    #Takes action
    S1, reward, done, info = env.step(action)
    skip_reward += reward
    skip_count += 1

    #Updates weights on every step; the skip_* accumulators restart afterwards.
    #(An earlier variant updated only on random steps: np.random.randint(10)%5==0.)
    update_weights(S, action, skip_reward, S1, done, skip_count)
    skip_reward = 0
    skip_count = 0

    S = S1
    total_reward += reward

    if not done:
        continue

    # ---- End of episode ---------------------------------------------------
    gc.collect()
    print(str(episode_count)+' --total reward ='+ str(total_reward))
    print(str(episode_count)+' --average reward ='+ str(average_reward))
    total_reward_list.append(total_reward)
    total_reward = 0

    #Resets episode
    S = env.reset()
    episode_count += 1

    # episode_count % 1 is always 0, i.e. a checkpoint is written after every
    # episode while `save` is set; raise the modulus to save less often.
    if episode_count % 1 == 0 and save:
        print('saving models')
        print('average reward '+ str(average_reward))

        policy_model_mean.save('policy_mean.h5')
        policy_model_std.save('policy_std.h5')
        value_model.save('value.h5')
        # Context manager so the history file is flushed and closed every time.
        with open('totalz_hist', 'wb') as history_file:
            pickle.dump(total_reward_list, history_file)

        #Restarting keras session: drops the graph that update_weights keeps
        #extending, then rebuilds the networks from the files just written.
        K.clear_session()
        sess = K.get_session()
        policy_model_mean = load_model('policy_mean.h5')
        policy_model_std = load_model('policy_std.h5')
        value_model = load_model('value.h5')

    #Resets eligibility traces (fresh zero variables matching the live models)
    policy_eligibility_traces_mean = [
        tf.Variable(np.zeros(shape=sess.run(tf.shape(tensor))), dtype=tf.float32)
        for tensor in policy_model_mean.trainable_weights
    ]
    policy_eligibility_traces_std = [
        tf.Variable(np.zeros(shape=sess.run(tf.shape(tensor))), dtype=tf.float32)
        for tensor in policy_model_std.trainable_weights
    ]
    value_eligibility_traces = [
        tf.Variable(np.zeros(shape=sess.run(tf.shape(tensor))), dtype=tf.float32)
        for tensor in value_model.trainable_weights
    ]

    # Re-capture the weight variables, which changed if the models were reloaded.
    value_weights = get_value_weights()
    policy_weights_mean = get_policy_weights_mean()
    policy_weights_std = get_policy_weights_std()

    # Initialise ONLY the new trace variables. A global initializer would also
    # re-run the initializers of the network weights and so discard everything
    # that was just learned or loaded.
    sess.run(tf.variables_initializer(
        policy_eligibility_traces_mean
        + policy_eligibility_traces_std
        + value_eligibility_traces
    ))

0.141645 ===== 0.146498 ==== 0.0112097904481
0.146615 ===== 0.140383 ==== -0.00147634406962
0.140354 ===== 0.141884 ==== 0.0111572555472
0.1422 ===== 0.152121 ==== 0.00886367043355
0.152441 ===== 0.144999 ==== 0.000646924287369
0.145026 ===== 0.144466 ==== 0.00453353249318
0.144689 ===== 0.145164 ==== 0.00451126343992
0.145411 ===== 0.14282 ==== 0.00574620815404
0.143161 ===== 0.149676 ==== 0.00582055281112
0.150055 ===== 1.66802 ==== -0.475511735568
1.60048 ===== 1.65026 ==== -0.82161593423
1.3203 ===== 0.5197 ==== -0.961756823936
0.276677 ===== 0.199198 ==== -0.231446451188
0.147224 ===== 0.118266 ==== -0.171987295258
0.0794302 ===== 0.00506609 ==== -0.161575880207
-0.0193725 ===== 0.0026481 ==== -0.0969589490909
-0.0144996 ===== -0.015064 ==== -0.105538501336
-0.0337295 ===== -0.0287898 ==== -0.114992282381
-0.0488598 ===== -0.0586602 ==== -0.137735158473
-0.0790989 ===== -0.0769805 ==== -0.120360201908
-0.0943724 ===== -0.0659443 ==== -0.150154783329
-0.0884705 ===== -0.098183 ====

-0.0704034 ===== -0.0720911 ==== 0.0850020003152
-0.0625489 ===== -0.0616665 ==== 0.0844009865386
-0.0521208 ===== -0.053974 ==== 0.0904591091626
-0.0437251 ===== -0.0427704 ==== 0.0879401857659
-0.0326488 ===== -0.0315947 ==== 0.0942251887712
-0.0208148 ===== -0.0235275 ==== 0.101896894477
-0.0117259 ===== -0.00916659 ==== 0.111687976113
0.00432723 ===== 0.00740679 ==== 0.112426548196
0.0216461 ===== 0.0193348 ==== 0.109415024741
0.0334162 ===== 0.0362371 ==== 0.113702218649
