In [1]:
import numpy as np
import tensorflow as tf
import gym
import os
import matplotlib.pyplot as plt

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
def form_placeholders(input_shape):
    """Create the float32 placeholder that observations are fed through.

    Args:
        input_shape: shape passed straight to ``tf.placeholder``.

    Returns:
        The newly created placeholder tensor.
    """
    return tf.placeholder(shape=input_shape,dtype=tf.float32)

In [3]:
def form_convolutional_heirarchy(inputs,conv_hierachy,activation_fn,initializer):
    """Stack the convolution / max-pooling layers described by ``conv_hierachy``.

    Args:
        inputs: tensor fed to the first layer.
        conv_hierachy: iterable of dicts, one per layer, applied in order.
            A dict with ``layer_type == 'conv_layer'`` must also carry
            ``num_filters``, ``kernel_size``, ``kernel_strides`` and ``padding``;
            one with ``layer_type == 'pool_layer'`` must carry ``pool_size``
            and ``pool_strides``.
        activation_fn: activation applied by every convolutional layer.
        initializer: kernel initializer used by every convolutional layer.

    Returns:
        The output of the last layer, or ``inputs`` itself when the hierarchy
        is empty.

    Raises:
        ValueError: if a layer dict has an unrecognised ``layer_type``.
    """
    # A single running tensor means an empty hierarchy simply returns `inputs`
    # instead of failing on an unbound local.
    current=inputs
    for layer in conv_hierachy:
        layer_type=layer['layer_type']
        if layer_type=='conv_layer':
            current=tf.layers.conv2d(
                    inputs=current,
                    filters=layer['num_filters'],
                    kernel_size=layer['kernel_size'],
                    strides=layer['kernel_strides'],
                    padding=layer['padding'],
                    kernel_initializer=initializer,
                    activation=activation_fn
            )
        elif layer_type=='pool_layer':
            current=tf.layers.max_pooling2d(
                    inputs=current,
                    pool_size=layer['pool_size'],
                    strides=layer['pool_strides'])
        else:
            # A misspelt layer type would otherwise be dropped silently and
            # yield a different network than the one that was configured.
            raise ValueError('unknown layer_type: '+repr(layer_type))
    return current

In [4]:
def form_hidden_layers(inputs,num_neurons_in_all_layers,activation_fn,initializer):
    """Stack fully connected layers on top of ``inputs``.

    Args:
        inputs: tensor fed to the first dense layer.
        num_neurons_in_all_layers: iterable with the width of each dense
            layer, in order.
        activation_fn: activation applied by every layer.
        initializer: kernel initializer used by every layer.

    Returns:
        The output of the last dense layer, or ``inputs`` itself when no
        layer sizes are given.
    """
    # Tracking one running tensor lets an empty size list fall through to
    # `inputs` instead of failing on an unbound local.
    current=inputs
    for num_neurons in num_neurons_in_all_layers:
        current=tf.layers.dense(current,num_neurons,activation=activation_fn,kernel_initializer=initializer)
    return current

In [5]:
def form_loss(logits,targets,loss_fn):
    """Apply ``loss_fn`` to the network logits and the target labels.

    ``loss_fn`` is invoked with the keyword arguments ``labels`` and
    ``logits``; whatever it returns is handed back unchanged.
    """
    return loss_fn(logits=logits,labels=targets)

In [6]:
def get_gradients_and_optimizer(loss,learning_rate,optimizer_fn):
    """Instantiate the optimizer and ask it for the gradients of ``loss``.

    Args:
        loss: value passed to the optimizer's ``compute_gradients``.
        learning_rate: single positional argument given to ``optimizer_fn``.
        optimizer_fn: callable that builds the optimizer.

    Returns:
        ``(optimizer, grads_and_vars)`` where ``grads_and_vars`` is the result
        of ``optimizer.compute_gradients(loss)``.
    """
    opt=optimizer_fn(learning_rate)
    return opt,opt.compute_gradients(loss)

In [7]:
def update_weights(optimizer,grads_and_vars):
    """Build a training op whose gradients are supplied through placeholders.

    For every ``(gradient, variable)`` pair a float32 placeholder with the
    gradient's static shape is created and paired with the same variable.

    Args:
        optimizer: object exposing ``apply_gradients``.
        grads_and_vars: iterable of ``(gradient, variable)`` pairs.

    Returns:
        ``(grad_placeholders_list, grads_and_vars_feed, train_op)``: the
        placeholders, the ``(placeholder, variable)`` pairs, and the op
        returned by ``optimizer.apply_gradients`` for those pairs.
    """
    # One pass over the input keeps placeholder creation order equal to the
    # order of `grads_and_vars`.
    grads_and_vars_feed=[
        (tf.placeholder(tf.float32,shape=gradient.get_shape()),variable)
        for gradient,variable in grads_and_vars
    ]
    grad_placeholders_list=[placeholder for placeholder,_ in grads_and_vars_feed]
    train_op=optimizer.apply_gradients(grads_and_vars_feed)
    return grad_placeholders_list,grads_and_vars_feed,train_op

In [8]:
class CNN:
    """Convolutional policy network trained with externally fed gradients.

    The graph is: input placeholder -> convolutional hierarchy -> flatten ->
    dense hidden layers -> linear logits -> softmax. An action is sampled from
    the policy, and the loss is ``loss_fn`` evaluated with that sampled action
    as the label.

    ``params`` keys read by the constructor:
        input_shape: shape of the input placeholder.
        num_outputs: number of units in the logits layer.
        num_neurons: widths of the dense hidden layers.
        conv_hierarchy: layer descriptions for ``form_convolutional_heirarchy``.
        activation_fn: optional, defaults to ``tf.nn.relu``.
        loss_fn: called with ``labels=`` and ``logits=``.
        learning_rate: passed to ``optimizer_fn``.
        optimizer_fn: optimizer constructor.
    A ``logdir`` key may be present but is currently not used.

    Attributes:
        X: input placeholder.
        logits, outputs: pre-softmax scores and softmax probabilities.
        action: sampled action indices, one column per sample (num_samples=1).
        entropies: loss tensor returned by ``loss_fn``.
        optimizer: optimizer instance.
        gradients: gradient tensors of the loss, one per (gradient, variable) pair.
        grad_placeholders_list, grads_and_vars_feed, train_op: placeholders
            used to feed gradients back in, their pairing with the variables,
            and the op that applies them.
        initializer: global variable initializer op.
        saver: ``tf.train.Saver`` for the graph.
    """
    def __init__(self,params):
        input_shape=params['input_shape']
        num_outputs=params['num_outputs']
        num_neurons_in_hidden_layers=params['num_neurons']
        conv_hierarchy=params['conv_hierarchy']
        activation_fn=params.get('activation_fn',tf.nn.relu)
        loss_fn=params['loss_fn']
        learning_rate=params['learning_rate']
        optimizer_fn=params['optimizer_fn']

        # Every instance starts from a clean default graph.
        tf.reset_default_graph()
        initializer_fn=tf.contrib.layers.variance_scaling_initializer()
        self.X=form_placeholders(input_shape)

        conv_output=form_convolutional_heirarchy(self.X,conv_hierarchy,activation_fn,initializer_fn)
        flat_conv_output=tf.contrib.layers.flatten(conv_output)
        last_hidden_output=form_hidden_layers(flat_conv_output,num_neurons_in_hidden_layers,activation_fn,initializer_fn)

        logits=tf.layers.dense(last_hidden_output,num_outputs,kernel_initializer=initializer_fn)
        self.logits=logits
        self.outputs=tf.nn.softmax(self.logits)
        # tf.multinomial takes unnormalized log-probabilities, so the logits
        # are passed directly; this avoids log(0) when a probability underflows.
        self.action=tf.multinomial(self.logits,num_samples=1)

        # Column 0 holds the single sampled action of every batch row.
        # (`self.action[:][0]` selected the first *row*, which only matched
        # the logits' batch dimension when the batch size was 1.)
        targets=self.action[:,0]

        self.entropies=form_loss(logits,targets,loss_fn)
        self.optimizer,grads_and_vars=get_gradients_and_optimizer(self.entropies,learning_rate,optimizer_fn)
        self.gradients=[grad for grad,variable in grads_and_vars]

        self.grad_placeholders_list,self.grads_and_vars_feed,self.train_op=update_weights(self.optimizer,grads_and_vars)

        self.initializer=tf.global_variables_initializer()
        self.saver=tf.train.Saver()

In [9]:
def get_cumulative_discounted_rewards(rewards,discount_rate):
    """Compute the discounted return at every step of a single episode.

    Walking backwards through the episode, the value at step ``t`` is
    ``rewards[t] + discount_rate * value[t+1]`` (with nothing after the last
    step).

    Args:
        rewards: sequence of per-step rewards for one episode.
        discount_rate: multiplier applied to the return of the following step.

    Returns:
        A float NumPy array with one entry per reward.
    """
    n_steps=len(rewards)
    disc_rewards=np.empty(n_steps)
    running_total=0
    idx=n_steps-1
    while idx>=0:
        running_total=rewards[idx]+running_total*discount_rate
        disc_rewards[idx]=running_total
        idx-=1
    return disc_rewards
        

In [10]:
def normalize_and_cumulate_rewards(all_rewards,discount_rate):
    """Discount the rewards of several episodes and standardise them jointly.

    Each episode is discounted with ``get_cumulative_discounted_rewards``;
    the mean and standard deviation are then taken over all steps of all
    episodes together and applied to every episode.

    Args:
        all_rewards: list with one reward sequence per episode.
        discount_rate: discount factor forwarded to the per-episode helper.

    Returns:
        A list of arrays, one per episode, holding the normalised returns.
    """
    epsilon=0.00001 # keeps the division defined when the std is 0
    discounted_per_episode=[]
    for episode_rewards in all_rewards:
        discounted_per_episode.append(
            get_cumulative_discounted_rewards(episode_rewards,discount_rate))
    pooled=np.concatenate(discounted_per_episode)
    pooled_mean=pooled.mean()
    pooled_std=pooled.std()
    normalized=[]
    for episode_returns in discounted_per_episode:
        normalized.append((episode_returns-pooled_mean)/(pooled_std+epsilon))
    return normalized

In [11]:
def preprocess_img(img):
    """Crop, downsample, grey-scale and normalise one frame.

    Args:
        img: array indexed as (row, column, channel).

    Returns:
        Array of shape ``(1, rows, cols, 1)``: the input with the first 20 and
        last 12 rows removed, every second row and column kept, the channel
        axis averaged, and values rescaled with ``(x - 128) / 128``.
    """
    img=img[20:-12:2,::2]
    img=img.mean(axis=2,keepdims=True)
    # For pixel values 0..255 this yields roughly -1..1. The earlier extra
    # "-1" shifted the range to -2..0. NOTE: weights saved while the shifted
    # scaling was in use were fitted to those inputs.
    img=(img-128)/128
    return img.reshape(1,*img.shape)

In [12]:


# Environment, plus the shape of one preprocessed frame without its leading axis.
env = gym.make("SpaceInvaders-v0")
obs = env.reset()
input_shape = preprocess_img(obs).shape[1:]

# Training schedule.
n_iter = 50
n_games_per_iter = 10
n_steps_per_game = 100
discount_rate = 0.99
savedir = "./my_policy_net_pg.ckpt"  # path handed to the saver's save/restore
save_iterations = 10  # save the model every 10 training iterations

# Network description consumed by CNN(params).
# Entries of 'conv_hierarchy' with layer_type 'pool_layer' (keys 'pool_size'
# and 'pool_strides') are also understood by form_convolutional_heirarchy.
params = dict(
    input_shape=[None] + list(input_shape),
    num_outputs=env.action_space.n,
    num_neurons=[50],
    conv_hierarchy=[
        dict(layer_type='conv_layer', kernel_size=4, kernel_strides=1,
             num_filters=5, padding='valid'),
        dict(layer_type='conv_layer', kernel_size=4, kernel_strides=1,
             num_filters=10, padding='valid'),
    ],
    activation_fn=tf.nn.relu,
    loss_fn=tf.nn.sparse_softmax_cross_entropy_with_logits,
    learning_rate=0.01,
    optimizer_fn=tf.train.AdamOptimizer,
    logdir='/tf_logs_rnn/run/',
)

In [None]:
#training code
model=CNN(params)

with tf.Session() as sess:
    # `savedir` is a checkpoint *prefix*: the saver writes files such as
    # "<prefix>.index", so os.path.isfile(savedir) never finds a saved model
    # and training would always restart from scratch. Ask TensorFlow instead.
    if tf.train.checkpoint_exists(savedir):
        print ('restoring model')
        model.saver.restore(sess,savedir)
    else:
        model.initializer.run()
    for iteration in range(n_iter):
        all_rewards=[]# all sequences of raw rewards for each episode
        all_gradients=[]# gradients saved at each step of each episode
        mean_entr=[]# loss value recorded at every step of this iteration
        print ('current iteration '+str(iteration))
        for game in range(n_games_per_iter):
            current_rewards=[]# all raw rewards from the current episode
            current_gradients=[]# all gradients from the current episode
            obs=env.reset()
            for step in range(n_steps_per_game):
                print ('\t step '+str(step))
                pro_obs=preprocess_img(obs)
                # One observation per run: sample an action and record the
                # gradients of the loss for that sampled action.
                action_val,gradients_val,entropies=sess.run(
                [model.action,model.gradients,model.entropies],
                feed_dict={model.X:pro_obs})
                obs,reward,done,info=env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                mean_entr.append(entropies)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        # At this point the policy has been run for n_games_per_iter episodes;
        # turn the raw rewards into normalised discounted returns and use them
        # to weight the recorded gradients.
        all_rewards=normalize_and_cumulate_rewards(all_rewards,discount_rate)
        feed_dict={}
        for var_index,grad_placeholder in enumerate(model.grad_placeholders_list):
            # multiply the gradients by the action scores, and compute the mean
            mean_gradients=np.mean(
            [reward*all_gradients[game_index][step][var_index]
            for game_index,rewards in enumerate(all_rewards)
            for step,reward in enumerate(rewards)],
            axis=0)
            feed_dict[grad_placeholder]=mean_gradients
        sess.run([model.train_op],feed_dict=feed_dict)
        if iteration%save_iterations==0:
            model.saver.save(sess,savedir)
        print ('entropy =')
        print (np.mean(np.array(mean_entr)))

    # The periodic save above last fires before the final iterations, so
    # persist the final weights explicitly.
    model.saver.save(sess,savedir)

    # Short greedy roll-out with the freshly trained policy.
    n_steps=100
    obs=env.reset()
    for i in range(n_steps):
        pro_obs=preprocess_img(obs)
        feed_dict={model.X:pro_obs}
        outputs_r=sess.run(model.outputs,feed_dict=feed_dict)
        # argmax over axis 1 returns a length-1 array for the single fed
        # observation; step the env with the scalar, as the training loop does.
        action=np.argmax(outputs_r,axis=1)[0]
        obs,reward,done,info=env.step(action)
        env.render()
        print ('step no '+str(i))
        if done:
            print ('lost')
            break

In [14]:
# Rebuild the graph, restore the saved weights and play greedily.
model=CNN(params)

with tf.Session() as sess:
    print ('restoring model')
    model.saver.restore(sess,savedir)

    n_steps=3000
    obs=env.reset()
    for i in range(n_steps):
        pro_obs=preprocess_img(obs)
        feed_dict={model.X:pro_obs}
        outputs_r=sess.run(model.outputs,feed_dict=feed_dict)
        # argmax over axis 1 returns a length-1 array for the single fed
        # observation; step the env with the scalar action index.
        action=np.argmax(outputs_r,axis=1)[0]
        obs,reward,done,info=env.step(action)
        env.render()
        print ('step no '+str(i))
        if done:
            print ('lost')
            break
    
        
    

restoring model
INFO:tensorflow:Restoring parameters from ./my_policy_net_pg.ckpt
step no 0
step no 1
step no 2
step no 3
step no 4
step no 5
step no 6
step no 7
step no 8
step no 9
step no 10
step no 11
step no 12
step no 13
step no 14
step no 15
step no 16
step no 17
step no 18
step no 19
step no 20
step no 21
step no 22
step no 23
step no 24
step no 25
step no 26
step no 27
step no 28
step no 29
step no 30
step no 31
step no 32
step no 33
step no 34
step no 35
step no 36
step no 37
step no 38
step no 39
step no 40
step no 41
step no 42
step no 43
step no 44
step no 45
step no 46
step no 47
step no 48
step no 49
step no 50
step no 51
step no 52
step no 53
step no 54
step no 55
step no 56
step no 57
step no 58
step no 59
step no 60
step no 61
step no 62
step no 63
step no 64
step no 65
step no 66
step no 67
step no 68
step no 69
step no 70
step no 71
step no 72
step no 73
step no 74
step no 75
step no 76
step no 77
step no 78
step no 79
step no 80
step no 81
step no 82
step no 83
step