In [1]:
import tensorflow as tf
import os 
import numpy as np
from Env import Env
from q_network import Q_Network

  from ._conv import register_converters as _register_converters


In [2]:
# Build the SpaceInvaders environment wrapper. Every preprocessing option is
# spelled out one per line so it can be tuned without hunting through a long call.
env = Env(
    'SpaceInvaders-v0',
    convert_to_grayscale=True,
    crop=False,
    valid_Y=[20, -10],
    valid_X=[10, -10],
    resize=True,
    resize_Y=84,
    resize_X=84,
    normalize=True,
    num_of_frames_per_stack=4,
    repeat_action=4,
)

Resetting Environment...





In [3]:
def make_copy_weight_ops(from_network,to_network,tau):
    """Build the ops that blend ``from_network`` weights into ``to_network``.

    For every pair of variables taken in order from the two networks'
    ``model_trainable_variables`` lists, an assign op is created that sets the
    destination variable to ``rate * source + (1 - rate) * destination``.

    Note: the ``tau`` argument is not used for the blend. A fresh scalar
    float32 placeholder is created instead and returned, and the caller
    supplies the actual rate through ``feed_dict`` when running the ops.

    Returns:
        (placeholder, ops): the scalar rate placeholder and the list of
        assign ops, one per variable pair.
    """
    rate_ph = tf.placeholder(tf.float32, [])
    variable_pairs = zip(from_network.model_trainable_variables,
                         to_network.model_trainable_variables)
    assign_ops = [
        dst.assign((src.value() * rate_ph) + ((1 - rate_ph) * dst.value()))
        for src, dst in variable_pairs
    ]
    return rate_ph, assign_ops


In [4]:
def train_using_double_deep_Q(main_Q_network,target_Q_network,sess,episodes,steps,initial_epsilon,final_epsilon,epsilon_dec,
                              train_t,discount_rate,batch_size,env,save_dir,save_every_n_iter,log_every_n_iter,
                              initialize=False,set_logging=True,num_frames_to_repeat_action=4,train_main_every_n_steps=5,
                              update_target_every_n_iters=100,tau=0.001):
    """Train ``main_Q_network`` with an epsilon-greedy loop and a second network for targets.

    At every environment step an action is drawn uniformly at random with
    probability ``epsilon``; otherwise it is fetched from
    ``main_Q_network.max_q_value_actions``. Each transition is pushed to the
    main network's experience replay. Once the replay holds more than
    ``train_t`` items, a batch is sampled every ``train_main_every_n_steps``
    steps: the main network's ``max_q_value_actions`` for the next states are
    used to index the target network's ``logits``, and the selected values are
    fed to the main network's ``train_op``.

    Args:
        main_Q_network: network being trained; also owns the replay buffer.
        target_Q_network: network whose ``logits`` provide next-state values.
        sess: session used to run all ops.
        episodes: number of episodes to play.
        steps: maximum number of environment steps per episode.
        initial_epsilon: starting exploration probability.
        final_epsilon: epsilon is no longer reduced once it is at or below this.
        epsilon_dec: amount subtracted from epsilon per environment step, once
            the iteration counter exceeds ``train_t``.
        train_t: minimum replay size (exclusive) before training starts; also
            the iteration threshold that gates the epsilon decay.
        discount_rate: value fed to ``main_Q_network.discount_rate``.
        batch_size: number of transitions requested from the replay buffer.
        env: object with ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done, info)``.
        save_dir: directory passed to ``save_model`` and to the log-dir helper.
        save_every_n_iter: save the main network every this many iterations.
        log_every_n_iter: print a progress line every this many iterations.
        initialize: when true, run the main network's initializer first. The
            target network's initializer is always run, after which the main
            weights are copied into it.
        set_logging: when true, ask the main network for a log directory.
            No summaries are actually written yet.
        num_frames_to_repeat_action: currently unused; kept for compatibility.
        train_main_every_n_steps: train only on steps divisible by this value.
        update_target_every_n_iters: run the weight-copy ops every this many
            iterations.
        tau: blend rate fed to the weight-copy ops on those periodic updates.

    Returns:
        None. Progress and per-episode summaries are printed.
    """
    tau_placeholder,copy_ops=make_copy_weight_ops(main_Q_network,target_Q_network,tau)
    if initialize:
        print ("Initializing.....\n")
        sess.run([main_Q_network.initializer])
    sess.run([target_Q_network.initializer])
    # A rate of 1.0 makes the copy ops overwrite the target weights with the main weights.
    sess.run(copy_ops,feed_dict={tau_placeholder:1.0})

    if set_logging:
        print ("Setting up for Logging ...\n")
        # The helper also reports whether a usable directory exists; that flag replaces set_logging.
        log_dir,set_logging=main_Q_network.create_log_directory_if_doesnt_exist(save_dir)
    if set_logging:
        # TODO: create summary writers under log_dir; nothing is written yet.
        print("Logging called but no code implemented")
    print ("Retreiveing step no...\n")
    [iter_no]=sess.run([main_Q_network.step_no])
    epsilon=initial_epsilon
    replay_buffer=main_Q_network.experience_replay_buffer
    for episode in np.arange(episodes):
        state=env.reset()
        episode_reward=0
        episode_loss=0
        steps_taken=0     # environment steps actually executed in this episode
        train_updates=0   # training updates actually performed in this episode
        for step in np.arange(steps):
            if epsilon>final_epsilon and iter_no>train_t:
                epsilon-=epsilon_dec

            # Epsilon-greedy action selection.
            if np.random.random(1)<epsilon:
                action=np.random.randint(low=0,high=main_Q_network.params.num_outputs,size=1,dtype=np.int32)
            else:
                feed_dict={main_Q_network.X:np.expand_dims(state,axis=0),
                           main_Q_network.lr_placeholder:main_Q_network.params.learning_rate,
                           main_Q_network.training_mode:True}
                [action]=sess.run([main_Q_network.max_q_value_actions],feed_dict=feed_dict)
            action=np.squeeze(action)

            next_state,reward,done,info=env.step(action)
            steps_taken+=1
            episode_reward+=reward
            main_Q_network.add_to_experience_replay(state,action,next_state,reward,done)
            state=next_state

            # Train only once the replay buffer holds enough experiences.
            if (replay_buffer.num_items>train_t) and (step%train_main_every_n_steps==0):
                states,actions,next_states,rewards,dones=replay_buffer.get_batch(batch_size=batch_size)

                # Target network output for every action in the next states.
                feed_dict={target_Q_network.X:next_states,
                           target_Q_network.lr_placeholder:target_Q_network.params.learning_rate,
                           target_Q_network.training_mode:True}
                [q_vals]=sess.run([target_Q_network.logits],feed_dict=feed_dict)

                # Double-DQN part: the main network picks the action, the target network scores it.
                feed_dict={main_Q_network.X:next_states,
                           main_Q_network.training_mode:True}
                [target_actions]=sess.run([main_Q_network.max_q_value_actions],feed_dict=feed_dict)
                target_actions=np.squeeze(target_actions)
                max_q_vals_next_state=q_vals[np.arange(q_vals.shape[0]),target_actions]

                feed_dict={main_Q_network.X:states,main_Q_network.actions:actions,
                           main_Q_network.max_q_values_next_state:max_q_vals_next_state,
                           main_Q_network.rewards:rewards,
                           main_Q_network.notended:((np.logical_not(dones)).astype(np.int32)),
                           main_Q_network.discount_rate:discount_rate,
                           main_Q_network.lr_placeholder:main_Q_network.params.learning_rate,
                           main_Q_network.training_mode:True}
                loss,_=sess.run([main_Q_network.loss,main_Q_network.train_op],feed_dict=feed_dict)
                episode_loss+=loss
                train_updates+=1
                iter_no+=1
                if (iter_no)%update_target_every_n_iters==0:
                    print("updating weights of target network\n")
                    sess.run(copy_ops,feed_dict={tau_placeholder:tau})

                if (iter_no)%save_every_n_iter==0:
                    print("^^^^ saving model ^^^^ \n")
                    main_Q_network.save_model(sess,save_dir,main_Q_network.step_no)
                if (iter_no)%log_every_n_iter==0:
                    print ("Trainaing Step:\t Iteration no={} Game Step ={} loss={} ".format(iter_no,step,loss))
            if done:
                break

        # Average over the updates that actually happened. Dividing by the last
        # loop index under-counts the length by one and breaks when no update
        # (or only step 0) occurred.
        mean_episode_loss=episode_loss/train_updates if train_updates else 0.0
        print ("===================>Episode {} Ended <===================\n".format(episode))
        print ("=======>\t Episode Length={} \t<=======\n".format(steps_taken))
        print ("=======>\t Episode Reward={} \t<=======\n".format(episode_reward))
        print ("=======>\t Mean Episode Loss={} \t<=======\n".format(mean_episode_loss))

In [5]:
# Settings for the main Q-network: three conv layers, each followed by an
# activation, then a flatten and one fully connected layer of 512 units.
main_q_network_params=dict(
    input_shape=[None, *env.image_shape],
    num_outputs=env.action_space,
    layer_hierarchy=[
        dict(layer_type='conv_layer',kernel_size=8,kernel_strides=4,num_filters=32,padding='valid'),
        dict(layer_type='activation_layer'),
        dict(layer_type='conv_layer',kernel_size=4,kernel_strides=2,num_filters=64,padding='valid'),
        dict(layer_type='activation_layer'),
        dict(layer_type='conv_layer',kernel_size=3,kernel_strides=2,num_filters=64,padding='valid'),
        dict(layer_type='activation_layer'),
        dict(layer_type='flattening_layer'),
        dict(layer_type='fc_layer',num_hidden_units=512),
        dict(layer_type='activation_layer'),
    ],
    initializer_fn=tf.contrib.layers.variance_scaling_initializer,
    activation_fn=tf.nn.elu,
    learning_rate=0.001,
    optimizer_fn=tf.train.AdamOptimizer,
    logdir='/tf_logs_rnn/run/',
    name_scope='main_q_network_with_frames',
)

In [6]:
# Settings for the target Q-network. The layer stack and optimiser settings
# repeat the main network's values; only the name scope differs.
target_q_network_params=dict(
    input_shape=[None, *env.image_shape],
    num_outputs=env.action_space,
    layer_hierarchy=[
        dict(layer_type='conv_layer',kernel_size=8,kernel_strides=4,num_filters=32,padding='valid'),
        dict(layer_type='activation_layer'),
        dict(layer_type='conv_layer',kernel_size=4,kernel_strides=2,num_filters=64,padding='valid'),
        dict(layer_type='activation_layer'),
        dict(layer_type='conv_layer',kernel_size=3,kernel_strides=2,num_filters=64,padding='valid'),
        dict(layer_type='activation_layer'),
        dict(layer_type='flattening_layer'),
        dict(layer_type='fc_layer',num_hidden_units=512),
        dict(layer_type='activation_layer'),
    ],
    initializer_fn=tf.contrib.layers.variance_scaling_initializer,
    activation_fn=tf.nn.elu,
    learning_rate=0.001,
    optimizer_fn=tf.train.AdamOptimizer,
    logdir='/tf_logs_rnn/run/',
    name_scope='target_q_network_with_frames',
)

In [None]:
# --- Run configuration -------------------------------------------------------
n_episodes=50
max_steps=50000
save_every_n_iter=50
log_every_n_iter=50
initialize=True  # set to False to restore the main network from save_dir instead
save_dir="deep_q_saves"
max_experience_buffer_len=10000
initial_epsilon=0.5
final_epsilon=0.01
epsilon_dec=0.0001
train_t=1000
discount_rate=0.99
batch_size=200
pickle_file_path_main_network="deep_q_saves/main_q_network_with_frames/model_object.pkl"
pickle_file_path_target_network="deep_q_saves/target_q_network_with_frames/model_object.pkl"

# --- Graph construction ------------------------------------------------------
# Start from an empty default graph so re-running this cell does not pile up ops.
tf.reset_default_graph()

main_Q_network=Q_Network(max_experience_buffer_len,main_q_network_params,
                         restore_params=not initialize,
                         pickle_file_path=pickle_file_path_main_network)
# The target network is built with a replay length of 0 and never restores its params.
target_Q_network=Q_Network(0,target_q_network_params,
                           restore_params=False,
                           pickle_file_path=pickle_file_path_target_network)
main_Q_network.Build_model()
target_Q_network.Build_model()

# --- Training ----------------------------------------------------------------
with tf.Session() as sess:
    try:
        if not initialize:
            main_Q_network.restore_model(sess,save_dir)

        train_using_double_deep_Q(main_Q_network=main_Q_network,target_Q_network=target_Q_network,sess=sess,
                                  episodes=n_episodes,steps=max_steps,
                                  initial_epsilon=initial_epsilon,final_epsilon=final_epsilon,
                                  epsilon_dec=epsilon_dec,train_t=train_t,discount_rate=discount_rate,
                                  batch_size=batch_size,env=env,save_dir=save_dir,
                                  save_every_n_iter=save_every_n_iter,log_every_n_iter=log_every_n_iter,
                                  initialize=initialize,set_logging=True,
                                  num_frames_to_repeat_action=4,train_main_every_n_steps=1,
                                  update_target_every_n_iters=50,tau=0.001)
    finally:
        # Close the environment even when training raises or the cell is
        # interrupted; otherwise it stays open until the kernel is restarted.
        env.close()

Initializing.....

Setting up for Logging ...

Logging called but no code implemented
Retreiveing step no...

Resetting Environment...





Resetting Environment...





Resetting Environment...





Resetting Environment...





Resetting Environment...





Resetting Environment...





Resetting Environment...

updating weights of target network

^^^^ saving model ^^^^ 

Trainaing Step:	 Iteration no=50.0 Game Step =156 loss=0.32996001839637756 




Resetting Environment...

updating weights of target network

^^^^ saving model ^^^^ 

Trainaing Step:	 Iteration no=100.0 Game Step =5 loss=0.35774868726730347 
updating weights of target network

^^^^ saving model ^^^^ 

Trainaing Step:	 Iteration no=150.0 Game Step =55 loss=0.18389570713043213 
updating weights of target network

^^^^ saving model ^^^^ 

Trainaing Step:	 Iteration no=200.0 Game Step =105 loss=0.17004652321338654 
updating weights of target network

^^^^ saving model ^^^^ 

Trainaing Step:	 Iteration no=250.0 Game Step

In [None]:
# n_episodes=50
# max_steps=50000
# save_every_n_iter=50
# log_every_n_iter=50
# initialize=False
# save_dir="deep_q_saves"
# max_experience_buffer_len=10000
# initial_epsilon=0.5#1
# final_epsilon=0.0001
# epsilon_dec=0.00001
# train_t=1000
# discount_rate=0.9
# batch_size=120
# pickle_file_path_main_network="deep_q_saves/main_q_network_with_frames/model_object.pkl"
# pickle_file_path_target_network="deep_q_saves/target_q_network_with_frames/model_object.pkl"

# tf.reset_default_graph()

    
# main_Q_network=Q_Network(max_experience_buffer_len,main_q_network_params,restore_params=not initialize,pickle_file_path=pickle_file_path_main_network)
# # target_Q_network=Q_Network(0,target_q_network_params,restore_params=not initialize,pickle_file_path=pickle_file_path_target_network)
# main_Q_network.Build_model()
# # target_Q_network.Build_model()

# with tf.Session() as sess:
    
#     if(not initialize):
#         main_Q_network.restore_model(sess,save_dir)

    
    
# #     model.test(initialize=True,env=env)
#     main_Q_network.test(sess=sess,initialize=True,env=env,sleep_time=0.1)
#     env.close()