In [1]:
import threading
import multiprocessing
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.slim as slim
import scipy.signal
import scipy
%matplotlib inline
from helper import *
from vizdoom import *

from random import choice
from time import sleep
from time import time
from datetime import datetime as dt

import sys
sys.path.append('C:\\Users\\elind\\Documents\\GitHub\\02460_doom_rl_2019')
sys.path.append('C:\\Users\\elind\\Documents\\GitHub\\02460_doom_rl_2019\\baselines-master')
# The new things 
import gym

import cv2
from gdoom_env import *


  from ._conv import register_converters as _register_converters


In [2]:
# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    """Build the ops that copy trainable variables from one scope to another.

    Variables of both scopes are fetched from the TRAINABLE_VARIABLES
    collection and paired positionally with ``zip``.

    Args:
        from_scope: name of the scope whose variables are read.
        to_scope: name of the scope whose variables are overwritten.

    Returns:
        A list with one ``assign`` op per variable pair; running them performs
        the copy.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_vars = tf.get_collection(collection_key, from_scope)
    target_vars = tf.get_collection(collection_key, to_scope)
    return [target.assign(source) for source, target in zip(source_vars, target_vars)]

# Processes Doom screen image to produce cropped and resized image. 
def process_frame(frame):
    """Crop a frame along its first axis, flatten it and divide by 255.

    The crop bounds are 10% and 81.25% of ``frame.shape[1]`` (rounded), applied
    as a slice over the first axis; all columns are kept and no resizing is
    done.

    Args:
        frame: 2-D array holding a single frame.

    Returns:
        1-D float array containing the cropped frame scaled by 1/255.
    """
    # The bounds are derived from the second dimension but applied to the first one.
    reference = frame.shape[1]
    first_row = int(round(0.10 * reference))
    end_row = int(round(0.8125 * reference))
    cropped = frame[first_row:end_row, :]

    flattened = np.reshape(cropped, -1)
    return flattened / 255.0
def get_last_frame(frame):
    """Return the slice at the last index of the third axis of ``frame``."""
    return frame[:, :, -1]

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    """Compute discounted cumulative sums of ``x`` along axis 0.

    Element ``t`` of the result equals ``x[t] + gamma * result[t + 1]``. This
    is obtained by running the IIR filter ``y[n] = x[n] + gamma * y[n - 1]``
    over the reversed input and reversing the output again.

    Args:
        x: sequence of values (e.g. rewards), ordered in time.
        gamma: discount factor.

    Returns:
        Array of the same length as ``x`` with the discounted sums.
    """
    numerator = [1]
    denominator = [1, -gamma]
    filtered_backwards = scipy.signal.lfilter(numerator, denominator, x[::-1], axis=0)
    return filtered_backwards[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    """Create an initializer whose weight columns all have L2 norm ``std``.

    Args:
        std: target norm of every column (sum taken over axis 0).

    Returns:
        A function ``(shape, dtype=None, partition_info=None)`` returning a
        ``tf.constant`` of float32 weights; ``dtype`` and ``partition_info``
        are accepted but not used.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        weights = np.random.randn(*shape).astype(np.float32)
        column_norms = np.sqrt(np.sum(np.square(weights), axis=0, keepdims=True))
        # In-place scaling keeps the array in float32.
        weights *= std / column_norms
        return tf.constant(weights)
    return _initializer

In [3]:
class AC_Network():
    """Convolutional + LSTM actor-critic network built inside one TF variable scope.

    Every instance creates the input placeholder, two conv layers, a dense
    layer, an LSTM, a softmax `policy` head and a linear `value` head.
    Instances whose scope is not 'global' additionally get loss and gradient
    ops; their gradients are computed on the local variables and applied to
    the variables of the 'global' scope.
    """
    def __init__(self,s_size,a_size,scope,trainer):
        """Build the graph for one network.

        Args:
            s_size: length of one flattened observation. The input is reshaped
                to [-1, 68, 96, 1], so this has to equal 68 * 96.
            a_size: number of discrete actions (width of the policy head).
            scope: variable-scope name; 'global' skips the training ops.
            trainer: optimizer whose `apply_gradients` is called. Only used
                when scope != 'global'.
        """
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
            # Flattened frames are restored to 68 x 96 single-channel images.
            self.imageIn = tf.reshape(self.inputs,shape=[-1,68,96,1])
            self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.imageIn,num_outputs=16,
                kernel_size=[8,8],stride=[4,4],padding='VALID')
            self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.conv1,num_outputs=32,
                kernel_size=[4,4],stride=[2,2],padding='VALID')
            hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu)
            
            #Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(256,state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            # All fed frames form ONE sequence: a leading batch axis of size 1
            # is added and the number of frames is used as the sequence length.
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 256])
            
            #Output layers for policy and value estimations
            self.policy = slim.fully_connected(rnn_out,a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)
            
            #Only the worker network need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
                self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                # Probability the policy assigned to the action actually taken.
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                #Loss functions
                # A softmax output can underflow to exactly 0; log(0) = -inf
                # would turn the entropy (0 * -inf) and the policy loss into
                # NaN, so a tiny constant is added inside every log.
                log_eps = 1e-10
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy + log_eps))
                self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs + log_eps)*self.advantages)
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)
                
                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

In [4]:
class Worker():
    """One A3C worker: plays episodes in its own environment and updates the global network.

    Each worker owns a local `AC_Network` (scope "worker_<name>") plus the ops
    that copy the 'global' variables into it. `work` runs the episode loop and
    `train` performs one gradient update from a rollout.
    """
    def __init__(self,game,name,s_size,a_size,trainer,model_path,global_episodes,hyperparametersstring):
        """Create the local network, bookkeeping lists and summary writer.

        Args:
            game: environment object, stored as `self.env`; `work` drives it
                through `reset()` and `step(action)`.
            name: worker index; "worker_<name>" becomes the variable scope.
            s_size: flattened observation length, forwarded to `AC_Network`.
            a_size: number of discrete actions, forwarded to `AC_Network`.
            trainer: optimizer forwarded to `AC_Network`.
            model_path: directory prefix used when saving checkpoints.
            global_episodes: TF variable used as the shared episode counter.
            hyperparametersstring: tag appended to the summary directory name.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer

        self.global_episodes = global_episodes
        # Op that adds 1 to the shared episode counter.
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_kills = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+hyperparametersstring+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global paramters to local network
        self.local_AC = AC_Network(s_size,a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)

        # One-hot boolean action vectors. The environment is passed in already
        # constructed, so no Doom-specific set-up happens here.
        self.actions = np.identity(a_size,dtype=bool).tolist()
        self.env = game

    def train(self,rollout,sess,gamma,bootstrap_value):
        """Run one gradient update from a rollout and return loss statistics.

        Args:
            rollout: list of steps `[s, a, r, s1, d, v]` as appended by `work`.
            sess: TF session used to run the update.
            gamma: discount factor for returns and advantages.
            bootstrap_value: value estimate appended after the last step
                (0.0 when the episode has ended).

        Returns:
            Tuple `(value_loss, policy_loss, entropy, grad_norm, var_norm)`;
            the first three are divided by the rollout length.
        """
        # Pull the needed columns out one by one. The rows mix arrays and
        # scalars, so converting the whole rollout with np.array() would create
        # a ragged object array (an error on recent NumPy versions).
        observations = [step[0] for step in rollout]
        actions = np.asarray([step[1] for step in rollout], dtype=np.int32)
        rewards = np.asarray([step[2] for step in rollout], dtype=np.float64)
        values = np.asarray([step[5] for step in rollout], dtype=np.float64)

        # Discounted returns, bootstrapped from the value after the last step.
        self.rewards_plus = np.append(rewards, bootstrap_value)
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        # Advantages: discounted sum of one-step TD errors.
        self.value_plus = np.append(values, bootstrap_value)
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.inputs:np.vstack(observations),
            self.local_AC.actions:actions,
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:self.batch_rnn_state[0],
            self.local_AC.state_in[1]:self.batch_rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.state_out,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n

    def work(self,max_episode_length,gamma,sess,coord,saver,batchsize,Numberofepisodesbeforebreak,hyperparametersstring):
        """Play episodes until the coordinator stops or the episode limit is hit.

        Args:
            max_episode_length: only used to skip the mid-episode update on the
                step just before this length; the loop itself ends when the
                environment reports done.
            gamma: discount factor passed to `train`.
            sess: TF session shared by all workers.
            coord: coordinator whose `should_stop()` ends the loop.
            saver: saver used by worker_0 to write checkpoints.
            batchsize: number of buffered steps that triggers a mid-episode update.
            Numberofepisodesbeforebreak: the loop exits once the local episode
                count equals this value.
            hyperparametersstring: tag used in the gif output directory name.
        """
        episode_count = sess.run(self.global_episodes)
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                d = False

                # reset() returns a stack of frames along the third axis; all
                # of them go into the gif, only the last one feeds the network.
                s = np.asarray(self.env.reset())
                for frame_idx in range(0,s.shape[2]):
                    episode_frames.append(s[:,:,frame_idx])
                s = process_frame(get_last_frame(s))

                rnn_state = self.local_AC.state_init
                self.batch_rnn_state = rnn_state
                while not d:
                    #Take an action using probabilities from policy network output.
                    a_dist,v,rnn_state = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out], 
                        feed_dict={self.local_AC.inputs:[s],
                        self.local_AC.state_in[0]:rnn_state[0],
                        self.local_AC.state_in[1]:rnn_state[1]})
                    # Sample the action INDEX directly. Sampling a probability
                    # value and searching for it always returns the first of
                    # several equally likely actions.
                    a = np.random.choice(len(a_dist[0]),p=a_dist[0])

                    # gym-style step: observation, reward, done flag, info dict.
                    state,r,d,misc = self.env.step(int(a))

                    if not d:
                        last_frame = get_last_frame(np.asarray(state))
                        episode_frames.append(last_frame)
                        s1 = process_frame(last_frame)
                    else:
                        s1 = s

                    episode_buffer.append([s,a,r,s1,d,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == batchsize and not d and episode_step_count != max_episode_length - 1:
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        v1 = sess.run(self.local_AC.value, 
                            feed_dict={self.local_AC.inputs:[s],
                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})[0,0]
                        v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)

                # Kill count is read from the info dict of the final step.
                episode_kill = misc['kills']
                self.episode_kills.append(episode_kill)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the episode buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    if self.name == 'worker_0' and episode_count % 25 == 0:
                        time_per_step = 0.05*2
                        images = np.array(episode_frames)
                        make_gif(images,'./frames'+str(hyperparametersstring)+'/image'+str(episode_count)+'.gif',
                            duration=len(images)*time_per_step,true_image=True,salience=False)
                    if episode_count % 250 == 0 and self.name == 'worker_0':
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    # Averages over the five most recent episodes.
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    mean_kill = np.mean(self.episode_kills[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Perf/Kills', simple_value=float(mean_kill))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                # Only worker_0 advances the shared counter.
                if self.name == 'worker_0':
                    sess.run(self.increment)
                if Numberofepisodesbeforebreak == episode_count:
                    break
                episode_count += 1

In [None]:

# Hyperparameter sweep: one full A3C training run per (batch size, RMSProp decay) pair.
k=[10, 10, 10]                   # batch sizes, one per run
lrs=[1e-4]                       # learning rates
decay = [0.99, 0.9, 0.8]         # RMSProp decay, one per run (paired with k by index)
Numberofepisodesbeforebreak=10000
# Learning-rate loop (only the first entry of `lrs` is used).
for j in range(0,1):
    # One run per batch size / decay pair.
    for i in range(0,len(k)):
        print(i)
        # The max episode length and gamma are used in Worker.work.
        max_episode_length = 300*2*3/2
        gamma = .99 # discount rate for advantage estimation and reward discounting

        # Network dimensions.
        s_size = 68*96 # Observations are flattened 68 x 96 greyscale frames
        a_size = 3 # number of discrete actions
        load_model = False
        # Read the clock once so every name built for this run agrees.
        run_started = dt.now()
        run_stamp = str(run_started.date()) + '_' + str(run_started.hour)
        # NOTE(review): this path does not depend on the run's hyperparameters,
        # so all runs started within the same hour write to the same checkpoint
        # directory — confirm whether that is intended.
        model_path = '.\\modelv2s6_lr=0.0001_bs=10_' + run_stamp
        # The learning rate for the run
        lr=lrs[j]
        # The batch size for the run
        batchsize=k[i]
        hyperparametersstring='v2s6_lr='+str(lr)+'_'+'bs='+str(batchsize) + '_' + 'd_'+str(decay[i]) + run_stamp

        # Start every run from an empty TensorFlow graph.
        tf.reset_default_graph()

        # Directory for the model checkpoints.
        os.makedirs(model_path, exist_ok=True)

        # Directory for the episode playback gifs.
        os.makedirs('./frames'+str(hyperparametersstring), exist_ok=True)

        # Build the global network and the workers on the CPU.
        with tf.device("/cpu:0"): 
            # Shared episode counter, starting at 0.
            global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
            # RMSProp optimizer with this run's learning rate and decay.
            trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay = decay[i])
            master_network = AC_Network(s_size,a_size,'global',None) # Generate global network
            num_workers = multiprocessing.cpu_count() # Set workers to number of available CPU threads
            workers = []
            # Every worker gets its own environment instance. The loop variable
            # is deliberately NOT called `i`, which is the run index above.
            for worker_idx in range(num_workers):
                workers.append(Worker(gym.make("doom_scenario2_96-v0"),worker_idx,s_size,a_size,trainer,model_path,global_episodes,hyperparametersstring))
            saver = tf.train.Saver(max_to_keep=5)

        # The session is closed automatically when the with-block exits.
        with tf.Session() as sess:
            # The coordinator lets the worker threads be stopped and joined together.
            coord = tf.train.Coordinator()
            if load_model == True:
                # Resume from the latest checkpoint found in model_path.
                print ('Loading Model...')
                print(model_path)
                ckpt = tf.train.get_checkpoint_state(model_path)
                saver.restore(sess,ckpt.model_checkpoint_path)
            else:
                # Fresh start: initialise all variables.
                sess.run(tf.global_variables_initializer())

            # Start the "work" loop of every worker in its own thread.
            worker_threads = []
            for worker in workers:
                # Pass the bound method and its arguments directly. A lambda
                # would look up `worker` only when the thread actually runs and
                # could see a later loop value.
                t = threading.Thread(
                    target=worker.work,
                    args=(max_episode_length,gamma,sess,coord,saver,batchsize,Numberofepisodesbeforebreak,hyperparametersstring))
                t.start()
                sleep(0.5)
                worker_threads.append(t)
            # Block until every worker thread has finished.
            coord.join(worker_threads)

0




Doom> Loading level: C:\Users\elind\Anaconda2\envs\py36\lib\site-packages\vizdoom/scenarios/defend_the_center.cfg
Doom> Loading level: C:\Users\elind\Anaconda2\envs\py36\lib\site-packages\vizdoom/scenarios/defend_the_center.cfg
Doom> Loading level: C:\Users\elind\Anaconda2\envs\py36\lib\site-packages\vizdoom/scenarios/defend_the_center.cfg
Doom> Loading level: C:\Users\elind\Anaconda2\envs\py36\lib\site-packages\vizdoom/scenarios/defend_the_center.cfg
Starting worker 0
Starting worker 1
Starting worker 2
Starting worker 3
pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
MoviePy - Building file ./framesv2s6_lr=0.0001_bs=10_2019-05-15_20/image25.gif with imageio.


                                                                                                                       

MoviePy - Building file ./framesv2s6_lr=0.0001_bs=10_2019-05-15_20/image50.gif with imageio.


                                                                                                                       

In [None]:
# Create the environment registered under the id "doom_scenario2_human-v0"
# (presumably the variant meant for human viewing — confirm in gdoom_env).
env_human = gym.make("doom_scenario2_human-v0")
# Reset it and keep the first observation it returns.
frame = env_human.reset()


In [None]:
# Close the environment opened in the previous cell. `env_cpu` is never defined
# in this notebook, so the original call failed on a fresh kernel.
env_human.close()
