In [1]:
import gym
import time

import numpy as np
np.random.seed(42)

import multiprocessing as mp
import ctypes

In [2]:
def weight_variable(shape, name=None):
    import tensorflow as tf
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)

def bias_variable(shape, name=None):
    import tensorflow as tf
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial, name=name)

In [36]:
def create_variable(name="", input_size=4, output_size=2, n_hidden=2, hidden_size=[128, 64]):
    import tensorflow as tf
    
    variables_dict = {}
    variables_dict["W1" + name] = weight_variable([input_size, hidden_size[0]], name="W1" + name)
    variables_dict["b1" + name] = bias_variable((1, hidden_size[0]), name="b1" + name)
    
    for i in range(n_hidden-1):
        variables_dict["W"+str(i+2) + name] = weight_variable([hidden_size[i], hidden_size[i+1]], name="W"+str(i+2) + name)
        variables_dict["b"+str(i+2) + name] = bias_variable((1, hidden_size[i+1]), name="b"+str(i+2) + name)
    
    variables_dict["Wo" + name] = weight_variable([hidden_size[-1], output_size], name="Wo" + name)
    variables_dict["bo" + name] = bias_variable((1, output_size), name="bo" + name)
    
    variables_dict["input_observation"] = tf.placeholder(tf.float32, shape=[None, input_size], name="i_observation" + name)
    
    variables_dict["y_true"] = tf.placeholder(tf.float32, shape=[None, output_size], name="y_true" + name)
    variables_dict["y_action"] = tf.placeholder(tf.float32, shape=[None, output_size], name="action" + name)
    
    return variables_dict

def build_model(variables_dict, name="", input_size=4, output_size=2, n_hidden=2, hidden_size=[128, 64], 
                learning_rate=0.001):
    """
    Create a simple model
    """
    import tensorflow as tf
    
    y = tf.nn.relu(tf.matmul(variables_dict["input_observation"], variables_dict["W1" + name]) + 
                   variables_dict["b1" + name], name="y1" + name)
    
    for i in range(n_hidden-1):
        y = tf.nn.relu(tf.matmul(y, variables_dict["W"+str(i+2) + name]) + 
                       variables_dict["b"+str(i+2) + name], name="y"+str(i+2) + name)
    
    y = tf.matmul(y, variables_dict["Wo" + name]) + variables_dict["bo" + name]
    
    loss_list = tf.nn.l2_loss(y * variables_dict["y_action"] - variables_dict["y_true"])
    loss = tf.reduce_mean(loss_list)
    
    train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

    return y,  loss, train_step

def best_choice(variables_dict, observation, sess):
    import tensorflow as tf
    feed_dic = {variables_dict["input_observation"]: observation.reshape((1, -1))}
    print("Je passe")
    reward = sess.run(variables_dict["y"], feed_dict=feed_dic)
    print("Je casse")
    return np.argmax(reward), np.max(reward)

def best_action(variables_dict, observation, sess):
    return best_choice(variables_dict, observation, sess)[0]

def best_reward(variables_dict, observation, sess):
    return best_choice(variables_dict, observation, sess)[1]

def epsilon_greedy_policy(variables_dict, observation, epsilon, env, sess):
    u = np.random.binomial(1, epsilon)
    if u:
        return env.action_space.sample()
    else:
        return best_action(variables_dict, observation, sess)

In [37]:
def assign_value_to_theta(l_theta, variables_dict, sess):
    import tensorflow as tf
    keys = variables_dict.keys()
    keys.sort()
    for i, key in enumerate(keys):
        if key not in ["input_observation", "y_true", "y_action", "y"]:
            with l_theta[i].get_lock():
                l_theta[i].value = sess.run(variables_dict[key])
    return l_theta
    
def read_value_from_theta(l_theta, variables_dict, sess):
    import tensorflow as tf
    keys = variables_dict.keys()
    keys.sort()
    for i, key in enumerate(keys):
        if key not in ["input_observation", "y_true", "y_action", "y"]:
            variables_dict[key] = tf.assign(variables_dict[key], l_theta[i].value)
    return variables_dict

def initialise(input_size=4, output_size=2, n_hidden=2, hidden_size=[128, 64]):
    l_theta = []
    
    shapes = [(input_size, hidden_size[0])]
    for i in range(n_hidden - 1):
        shapes.append((hidden_size[i], hidden_size[i+1]))
    shapes.append((hidden_size[-1], output_size))
    
    shapes.append((1, hidden_size[0]))
    for i in range(n_hidden - 1):
        shapes.append((1, hidden_size[i+1]))
    shapes.append((1, output_size))
    
    for i, shape in enumerate(shapes):
        l_theta.append(mp.Array(ctypes.c_double, shape))
        l_theta[i].value = np.random.uniform(low=-0.01, high=0.01, size=shape)
        
    return l_theta

In [38]:
class slave_worker(mp.Process):
    
    def __init__(self, T_max=100, Itarget=15, Iasyncupdate=10, gamma=0.9, learning_rate=0.001, 
                   env_name="CartPole-v0", model_option={"n_hidden":1, "hidden_size":[10]}, verbose=False, **kwargs):
        import tensorflow as tf
        
        super(slave_worker, self).__init__(**kwargs)
        self.T_max = T_max
        self.Itarget = Itarget
        self.Iasyncupdate = Iasyncupdate
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.env = gym.make(env_name)
        
        self.variables_dict = create_variable(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
        temp = build_model(self.variables_dict, n_hidden=model_option["n_hidden"], 
                                 hidden_size=model_option["hidden_size"])
        self.variables_dict["y"], self.loss, self.train_step = temp
        
        self.variables_dict_minus = create_variable(name="_minus", n_hidden=model_option["n_hidden"], 
                                                   hidden_size=model_option["hidden_size"])
        temp = build_model(self.variables_dict_minus, name="_minus", n_hidden=model_option["n_hidden"], 
                                       hidden_size=model_option["hidden_size"])
        self.variables_dict_minus["y"], self.loss_minus, self.train_step_minus = temp
        
        self.sess = tf.Session()
        
        self.sess.run(tf.global_variables_initializer())
        
        
        
        
    def run(self):
        global T, l_theta, l_theta_minus
        import tensorflow as tf
        
        self.variables_dict = read_value_from_theta(l_theta, self.variables_dict, self.sess)

        self.variables_dict_minus = read_value_from_theta(l_theta_minus, self.variables_dict_minus, self.sess)

        epsilon = 0.9
        t = 0
        x_batch = 0
        y_batch = []
        nb_env = 0
        firstiter=True
        observation = self.env.reset()
        t_init = t
        
        print("OK")

        while T.value<self.T_max:
            
            print(T.value)

            self.variables_dict_minus = read_value_from_theta(l_theta_minus, self.variables_dict_minus, self.sess)

            action = epsilon_greedy_policy(self.variables_dict, observation, epsilon, self.env, self.sess)

            observationprime, reward, done, info = self.env.step(action) 

            if t - t_init > 200:
                done = True
            print("Checkpoint 1")

            if done:
                y = reward
                observationprime = self.env.reset()
                t_init = t + 1
                nb_env += 1
            else:
                print("Je passe")
                y = reward + self.gamma * best_reward(self.variables_dict_minus, observationprime, self.sess)
                print("Je casse")
            
            print("Checkpoint 2")
            
            if firstiter:
                firstiter=False
                observation_batch = observation.reshape((1, -1))
                action_batch = [action]
            else:
                observation_batch = np.vstack((observation_batch, observation.reshape((1, -1))))
                action_batch.append(action)
                
            print("Checkpoint 3")
            
            y_batch.append(y)
            observation = observationprime
            with T.get_lock():
                T.value += 1
                
            print("Checkpoint 4")
            
            t += 1
            if T.value %self.Itarget == 0:
                for i, theta_minus in enumerate(l_theta_minus):
                    with theta_minus.get_lock():
                        l_theta_minus[i].value = l_theta[i].value
                        
            print("Checkpoint 5")
            
            if t %self.Iasyncupdate == 0:
                action_batch_arr = np.array(action_batch)
                action_batch_multiplier = np.eye(2)[action_batch_arr]
                sampling = np.random.randint(0, len(y_batch), 64)
                
                y_batch_arr = np.array(y_batch).reshape((-1, 1)) * action_batch_multiplier

                self.variables_dict = read_value_from_theta(l_theta, self.variables_dict, self.sess)
                
                feed_dict = {self.variables_dict["input_observation"]: observation_batch[sampling, :],
                             self.variables_dict["y_true"]: y_batch_arr[sampling, :], 
                             self.variables_dict["y_action"]: action_batch_multiplier[sampling, :]}
                self.sess.run(self.train_step, feed_dict=feed_dict)


                l_theta = assign_value_to_theta(l_theta, self.variables_dict, self.sess)

            if epsilon>0.01:
                epsilon -= 0.895/50000

        return

In [39]:
class master_worker(mp.Process):
    
    def __init__(self, T_max=100, nb_env=10, env_name="CartPole-v0", model_option={"n_hidden":1, "hidden_size":[10]}, 
                 verbose=False, **kwargs):
        import tensorflow as tf
        
        super(master_worker, self).__init__(**kwargs)
        self.T_max = T_max
        self.env = gym.make(env_name)
        self.nb_env = nb_env
        
        self.variables_dict = create_variable(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
        temp = build_model(self.variables_dict, n_hidden=model_option["n_hidden"], 
                                 hidden_size=model_option["hidden_size"])
        self.variables_dict["y"], self.loss, self.train_step = temp
        
        self.sess = tf.Session()
        
        self.sess.run(tf.global_variables_initializer())
        
        
        
        
    def run(self):
        global T, l_theta, l_theta_minus
        import tensorflow as tf
        
        self.variables_dict = read_value_from_theta(l_theta, self.variables_dict, self.sess)

        epsilon = 0.01
        observation = self.env.reset()

        for i in range(self.nb_env):
            t = 0
            while t<self.T_max:
                t += 1
                env.render()

                action = epsilon_greedy_policy(self.variables_dict, observation, epsilon, self.env, self.sess)

                observation, reward, done, info = self.env.step(action) 

                if done:
                    print("Environment completed in %s timesteps"%t)
        return

In [42]:
def main(nb, T_max=100,  model_option={"n_hidden":1, "hidden_size":[10]}, env_name="CartPole-v0"):
    global T, l_theta, l_theta_minus
    T = mp.Value('i', 0)
    
    l_theta = initialise(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
    l_theta_minus = initialise(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
    
    jobs = []
    for i in range(nb):
        print("Process %s starting"%i)
        job = slave_worker(T_max=T_max, model_option=model_option, env_name=env_name)
        job.run()
        jobs.append(job)
    
    t_init = time.time()
    while T.value<T_max:
        if time.time() - t_init > 10:
            print(T.value)
            t_init = time.time()
    print("Training completed")
    
    exemple = master_worker(T_max=T_max, model_option=model_option, env_name=env_name)
    exemple.run()
    
    """model.set_weights(theta.value)
    
    env = gym.make(env_name)
    observation = env.reset()

    for t in range(100000):
        env.render()
        #print(model.predict(np.append(observation, [0])), model.predict(np.append(observation, [1])))
        action = epsilon_greedy_policy(model, observation, 0.01)
        #action = weighted_choice(model, observation)[0]
        #print("NEXT")
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
    """

global T, theta, theta_minus
model_option={"n_hidden":1, "hidden_size":[10]}
env_name="CartPole-v0"
T = mp.Value('i', 0)

model = build_model(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
theta = mp.Array(ctypes.c_double, len(model.get_weights()))
theta.value = model.get_weights()
theta_minus  = mp.Array(ctypes.c_double, len(theta.value))
theta_minus.value = theta.value

theta.value

In [43]:
main(1)

[2017-02-23 10:42:39,437] Making new env: CartPole-v0


Process 0 starting
OK
0
Je passe
Je casse
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
1
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
2
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
3
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
4
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
5
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
6
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
7
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
8
Je passe
Je casse
Checkpoint 1
Je passe
Je passe
Je casse
Je casse
Checkpoint 2
Checkpoint 3
Checkpoint 4
Checkpoint 5
9
Checkpoint 1


KeyboardInterrupt: 

In [None]:
a = ["W1", "W2", "Wo", "b1", "b2", "bo"]
a.sort()
a

In [None]:
def one_step_slave(T_max=100, Itarget=15, Iasyncupdate=10, gamma=0.9, learning_rate=0.001, 
                   env_name="CartPole-v0", model_option={"n_hidden":1, "hidden_size":[10]}, verbose=False):
    global T, l_theta, l_theta_minus

    env = gym.make(env_name)
    
    print("Environnment created")
    
    rmsprop = RMSprop(lr=learning_rate)
    
    print("Optimiser initialized")
    
    model = build_model(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
    
    print("Model created")
    

    shapes = [weight.shape for weight in model.get_weights()]
    weights = read_value(l_theta, shapes)
    
    print("Weights loaded")
    
    model.set_weights(weights)
    
    print("Weights Initialized")
    
    model.compile(loss='mse', optimizer=rmsprop)
    
    print("Model compiled")
    
    modelminus = build_model(n_hidden=model_option["n_hidden"], hidden_size=model_option["hidden_size"])
    modelminus.set_weights(weights)
    modelminus.compile(loss='mse', optimizer=rmsprop)
    
    print("Modelminus compiled")
    
    epsilon = 0.9
    t = 0
    x_batch = 0
    y_batch = []
    nb_env = 0
    firstiter=True
    observation = env.reset()
    t_init = t
    
    print("T_max = %s"%T_max)
    
    while T.value<T_max:
        
        modelminus.set_weights(read_value(l_theta_minus, shapes))
        
        action = epsilon_greedy_policy(model, observation, epsilon, env)
        
        observationprime, reward, done, info = env.step(action) 
        
        if t - t_init > 200:
            done = True

        if done:
            y = reward
            observationprime = env.reset()
            t_init = t + 1
            nb_env += 1
        else:
            y = reward + gamma * best_reward(modelminus, observationprime)

        if firstiter:
            firstiter=False
            observation_batch = observation
            action_batch = [action]
        else:
            observation_batch = np.vstack((observation_batch, observation))
            action_batch.append(action)
        y_batch.append(y)
        observation = observationprime
        with T.get_lock():
            T.value += 1
        t += 1
        if T.value %Itarget == 0:
            for i, theta_minus in enumerate(l_theta_minus):
                with theta_minus.get_lock():
                    l_theta_minus[i].value = l_theta[i].value
        if t %Iasyncupdate == 0:
            action_batch_arr = np.array(action_batch).reshape((-1, 1))
            sampling = np.random.randint(0, len(y_batch), 64)
                    
            model.set_weights(read_value(theta, shapes))
            model.train_on_batch([observation_batch[sampling, :], action_batch_arr[sampling, :]], 
                      np.array(y_batch).reshape((-1, 1))[sampling, :])
            
            
            for i in range(len(l_theta)):
                with l_theta[i].get_lock():
                    l_theta[i].value = model.get_weights()[i]

        if epsilon>0.01:
            epsilon -= 0.895/50000
        
    return

In [None]:
gamma = 0.9
Itarget = 15
Iasyncupdate = 10
nb_iter = 200000
learning_rate = 0.001
history = []

# Initialisation
env = gym.make('CartPole-v0')
#model = build_model(n_hidden=3, hidden_size=[128, 64, 32])
model = build_model()
rmsprop = RMSprop(lr=learning_rate)
model.compile(loss='mse', optimizer=rmsprop)

#modelminus = build_model(n_hidden=3, hidden_size=[128, 64, 32])
modelminus = build_model()
modelminus.set_weights(model.get_weights())
modelminus.compile(loss='mse', optimizer=rmsprop)

theta = model.get_weights()
T = 0
t = 0
epsilon = 0.9
x_batch = 0
y_batch = []
nb_env = 0
verbose = 20
max_env = 50000
firstiter=True
observation = env.reset()
t_init = t


callback = History()
last_reward = np.zeros(100)
iteration = 0
while nb_env<max_env:
    iteration += 1
    if nb_env %verbose == 0:
        env.render()
    if iteration%10000 == 0:
        print("Iteration %s"%iteration)
    
    
    action = epsilon_greedy_policy(model, observation, epsilon)
    observationprime, reward, done, info = env.step(action) 
    if t - t_init > 200:
        done = True

    if done:
        y = reward
        observationprime = env.reset()
        if nb_env %verbose == 0:
            if t-t_init < 195:
                print("Episode %s finished in %s frames"%(nb_env, t-t_init))
            else:
                print("Episode %s achieved with %s frames"%(nb_env, t-t_init))
        last_reward = np.roll(last_reward, 1)
        last_reward[0] = t - t_init
        if np.mean(last_reward)>195:
            print("Environnement completed in %s iterations and %s episode"%(iteration, nb_env))
            break
        t_init = t+1
        nb_env += 1
    else:
        y = reward + gamma * best_reward(modelminus, observationprime)
    
    if firstiter:
        firstiter=False
        observation_batch = observation
        action_batch = [action]
    else:
        observation_batch = np.vstack((observation_batch, observation))
        action_batch.append(action)
    y_batch.append(y)
    #history.append(model.loss(observation, y, action))
    observation = observationprime
    T += 1
    t += 1
    if T %Itarget == 0:
        modelminus.set_weights(model.get_weights())
    if t %Iasyncupdate == 0:
        action_batch_arr = np.array(action_batch).reshape((-1, 1))
        
        sampling = np.random.randint(0, len(y_batch), 64)
        permut = np.random.permutation(Iasyncupdate)
        model.train_on_batch([observation_batch[sampling, :], action_batch_arr[sampling, :]], 
                  np.array(y_batch).reshape((-1, 1))[sampling, :])
        #y_batch = []
        #firstiter=True
        
    if epsilon>0.01:
        epsilon -= 0.895/80000
        
    #if nb_env %600 == 0:
        #print("RESET")
        #epsilon = 0.9

In [None]:
model = build_model()

In [None]:
model_bis = keras.models.Model.from_config(model.get_config())

In [None]:
class my_worker(multiprocessing.Process):
    
    def __init__(self, **args):
        multiprocessing.Process.__init__(self, args)

In [None]:
a = my_worker(group=None)

In [None]:
import threading
import Queue
import time
import cython
import multiprocessing as mp

import numpy as np

In [None]:
theta=mp.Array(ctypes.c_double,6)

In [None]:
theta.value = np.array([1, 2, 3, 4, 5])

In [None]:
print(type(theta))

In [None]:
model = build_model()
a = model.get_weights()
len(a)

In [None]:
theta.value=a
#theta.value

In [None]:
n = 100
m = 100000

In [None]:
def target_fonction(queue):
    for i in range(n):
        counter = 0
        while counter<m:
            counter += 1
        queue.put(1)
    print("Done")
    return

In [None]:
class my_worker(multiprocessing.Process):
    
    def __init__(self, n, nb, **kwargs):
        self.a = 0
        self.n=n
        self.nb=nb
        super(my_worker, self).__init__(**kwargs)
    
    def run(self):
        global a
        while a.value < self.nb*self.n:
            counter = 0
            while counter<m:
                counter += 1
            with a.get_lock():
                a.value += 1
        print("Done")
        return

In [None]:
p = my_worker(200, 10)

In [None]:
def master_function(nb):
    global b
    global a
    jobs = []
    for i in range(nb):
        p = my_worker(n)
        p.start()
        jobs.append(p)
    counter = 0
    while a.value < nb * n:
        counter += 1
        if counter %50000000 == 0:
            print("a = %s"%a.value)
    b = 1
    print("a = %s"%a.value)
    return

In [None]:
def main(nb):
    global a
    a = multiprocessing.Value('i', 0)
    jobs = []
    for i in range(nb):
        p = my_worker(n, nb)
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()
    print("a = %s"%a.value)

In [None]:
t_init = time.time()
main(3)
print("Done in %s seconds"%(time.time()-t_init))

In [None]:
t_init = time.time()
main(3)
print("Done in %s seconds"%(time.time()-t_init))

In [None]:
def main_bis(nb):
    a = 0
    for i in range(nb * n):
        counter = 0
        while counter<m:
            counter += 1
        a += 1
    print("a = %s"%a)

In [None]:
t_init = time.time() 
main_bis(3)
print("Done in %s seconds"%(time.time()-t_init))

In [None]:
t_init = time.time() 
main_bis(3)
print("Done in %s seconds"%(time.time()-t_init))

In [None]:
t_init = time.time() 
counter = 0
while counter<m:
    counter += 1
print("Done in %s seconds"%(time.time()-t_init))