In [89]:
import logging
from models import *
import gym
import numpy as np
from agents import *
import itertools
import multiprocessing as mp

from collections import namedtuple
from queue import Queue
RewardsItem = namedtuple('RewardsItem', field_names=['reward', 'steps'])

In [90]:
env = gym.make("CartPole-v0")
obs = env.reset()

In [91]:
agent = Agent(env = env)
param_queue = mp.Queue()
param_queue.put(agent.net.state_dict())
rewards_queue = mp.Queue()



In [95]:
p = mp.Process(target=worker, args=("CartPole-v0", 10, param_queue, rewards_queue))
p.start()
rewards_queue.empty()

True

In [96]:
param_queue.put(agent.net.state_dict())

In [81]:
RewardsItem = namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps'])

def worker(env_id, worker_id, param_queue, rewards_queue):
    print("lol")
    env = gym.make(env_id)
    agent = Agent(env = env)        
    while not param_queue.empty():
        params = param_queue.get()  

        agent.net.load_state_dict(params)

        for _ in range(es.iters_per_update):
            # get a random seed
            seed = np.random.randint(1e6)
            # set the new seed
            np.random.seed(seed)

            noise = np.array(es.sample_noise(agent))
            pos_rew, pos_steps = es.evaluate_noisy(agent, noise, env)
            neg_rew, neg_steps = es.evaluate_noisy(agent, -noise, env)
            rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_rew, neg_reward=neg_rew, steps=pos_steps+neg_steps))
        break
    pass

class EvolutionStrategies():
    def __init__(self, 
                 env_id,
                 iters_per_update = 10,
                 noise_std = 10,
                 lr = 1e-03,
                 num_workers = 4,
                 population_size = 50
                ):
        
        self.env_id = env_id
        self.policy = Agent(env = env)
        self.iters_per_update = iters_per_update
        self.SIGMA = noise_std
        self.POPULATION = population_size
        self.num_workers = num_workers
        self.lr = lr
        
    def evaluate(self, policy, env):
        obs = env.reset()
        total_r = 0
        total_steps = 0
        for t in itertools.count():
            actions, values, _ = policy.act(obs)
            obs, reward, done, info = env.step(actions.numpy())
            total_r += reward
            total_steps += t
            if done:
                break
        return total_r, total_steps
    
    def evaluate_noisy(self, policy, noise, env):
        weights = policy.net.state_dict()
        # add the noise to each parameter of the NN

        for n, p in zip(noise, policy.net.parameters()):
            p.data += torch.FloatTensor(n * self.SIGMA)
            
        reward, steps = self.evaluate(policy, env)
        policy.net.load_state_dict(weights)
        
        return reward, steps
       
    
    def sample_noise(self, policy):
        noise_list = list()
        for param in policy.net.parameters():
            noise = np.random.normal(size = param.data.numpy().shape)
            noise_list.append(noise)
        return noise_list

    
    def train_step(self, policy, batch_noise, batch_rewards, step_idx):
        """
        Optimizes the weights of the NN based on the rewards and noise gathered
        """
        # normalize rewards to have zero mean and unit variance
            
        norm_reward = np.array(batch_reward)
        norm_reward = (norm_reward - np.mean(norm_reward)) / np.std(norm_reward)

        weighted_noise = None
        for noise, reward in zip(batch_noise, norm_reward):
            if weighted_noise is None:
                weighted_noise = [reward * p_n for p_n in noise]
            else:
                for w_n, p_n in zip(weighted_noise, noise):
                    w_n += reward * p_n


        for p, p_update in zip(policy.net.get_weights(), weighted_noise):
            update = p_update / (len(batch_reward)*self.SIGMA)
            p += self.lr * update
            
    def learn(self):
        iterations = 100
        
        params_queues = [mp.Queue(maxsize=1) for _ in range(4)]
        rewards_queue = mp.Queue(maxsize=self.iters_per_update)
        
        workers = []
                
        for idx, params_queue in enumerate(params_queues):
            proc = mp.Process(target=worker, args=(self.env, idx, params_queue, rewards_queue))
            print(proc)
            proc.start()
            workers.append(proc)
    
        step_idx = 0
        reward_history = []
        reward_max =[]
        reward_std = []
        print(self.env)
        
        for step_idx in range(iterations):
            # broadcasting network params
            params = self.policy.net.state_dict()
            for q in params_queues:
                q.put(params)
                           
            batch_noise = []
            batch_reward = []
            batch_steps_data = []
            batch_steps = 0
            results = 0
            print(batch_reward)
            while True: 
                print(rewards_queue.empty())
                while not rewards_queue.empty():
                    reward = rewards_queue.get_nowait()
                    np.random.seed(reward.seed) # sets the seed of the current worker rewards
                    noise, neg_noise = sample_noise(brain)
                    batch_noise.append(noise)
                    batch_reward.append(reward.pos_reward)
                    batch_noise.append(neg_noise)
                    batch_reward.append(reward.neg_reward)
                    results += 1
                    batch_steps += reward.steps

                if results == self.num_workers * self.iters_per_update:
                    break
                    
                step_idx += 1
                m_reward = np.mean(batch_reward)
                reward_history.append(m_reward)
                reward_std.append(np.std(batch_reward))
                if m_reward > 199:
                    print("\nSolved the environment in {} steps".format(step_idx))
                    break
                    
                print(batch_reward)
                self.train_step(self.policy, batch_noise, batch_reward, step_idx)

                print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True)
                
            for w, p_queue in zip(workers, params_queues):
                p_queue.put(None)
                w.join()



            

In [69]:
es = EvolutionStrategies(env)

In [70]:
es.learn()

<Process(Process-53, initial)>
<Process(Process-54, initial)>
<Process(Process-55, initial)>
<Process(Process-56, initial)>
<TimeLimit<CartPoleEnv<CartPole-v0>>>
[]
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


KeyboardInterrupt: 