# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [147]:
######################## DIFFERENCE BETWEEN PONG AND OUR ENVIRONMENT ##############################
# num_agent : 20 -> 20 agent
# Size of each action : 4 -> One action is size(4) AND continuous.
# Difference Not (1) -> But
# In PONG : 1 last output probability of action 5 or 4.
#     # convert states to policy (or probability)
# new_probs = pong_utils.states_to_prob(policy, states)
# new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0-new_probs)
# the output is the probability of moving right
# P(left) = 1-P(right)
# outi​={xi​yi​​if conditioni​otherwise​}

In [4]:
env = UnityEnvironment(file_name="C:/Users/gabyc/Desktop/Reinforcment_TP/deep-reinforcement-learning/p2_continuous-control/Multi_agent/Reacher_Windows_x86_64/Reacher.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
import collections
from multiprocessing import Process
import torch.optim as optim

In [148]:
class Policy(nn.Module):

    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        self.nb_action = nb_action
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.fc1 = nn.Linear(input_size,150)
        self.fc2 = nn.Linear(150,75)
        self.fc3 = nn.Linear(75,nb_action)
        self.fc3bis = nn.Linear(75,nb_action)
        
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = F.tanh(self.fc3(x)) # Tanh because action_values between -1 and 1.
        #sigma = F.softplus(self.fc3bis(x))# Activation to stay always >= 0
        #sigma = torch.clamp(sigma,0.001) # Activation to stay always > 0
        sigma = torch.ones(self.nb_action,requires_grad=False).to(self.device)
        m = torch.distributions.normal.Normal(mu,sigma,False) # False, whereas constraint on mu = 0
        return m

In [149]:
def New_prob(policy,states,actions,device):
    # The Gradient FLOW on action
    # The Gradient fon't FLOW on state yet
    # No Clipping.
    Tab = []
    Action_sample_tab = []
    m = policy(states[0])
    
    proba = m.log_prob(actions[0])
    #probab = torch.exp(proba)
    #probab = torch.clamp(probab,0.001) ## Don't why there is negative Probability
    # Maybe deal with the Log without going to the exponential because of numeric diff
    
    # MAYBE CLIPPING AND MAYBE STILL SAMPLE SOMETHING TO DO (At -at)
    #action_sample = torch.clip(sample.detach(), -1, 1)
    #sample = m.sample()#.detach()
    
    # STORE
    Tab.append(proba)
    Action_sample_tab.append(actions[0])
    
    # Loop over the state and action (a,s)
    for state_iter,action_iter in zip(states[1:],actions[1:]):
        m = policy(state_iter)
        #sample = m.sample()#.detach()
        proba = m.log_prob(action_iter) # Prob on the previous action but new policy
        #probab = torch.exp(proba)
        #probab = torch.clamp(probab,0.001)
        
        # STORE
        Tab.append(proba)
        Action_sample_tab.append(action_iter)

    return torch.stack(Tab),torch.stack(Action_sample_tab)

In [150]:
def clipped_surrogate(device,policy, old_probs,actions, states, rewards,batch_size,
                      discount = 0.995, epsilon=0.1, beta=0.01):
    
    old_probs = torch.stack(old_probs)
    
    # Convert REWARD TO REWARD FUTURE
    rewards = np.asarray(rewards)
    reward_futur = np.zeros((rewards.shape[0],rewards.shape[1]))
    longueur = rewards.shape[0] - 1
    reward_futur[longueur] = rewards[longueur]
    new_discount = 0
    for i in range(1,rewards.shape[0]):
        new_discount = discount**(longueur-i) 
        reward_futur[longueur-i] = reward_futur[longueur-(i-1)] + rewards[longueur-i]*new_discount
        
    # Compute normalize reward
    mean = np.mean(reward_futur, axis=1)
    std = np.std(reward_futur, axis=1)+1.0e-10
    normalized_rewards = (reward_futur-mean[:, np.newaxis])/std[:, np.newaxis]
    normalized_rewards = torch.from_numpy(normalized_rewards).float().to(device)
    normalized_rewards = normalized_rewards.unsqueeze(2)
    normalized_rewards = normalized_rewards.repeat(1, 1, old_probs.shape[2])
    
    ### SHUFFLE AND MAKING CHUNK ##
    indexes = torch.randperm(old_probs.shape[0])
    indexes_numpy = indexes.numpy().astype('int')
    
    #states = np.asarray(states)[indexes_numpy]
    #actions = np.asarray(actions)[indexes_numpy]
    #normalized_rewards = normalized_rewards[indexes]
    #old_probs = old_probs[indexes]
    Nb_split = int(old_probs.shape[0]/batch_size)
    
    indices = torch.split(torch.from_numpy(np.arange(0,old_probs.shape[0],1)),batch_size,0)
    
    for chunks in indices:
        chunk = chunks.long()
        chunk_numpy = chunk.numpy().astype('int')
        
        states_chunk = torch.stack(states)[chunk]
        actions_chunk = torch.stack(actions)[chunk]
        normalized_rewards_chunk = normalized_rewards[chunk]
        old_prob_chunk = old_probs[chunk]
        
        new_prob_chunk,action_sample_chunk = New_prob(policy, states_chunk,actions_chunk,device)
    
        # Compute each 
        Fraction = torch.exp(new_prob_chunk-(old_prob_chunk+1e-10))
        Cote1 = normalized_rewards_chunk*Fraction #*(action_sample-mu) 
        Cote2 = normalized_rewards_chunk*torch.clamp(Fraction, 1-epsilon, 1+epsilon) #*(action_sample-mu)
        Cote1 = Cote1[:, :,:, None]
        Cote2 = Cote2[:, :,:, None]
        comp = torch.cat((Cote1, Cote2),3)
        Gradient = torch.min(comp,3)[0].to(device)
        #print("There is Nan Gradient")
        #print(torch.isnan(Gradient).any())
        #print(Gradient)


        entropy = -(torch.exp(new_prob_chunk)*old_prob_chunk+1.e-10)+ \
            (1.0-torch.exp(new_prob_chunk))*(1.0-old_prob_chunk+1.e-10) # Non definit si une valeur est inférieure à 0
        #print("There is Nan entropy") 
        #print(torch.isnan(entropy).any())
        #print(torch.mean(beta*(entropy) + Gradient))
        L = - torch.mean(beta*(entropy) + Gradient)
        #print(L)

        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L
        

In [151]:
def collect_trajectories(env,env_info,policy,device,tmax):
    # DEAL WITH THAT OLD_PROB AND ACTION ARE DIFFERENT NOW.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    state = env_info.vector_observations # get the current state (for each agent)
    states_tab , action_tab, reward_tab, prob_tab = [],[],[], []
    t = 0
    while True:
        state = torch.from_numpy(state).to(device)
        policy.eval()
        with torch.no_grad(): # Everything with torch no grad.
            #proba,action_sample,mu = policy(state) # Batch of 21
            m = policy(state) 

        
            # Sample maybe on gradient as to check that
            sample = m.sample() 
            action_tab.append(sample) # No clip and store

            # Proba not on clip and detach from Gradient.
            proba = m.log_prob(sample)
            #proba = torch.exp(proba) #Proba on CUDA no detach
            
            # Interact with the environment 
            sample = torch.clip(sample.detach().cpu(), -1, 1) # CLIP BEFORE TAKING THE PROBA OR AFTER?
            sample = sample.numpy()


            # Step the environment
            env_info = env.step(sample)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            # Store values
            prob_tab.append(proba)
            reward_tab.append(np.asarray(rewards))
            states_tab.append(state)

            # BREAK IF END OF THE EPISODE
            if np.any(dones):                                  # exit loop if episode finished
                break
            if t >= tmax:
                break
            state = next_states
            t +=1
    return states_tab, action_tab, reward_tab,prob_tab

# Launch Main code

In [159]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]  
states = env_info.vector_observations # get the current state (for each agent
num_agents = len(states)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy = Policy(nb_states,action_size).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)

In [160]:
###################################################### MAdddIN_CODE #################################################
# training loop max iterations
episode = 3000

# widget bar to display progress
#!pip install progressbar
#import progressbar as pb
#widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA() ]
#timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

tmax = 200
discount_rate = .99
epsilon = 0.1
beta = .01
SGD_epoch = 4
batch_size = 200

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    states, actions, rewards,prob = collect_trajectories(env,env_info, policy,device,tmax)
    total_rewards = np.mean(rewards)
    print(total_rewards)

    # gradient ascent step
    for _ in range(SGD_epoch):
        
        # uncomment to utilize your own clipped function!
        clipped_surrogate(device,policy,prob,actions, states, rewards,batch_size, epsilon=epsilon, beta=beta)
        #L.requires_grad_() # I needed to do that to compute something but maybe that means that there is a bug.
    
    # the clipping parameter reduces as time goes on
    #epsilon*=.9999
    
    # the regulation term also reduces
    # this reduces exploration in later runs
    #beta*=.9999
    
    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))
    
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)
        
    # update progress widget bar
    #timer.update(e+1)
    
#timer.finish()

2.4875621334533785e-06
0.0012313432560594223
0.00091293530297739
0.0006716417760324122
0.00020202019750469862
0.0008980099301766697
0.0010124377883155252
0.0009004974923101231
0.0011094527115202068
0.0013959390550927462
0.0008930348059097629
0.0011716417648565413
0.0009950248533813514
0.0006542288410982386
0.0008959390662686171
0.0005771144149611839
0.001077114403785313
0.0018805969728907543
0.0014577114102036798
0.0008324872910371287
################################
Episode: 20, score: 0.000832
0.0008324872910371287
0.0010099502261820717
0.0013109452443299306
0.0007860696341712676
0.0010721392795184062
0.0010939086049908612
0.0007164178944345731
0.00128109449872849
0.0011766168891234482
0.0013830845462000786
0.001253807078574212
0.0009577114213795508
0.0006019900362957177
0.0011318407707212872
0.0012487561909935961
0.001479695398398311
0.0013980099190007988
0.0017338308070170049
0.0009751243563137245
0.0009477611728457373
0.0010329949007686322
################################
Episode:

0.0021370557897965317
################################
Episode: 320, score: 0.002137
0.0021370557897965317
0.0017960198603533394
0.0021641790561044393
0.0008134328176392548
0.0023905472102486967
0.0016243654459261046
0.0016119402624777894
0.002176616866771706
0.0021716417425047996


KeyboardInterrupt: 

# Hyper-Search parameter

In [None]:
episode = 150
tmax = 200
SGD_epoch_tab = np.array([4,6,10])
lr_tab = np.array([1e-4,5e-4,1e-3])
discount_rate_tab = np.array([.99,.95,.90])
epsilon_tab = np.array([0.5,0.3,0.1])
beta_tab = np.array([.01,.03,.06])

In [78]:
Data = []
for SGD_epoch in SGD_epoch_tab:
    for lr in lr_tab:
        for discount_rate in discount_rate_tab:
            for epsilon in epsilon_tab:
                for beta in beta_tab:
                    param_dico = {"SGD_epoch":SGD_epoch,"lr":lr,"discount_rate":discount_rate,"epsilon":epsilon,"beta":beta}
                    # get the default brain
                    brain_name = env.brain_names[0]
                    brain = env.brains[brain_name]
                    env_info = env.reset(train_mode=True)[brain_name]  
                    states = env_info.vector_observations # get the current state (for each agent
                    num_agents = len(states)
                    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
                    nb_states = len(states[0])
                    action_size = brain.vector_action_space_size
                    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                    policy = Policy(nb_states,action_size).to(device)
                    optimizer = optim.Adam(policy.parameters(), lr=lr)

                    ###################################################### MAIN_CODE #################################################
                    # training loop max iterations


                    # widget bar to display progress
                    #!pip install progressbar
                    #import progressbar as pb
                    #widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA() ]
                    #timer = pb.ProgressBar(widgets=widget, maxval=episode).start()


                    #discount_rate = .99
                    #epsilon = 0.5
                    #beta = .01
                    #tmax = 320
                    #SGD_epoch = 6

                    # keep track of progress
                    mean_rewards = []
                    print(param_dico)
                    epsilon2 = epsilon

                    for e in range(episode):

                        # collect trajectories
                        states, actions, rewards,prob = collect_trajectories(env,env_info, policy,device,tmax)
                        total_rewards = np.mean(rewards)

                        # gradient ascent step
                        for _ in range(SGD_epoch):

                            # uncomment to utilize your own clipped function!
                            L = -clipped_surrogate(device,policy,prob,actions, states, rewards,discount =discount_rate, epsilon=epsilon, beta=beta)
                            #L.requires_grad_() # I needed to do that to compute something but maybe that means that there is a bug.
                            optimizer.zero_grad()
                            L.backward()
                            optimizer.step()
                            del L

                        # the clipping parameter reduces as time goes on
                        epsilon2*=.995

                        # the regulation term also reduces
                        # this reduces exploration in later runs
                        beta*=.995

                        # get the average reward of the parallel environments
                        mean_rewards.append(total_rewards)

                        # display some progress every 20 iterations
                        if (e+1)%20 ==0 :
                            print("################################")
                            print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
                            print(total_rewards)
                    Data.append([param_dico,mean_rewards])

{'SGD_epoch': 4, 'lr': 0.0001, 'discount_rate': 0.99, 'epsilon': 0.5, 'beta': 0.01}
################################
Episode: 20, score: 0.000924
0.000923857847370472
################################
Episode: 40, score: 0.000893
0.0008934009952593576
################################
Episode: 60, score: 0.000997
0.0009974619066389987
################################
Episode: 80, score: 0.001617
0.001616751232898326
################################
Episode: 100, score: 0.001358
0.001357867989953853
################################
Episode: 120, score: 0.002084
0.0020837562986020814
################################
Episode: 140, score: 0.002299
0.002299492334389142
{'SGD_epoch': 4, 'lr': 0.0001, 'discount_rate': 0.99, 'epsilon': 0.5, 'beta': 0.03}
################################
Episode: 20, score: 0.000152
0.00015228426055557232
################################
Episode: 40, score: 0.000635
0.0006345177523148847
################################
Episode: 60, score: 0.000607
0.000606598971

################################
Episode: 40, score: 0.000898
0.0008984771372778766
################################
Episode: 60, score: 0.000442
0.0004416243556111597
################################
Episode: 80, score: 0.001162
0.0011624365222408686
################################
Episode: 100, score: 0.000693
0.0006928933855278541
################################
Episode: 120, score: 0.000886
0.0008857867822315789
################################
Episode: 140, score: 0.000571
0.0005710659770833962
{'SGD_epoch': 4, 'lr': 0.0001, 'discount_rate': 0.95, 'epsilon': 0.3, 'beta': 0.03}
################################
Episode: 20, score: 0.000447
0.0004467004976296788
################################
Episode: 40, score: 0.000685
0.0006852791725000754
################################
Episode: 60, score: 0.001467
0.0014670050433520133
################################
Episode: 80, score: 0.001008
0.0010076141906760368
################################
Episode: 100, score: 0.001061
0.00106091

################################
Episode: 80, score: 0.001525
0.0015253806765649826
################################
Episode: 100, score: 0.000655
0.0006548223203889609
################################
Episode: 120, score: 0.001464
0.0014644669723427538
################################
Episode: 140, score: 0.001761
0.0017614212804261198
{'SGD_epoch': 4, 'lr': 0.0001, 'discount_rate': 0.9, 'epsilon': 0.1, 'beta': 0.03}
################################
Episode: 20, score: 0.000371
0.0003705583673518926
################################
Episode: 40, score: 0.000698
0.0006979695275463731
################################
Episode: 60, score: 0.000558
0.0005583756220370985
################################
Episode: 80, score: 0.000404
0.00040355329047226665
################################
Episode: 100, score: 0.001145
0.001144670025176052
################################
Episode: 120, score: 0.000860
0.0008604060721389836
################################
Episode: 140, score: 0.001170
0.0011700

################################
Episode: 120, score: 0.002063
0.002063451730528005
################################
Episode: 140, score: 0.001650
0.0016497461560187
{'SGD_epoch': 4, 'lr': 0.0005, 'discount_rate': 0.95, 'epsilon': 0.5, 'beta': 0.03}
################################
Episode: 20, score: 0.000848
0.0008477157170926859
################################
Episode: 40, score: 0.001023
0.0010228426167315941
################################
Episode: 60, score: 0.001487
0.0014873096114260897
################################
Episode: 80, score: 0.001843
0.001842639552722425
################################
Episode: 100, score: 0.001668
0.0016675126530835169
################################
Episode: 120, score: 0.002657
0.002657360346694737
################################
Episode: 140, score: 0.001439
0.0014390862622501584
{'SGD_epoch': 4, 'lr': 0.0005, 'discount_rate': 0.95, 'epsilon': 0.5, 'beta': 0.06}
################################
Episode: 20, score: 0.000906
0.0009060913503

{'SGD_epoch': 4, 'lr': 0.0005, 'discount_rate': 0.9, 'epsilon': 0.3, 'beta': 0.03}
################################
Episode: 20, score: 0.000670
0.0006700507464445182
################################
Episode: 40, score: 0.000883
0.0008832487112223194
################################
Episode: 60, score: 0.001084
0.001083756320953823
################################
Episode: 80, score: 0.000987
0.0009873096226019605
################################
Episode: 100, score: 0.001102
0.0011015228180186398
################################
Episode: 120, score: 0.002096
0.002096446653648379
################################
Episode: 140, score: 0.001025
0.0010253806877408536
{'SGD_epoch': 4, 'lr': 0.0005, 'discount_rate': 0.9, 'epsilon': 0.3, 'beta': 0.06}
################################
Episode: 20, score: 0.000992
0.0009923857646204795
################################
Episode: 40, score: 0.001218
0.0012182740844445786
################################
Episode: 60, score: 0.001091
0.0010913705339

################################
Episode: 40, score: 0.001723
0.0017233502152872266
################################
Episode: 60, score: 0.002982
0.002982233435879958
################################
Episode: 80, score: 0.002675
0.002675126843759554
################################
Episode: 100, score: 0.004112
0.004111675035000453
################################
Episode: 120, score: 0.002668
0.002667512630731775
################################
Episode: 140, score: 0.004084
0.0040837562538985975
{'SGD_epoch': 4, 'lr': 0.001, 'discount_rate': 0.99, 'epsilon': 0.1, 'beta': 0.06}
################################
Episode: 20, score: 0.001426
0.0014263959072038607
################################
Episode: 40, score: 0.000975
0.0009746192675556629
################################
Episode: 60, score: 0.001619
0.0016192893039075856
################################
Episode: 80, score: 0.004228
0.004228426301426391
################################
Episode: 100, score: 0.004201
0.00420050752032

KeyboardInterrupt: 

In [137]:
# "SGD_epoch":SGD_epoch,"lr":lr,"discount_rate":discount_rate,"epsilon":epsilon,"beta":beta}
Databis = Data
Databis = np.asarray(Databis)
Moyenne = []
mean_iter = 50
Var = "beta"
Value = 0.06
for i,data in enumerate(Databis):
    for key, value in data[0].items():
        if (key == Var and value == Value):
            Moyenne.append(np.mean(data[1][130:]))
Moyenne = np.asarray(Moyenne)
print(np.mean(Moyenne))

0.001469245523751156


In [None]:
# Discount rate = 0.99 max
# lr = 0.001 max
# epsilon = 0.1 min
# beta = 0.03 medium (moins clair)

In [79]:
import pickle

with open('Hyp_searchPPO.pkl', 'wb') as file:
    # A new file will be created
    pickle.dump(Data, file)

In [None]:
# Whenn update the paramters
# Get the distribution
# Making the entropy term
# log_prob -> Distrib(log_prob(a)) # Avec a -> a[perm].clone() # From Make Batch # From
# When evaluation -> :#only used when evaluate the policy.Making the performance more stable
# When select action -> a = self.actor.dist_mode(state)
# EVALUATE -> Proba =0
# Select_action -> Interact with the environment 

# WHAT TO DO
# Render = FALSE -> Select_action a, log_prob(a) -> Don't put in Clip -> Put inside the memory.
# On a a et log_prob(a)

# Training : 
# -> States 
# Actor -> Get_dist : mu,sigma = forward(state)
# dist=  Normal(mu,sigma)
# return Distrib
# log_prob_a_now -> Distrib(log_prob(a[index])) OK 


# This function is differentiable, so gradients will flow back from the result of this operation to input.
# For -> s, a, td_target, adv, logprob_a

In [None]:
#One style of policy gradient implementation, popularized in [Mni+16] and well-suited for use
#with recurrent neural networks, runs the policy for T timesteps (where T is much less than the
#episode length), and uses the collected samples for an update.

In [None]:
# Each iteration, each of N (parallel) actors coThen we
#construct the surrogate loss on these NT timesteps of data, and optimize it with minibatch SGD
#(or usually for better performance, Adam [KB14]), for K epochs.llect T timesteps of data.
# Optimize surrogate L wrt, with K epochs and minibatch size M  NT

# Clipping = 0.2
# Horizon (2047) 512
# Minibatch size = 64, 4096
# Log stdev of action distribution LinearAnneal (-0.7,-1.6)
# GAE parameter 0.95
# 3x10-4
# Discount 0.99
# Num_epoch entre 10 et 15.

In [None]:
# Difficult to put the good beta for entropy rate
# Decay Rate of Variance <- Difficult Variance of the Normal Law
# Strong interaction of epsilon and N -> Seems good with very low Epsilon and N.

In [None]:
# IMPLEMENT ACTOR-CRITIC
# ACTOR-CRITIC CALCA
# IMPLEMENT REPLAY BUFFER
# IMPLEMENT CMA-ES