# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
import collections
import os
from multiprocessing import Process

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Location of the Reacher build. Set the REACHER_ENV_PATH environment variable to run the
# notebook on another machine; when it is unset the original local path is used.
REACHER_ENV_PATH = os.environ.get(
    "REACHER_ENV_PATH",
    "C:/Users/gabyc/Desktop/Reinforcment_TP/deep-reinforcement-learning/p2_continuous-control/Multi_agent/Reacher_Windows_x86_64/Reacher.exe",
)
env = UnityEnvironment(file_name=REACHER_ENV_PATH)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


# Network

In [4]:
class Policy(nn.Module):
    """Actor network: maps a state to a Normal distribution over actions.

    The mean comes from a three-layer MLP squashed with tanh; the standard
    deviation is fixed at 0.5 for every action dimension and is not learned.

    Args:
        input_size: number of features in one state vector.
        nb_action: number of action dimensions.
    """
    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        self.nb_action = nb_action
        # Kept as a public attribute for backward compatibility; forward() no longer
        # relies on it and follows the device of its input instead.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.fc1 = nn.Linear(input_size,200)
        self.fc2 = nn.Linear(200,75)
        self.fc3 = nn.Linear(75,nb_action)
        # Not used by forward(). Kept so that the state_dict keys and the order in which
        # parameters are initialised stay exactly as before.
        self.fc3bis = nn.Linear(75,nb_action)

    def forward(self, x):
        """Return the action distribution for `x`.

        Args:
            x: tensor whose last dimension is `input_size`; it is cast to float32.

        Returns:
            torch.distributions.Normal with mean in [-1, 1] and scale 0.5.
        """
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # tanh keeps the mean inside [-1, 1]; torch.tanh replaces the deprecated F.tanh.
        mu = torch.tanh(self.fc3(x))

        # Build sigma on the same device/dtype as mu so the module also works when it
        # lives on a device other than the one detected at construction time.
        sigma = torch.full((self.nb_action,), 0.5, device=mu.device, dtype=mu.dtype)
        # validate_args=False skips the distribution's argument checks.
        return torch.distributions.normal.Normal(mu, sigma, validate_args=False)

In [5]:
class Critic(nn.Module):
    """Critic network: maps a state to a single scalar value estimate.

    Args:
        input_size: number of features in one state vector.
    """
    def __init__(self,input_size):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(input_size,150)
        self.fc2 = nn.Linear(150,50)
        self.fc3 = nn.Linear(50,1) # 1 output -> Value estimate

    def forward(self, x):
        """Return the value estimate for `x`.

        Args:
            x: tensor whose last dimension is `input_size`; it is cast to float32.

        Returns:
            Tensor with the same leading dimensions as `x` and a last dimension of 1.
        """
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Linear output head. A ReLU here would clamp the estimate to >= 0 and, whenever
        # the pre-activation is negative, pass no gradient back, so the critic could get
        # stuck predicting 0.
        return self.fc3(x)

# Training

In [6]:
def New_prob(policy,states,actions,device):
    """Evaluate stored actions under the current policy.

    For every (state, action) pair the policy is called on the state and the
    log-probability of the stored action is taken from the returned distribution.
    Gradients flow through the policy call; nothing is clipped.

    Args:
        policy: callable returning an object with a `log_prob` method.
        states: sequence (or tensor indexed along dim 0) of states.
        actions: sequence (or tensor indexed along dim 0) of actions, aligned with `states`.
        device: unused; kept so existing callers keep working.

    Returns:
        Tuple `(log_probs, actions)`, both stacked along a new leading dimension.

    Raises:
        ValueError: if `states` or `actions` is empty.
    """
    if len(states) == 0 or len(actions) == 0:
        raise ValueError("New_prob needs at least one (state, action) pair")

    log_probs = []
    kept_actions = []
    # One loop over all pairs; the first pair needs no special handling.
    for state_iter, action_iter in zip(states, actions):
        distribution = policy(state_iter)
        log_probs.append(distribution.log_prob(action_iter))  # old action, new policy
        kept_actions.append(action_iter)

    return torch.stack(log_probs), torch.stack(kept_actions)

In [7]:
def clipped_surrogate(Delta_t,critic,device,policy, old_probs,actions, states, rewards,batch_size,
                      discount = 0.995, epsilon=0.1, beta=0.01):
    """Compute the PPO clipped-surrogate loss for one chunk and train the critic once.

    Side effects: logs to the module-level `writer` at step `iteration_all` (both are
    globals defined by other cells) and calls `TD_Training`, which performs one
    optimisation step on `critic`.

    Args:
        Delta_t: advantage estimates; detached here and repeated along dim 2 to match
            `old_probs.shape[2]` (so presumably (steps, agents, 1) -- TODO confirm).
        critic: value network passed on to `TD_Training`.
        device: torch device for the returned surrogate and the critic targets.
        policy: current policy, re-evaluated through `New_prob`.
        old_probs: log-probabilities of `actions` recorded at collection time (3-D).
        actions: actions taken during collection.
        states: states visited during collection.
        rewards: per-step rewards, array-like of shape (steps, agents).
        batch_size: unused; kept so existing callers keep working.
        discount: discount factor for the critic targets.
        epsilon: clipping range of the probability ratio.
        beta: weight of the exploration term.

    Returns:
        Scalar tensor: minus the mean of `beta * exploration_term + clipped_surrogate`.
    """
    # Discounted future rewards inside the chunk:
    #   reward_futur[t] = sum_{k >= t} discount**k * rewards[k]
    # Every step, including the last one, is scaled by discount**k (the last step used
    # to be left undiscounted). The copy makes the array contiguous for torch.from_numpy.
    rewards = np.asarray(rewards)
    step_discounts = discount ** np.arange(rewards.shape[0])
    discounted_rewards = rewards * step_discounts[:, np.newaxis]
    reward_futur = discounted_rewards[::-1].cumsum(axis=0)[::-1].copy()

    # Normalise the advantages over the whole chunk; the small constant guards against
    # a zero standard deviation.
    Delta_t = Delta_t.detach()
    Delta_t = Delta_t.repeat(1, 1, old_probs.shape[2])
    Delta_t = (Delta_t - Delta_t.mean()) / (Delta_t.std() + 1.0e-10)

    new_prob, _ = New_prob(policy, states, actions, device)

    # Probability ratio new/old; both inputs are log-probabilities, so no epsilon is needed.
    Fraction = torch.exp(new_prob - old_probs)
    unclipped = Delta_t * Fraction
    clipped = Delta_t * torch.clamp(Fraction, 1-epsilon, 1+epsilon)
    Gradient = torch.min(unclipped, clipped).to(device) # Surrogate function

    # NOTE(review): this term is written as for probabilities, but `old_probs` holds
    # log-probabilities, so it is not the entropy of the policy. Left unchanged because
    # replacing it alters the training dynamics -- confirm the intended exploration bonus.
    new_density = torch.exp(new_prob)
    entropy = -(new_density*old_probs+1.e-10)+ \
        (1.0-new_density)*(1.0-old_probs+1.e-10)

    writer.add_scalar('Entropy',torch.mean(beta*(entropy)),iteration_all)
    writer.add_scalar('Gradient',torch.mean(Gradient),iteration_all)

    MSE = TD_Training(critic,states,reward_futur,discount,device) # Critic network training
    writer.add_scalar('Loss/Critic',MSE,iteration_all)

    return -torch.mean(beta*(entropy) + Gradient)
        

In [8]:
def TD_Training(Critic,states,reward,discount,device,optimizer=None):
    """Run one optimisation step of the critic on a chunk of states.

    The loss is `0.5 * mean((discount * reward - V(state))**2)`.

    Args:
        Critic: value network (callable on one state tensor).
        states: tensor of states, iterated along dim 0; detached before use.
        reward: numpy array of targets with shape (steps, agents).
        discount: factor applied to `reward` to form the regression target.
            NOTE(review): callers already pass discounted returns, so this multiplies
            by the discount a second time -- confirm that this is intended.
        device: torch device for the targets.
        optimizer: optimiser that owns the critic's parameters. Defaults to the
            module-level `optimizer_c` so existing calls keep working.

    Returns:
        The loss as a detached scalar tensor.
    """
    if optimizer is None:
        # Fall back to the notebook-level critic optimiser.
        optimizer = optimizer_c

    states = states.detach()
    values = torch.stack([Critic(st) for st in states])

    target = discount * torch.from_numpy(reward).to(device).unsqueeze(2)
    # numpy rewards are float64; cast so the loss stays in the critic's own dtype.
    target = target.to(values.dtype)

    Loss = 0.5*(target - values).pow(2).mean() # Simple MSE Loss
    optimizer.zero_grad()
    Loss.backward()
    optimizer.step()
    return Loss.detach()

# Evaluation

In [9]:
def collect_trajectories(env,env_info,policy,device,tmax):
    """Roll the policy out in the environment and record the trajectory.

    Stepping stops after the first step on which any agent reports done, or after the
    step taken with the counter at `tmax` (so at most `tmax + 1` steps are stored).

    Side effects: puts `policy` in eval mode, steps `env`, and writes one histogram per
    action dimension to the module-level `writer` at step `iteration_all` (both are
    globals defined by other cells).

    Args:
        env: environment exposing `brain_names` and `step(actions)`.
        env_info: info object whose `vector_observations` gives the first state.
        policy: callable returning a distribution with `sample` and `log_prob`.
        device: torch device the states are moved to.
        tmax: step cap (see above).

    Returns:
        Tuple of lists `(states, actions, rewards, log_probs)` with one entry per step.
        Stored actions are the raw samples; only the copy sent to the environment is
        clipped to [-1, 1], and the log-probabilities refer to the raw samples.
    """
    brain_name = env.brain_names[0]
    state = env_info.vector_observations # get the current state (for each agent)
    states_tab , action_tab, reward_tab, prob_tab = [],[],[], []

    policy.eval()  # loop-invariant, so set once before stepping
    t = 0
    with torch.no_grad(): # No gradients are needed while collecting.
        while True:
            state = torch.from_numpy(state).to(device)
            m = policy(state)

            sample = m.sample()
            action_tab.append(sample) # No clip and store

            # Log-probability of the unclipped sample.
            proba = m.log_prob(sample)

            # The environment receives the action clipped to its valid range.
            env_action = torch.clip(sample.detach().cpu(), -1, 1).numpy()

            # Step the environment
            env_info = env.step(env_action)[brain_name]        # send all actions to the environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            # Store values
            prob_tab.append(proba)
            reward_tab.append(np.asarray(rewards))
            states_tab.append(state)

            # Stop at the end of the episode or once the step cap is reached.
            if np.any(dones) or t >= tmax:
                break
            state = next_states
            t +=1

    # One histogram per action dimension of the per-step mean sampled action.
    stacked_actions = torch.stack(action_tab)
    for action_index in range(stacked_actions.shape[2]):
        writer.add_histogram('MU/Sample_mu_action' + str(action_index),
                             torch.mean(stacked_actions[:,:,action_index],axis=1),
                             iteration_all)
    return states_tab, action_tab, reward_tab,prob_tab

In [10]:
def TD_evaluation(Critic,states,reward,discount,device):
    """Compute one-step TD errors along a collected trajectory.

    For each consecutive pair of states the error is
    `reward + discount * V(next_state) - V(state)`, evaluated without gradients and
    with the critic in eval mode. The per-step mean of the next-state values is logged
    as a histogram to the module-level `writer` at step `e` (both are globals defined
    by other cells).

    Args:
        Critic: value network.
        states: sequence of state tensors.
        reward: sequence of numpy reward arrays, one per step.
        discount: discount factor.
        device: torch device the rewards are moved to.

    Returns:
        Tensor of stacked TD errors, one entry per (reward, next state) pair.
    """
    td_errors = []
    next_values = []

    Critic.eval()
    with torch.no_grad():
        previous_value = Critic(states[0])

        for step_reward, next_state in zip(reward, states[1:]):
            next_value = Critic(next_state)
            next_values.append(next_value)

            # Rewards arrive as one value per agent; add a trailing axis to match the critic output.
            reward_column = torch.from_numpy(step_reward).to(device).unsqueeze(1)
            td_errors.append(reward_column + discount*next_value - previous_value)

            previous_value = next_value

    mean_next_values = torch.stack(next_values).mean(dim=1)
    writer.add_histogram('Values',mean_next_values,e)
    return torch.stack(td_errors)

In [11]:
def GAE_evaluation(Delta_t,discount,lambd):
    """Turn TD errors into generalised advantage estimates.

    Works backwards through time with the recursion
    `A[t] = discount * lambd * A[t+1] + Delta_t[t]`, starting from
    `A[last] = Delta_t[last]`. Adapted from https://github.com/numblr/drlnd-cocontrol.

    Args:
        Delta_t: 3-D tensor of TD errors with time on dim 0.
        discount: discount factor.
        lambd: GAE lambda.

    Returns:
        New tensor with the same shape as `Delta_t`; the input is not modified.
    """
    advantages = torch.zeros_like(Delta_t)
    last = Delta_t.size()[0] - 1

    # The final step has no successor, so its advantage is its own TD error.
    advantages[last,:,:] = Delta_t[last,:,:]
    for t in range(last - 1, -1, -1):
        advantages[t,:,:] = discount * lambd * advantages[t+1,:,:] + Delta_t[t,:,:]

    return advantages

# Launch Main code

In [12]:
# Setup cell: reads the environment's dimensions and builds the networks, optimisers and
# TensorBoard writer. All names defined here are globals used by the cells below.

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# Reset in training mode and keep the returned info for the default brain.
env_info = env.reset(train_mode=True)[brain_name]  
states = env_info.vector_observations # get the current state (for each agent)
num_agents = len(states)
# NOTE(review): `scores` is not read anywhere else in the visible notebook.
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])                             # size of one agent's observation vector
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# The policy is built before the critic; keep this order, since swapping it would change
# which random numbers each network is initialised with.
policy = Policy(nb_states,action_size).to(device) # Policy network
optimizer = optim.Adam(policy.parameters(), lr=2e-4)
critic = Critic(nb_states).to(device) # Critic network
# Critic optimiser; TD_Training reads this global by name.
optimizer_c = optim.Adam(critic.parameters(), lr=2e-4)
# TensorBoard writer; the training and evaluation helpers read this global by name.
writer = SummaryWriter()

In [None]:
###################################################### MAIN_CODE #################################################
# PPO training loop. Each episode collects one trajectory with the current policy, turns
# the critic's TD errors into GAE advantages, then runs SGD_epoch passes of
# clipped-surrogate updates over consecutive chunks of batch_size time steps.

# training loop max iterations
episode = 5000


tmax = 1000             # step cap handed to collect_trajectories
discount_rate = .9997
epsilon = 0.1           # clipping range of the probability ratio
beta = .01              # weight of the exploration term in the policy loss
SGD_epoch = 8
batch_size = 64         # consecutive time steps per chunk
lambd = 0.95            # GAE lambda

# keep track of progress
mean_rewards = []
# The config text used to include `aleatoire`, which is not defined anywhere in this
# notebook and raised NameError on a fresh kernel, so it is no longer referenced.
writer.add_text("CONFIG","tmax :" + str(tmax) + "batch_size :" + str(batch_size) + "discount_rate :" + str(discount_rate) + "epsilon" + str(epsilon)+ "beta" + str(beta) + "SGD_epoch :" + str(SGD_epoch) + "lambd :" + str(lambd) + "lr : 2e-4 x2")
# Global step counter; clipped_surrogate and collect_trajectories read it by name for logging.
iteration_all = 0
for e in range(episode):  # TD_evaluation reads `e` by name for logging

    # EVALUATION STEP
    # Reset before every rollout so the first state handed to the policy is the
    # environment's current observation rather than the one from the initial reset.
    env_info = env.reset(train_mode=True)[brain_name]

    # collect trajectories
    states, actions, rewards,prob = collect_trajectories(env,env_info, policy,device,tmax)
    total_rewards = np.mean(np.sum(rewards,axis=0))

    # Compute advantages estimate
    Delta_t = TD_evaluation(critic,states,rewards,discount_rate,device)
    writer.add_scalar('DeltaT',torch.mean(Delta_t),iteration_all)
    Delta_t = GAE_evaluation(Delta_t,discount_rate,lambd)
    writer.add_scalar('Advantage',torch.mean(Delta_t),iteration_all)

    # TD_evaluation yields one entry per (step, next step) pair, i.e. one fewer than the
    # number of stored steps, so the final step is dropped to keep everything aligned.
    states = torch.stack(states)[:-1]
    actions = torch.stack(actions)[:-1]
    prob = torch.stack(prob)[:-1]
    rewards = np.asarray(rewards)[:-1]

    # TRAINING STEP
    # Consecutive index ranges of at most batch_size steps.
    indices = torch.arange(states.shape[0]).split(batch_size)
    for epoch in range(SGD_epoch):
        # TRAINING OVER THE BATCH SIZE
        for chunk in indices:
            iteration_all += 1

            # discount is passed explicitly so the critic targets use the same factor as
            # the TD/GAE computation instead of clipped_surrogate's default of 0.995.
            L = clipped_surrogate(Delta_t[chunk],critic,device,policy,prob[chunk],actions[chunk],
                                  states[chunk],rewards[chunk.numpy()],batch_size,
                                  discount=discount_rate, epsilon=epsilon, beta=beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            writer.add_scalar('Loss/Policy',L,iteration_all)
    writer.add_scalar('Score',total_rewards,e)

    mean_rewards.append(total_rewards)
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,total_rewards))
        



In [None]:
# Close the TensorBoard SummaryWriter created in the setup cell once training is done.
writer.close()

In [None]:
# Persist the trained weights. `policy` is the actor and `critic` is the critic; the two
# file names used to be swapped, so each checkpoint was stored under the other's name.
torch.save(policy.state_dict(), 'PPO_actor_stable.pth')
torch.save(critic.state_dict(), 'PPO_critic_stable.pth')